Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	Upload 44 files
Browse files- Dockerfile +8 -8
- Dockerfile_GPU +37 -0
- README_zh.md +16 -14
- app.py +14 -28
- config.py +10 -4
- docker-compose-gpu.yaml +15 -0
- docker-compose.yaml +3 -1
- gunicorn_config.py +4 -0
- logger.py +42 -0
- requirements.txt +2 -1
- static/css/style.css +84 -0
- templates/index.html +267 -121
- text/cleaners.py +15 -0
- text/mandarin.py +2 -3
- utils/merge.py +16 -8
- utils/nlp.py +1 -7
- vits-simple-api-installer-latest.sh +26 -1
- voice.py +14 -15
    	
        Dockerfile
    CHANGED
    
    | @@ -6,15 +6,13 @@ WORKDIR /app | |
| 6 | 
             
            ENV DEBIAN_FRONTEND=noninteractive
         | 
| 7 |  | 
| 8 | 
             
            RUN apt-get update && \
         | 
| 9 | 
            -
                apt install build-essential - | 
| 10 | 
            -
                apt install espeak-ng -yq && \
         | 
| 11 | 
            -
                apt install cmake -yq && \
         | 
| 12 | 
            -
                apt install -y wget -yq && \
         | 
| 13 | 
             
                apt-get clean && \
         | 
| 14 | 
             
                apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
         | 
| 15 | 
             
                rm -rf /var/lib/apt/lists/*
         | 
| 16 |  | 
| 17 | 
            -
            RUN pip install  | 
|  | |
| 18 |  | 
| 19 | 
             
            RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
         | 
| 20 | 
             
                tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
         | 
| @@ -25,13 +23,15 @@ RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openj | |
| 25 | 
             
                rm -f openjtalk-0.3.0.dev2.tar.gz && \
         | 
| 26 | 
             
                rm -rf openjtalk-0.3.0.dev2
         | 
| 27 |  | 
| 28 | 
            -
            RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
         | 
| 29 |  | 
| 30 | 
             
            COPY requirements.txt /app
         | 
| 31 | 
            -
            RUN pip install -r requirements.txt
         | 
|  | |
|  | |
| 32 |  | 
| 33 | 
             
            COPY . /app
         | 
| 34 |  | 
| 35 | 
             
            EXPOSE 23456
         | 
| 36 |  | 
| 37 | 
            -
            CMD [" | 
|  | |
| 6 | 
             
            ENV DEBIAN_FRONTEND=noninteractive
         | 
| 7 |  | 
| 8 | 
             
            RUN apt-get update && \
         | 
| 9 | 
            +
                apt-get install -yq build-essential espeak-ng cmake wget && \
         | 
|  | |
|  | |
|  | |
| 10 | 
             
                apt-get clean && \
         | 
| 11 | 
             
                apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
         | 
| 12 | 
             
                rm -rf /var/lib/apt/lists/*
         | 
| 13 |  | 
| 14 | 
            +
            RUN pip install --upgrade pip --no-cache-dir && \
         | 
| 15 | 
            +
                pip install MarkupSafe==2.1.2 numpy==1.23.3 cython six==1.16.0 safetensors==0.3.2 --no-cache-dir
         | 
| 16 |  | 
| 17 | 
             
            RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
         | 
| 18 | 
             
                tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
         | 
|  | |
| 23 | 
             
                rm -f openjtalk-0.3.0.dev2.tar.gz && \
         | 
| 24 | 
             
                rm -rf openjtalk-0.3.0.dev2
         | 
| 25 |  | 
| 26 | 
            +
            RUN pip install torch --index-url https://download.pytorch.org/whl/cpu --no-cache-dir
         | 
| 27 |  | 
| 28 | 
             
            COPY requirements.txt /app
         | 
| 29 | 
            +
            RUN pip install -r requirements.txt --no-cache-dir
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            RUN pip install gunicorn --no-cache-dir
         | 
| 32 |  | 
| 33 | 
             
            COPY . /app
         | 
| 34 |  | 
| 35 | 
             
            EXPOSE 23456
         | 
| 36 |  | 
| 37 | 
            +
            CMD ["gunicorn", "-c", "gunicorn_config.py", "app:app"]
         | 
    	
        Dockerfile_GPU
    ADDED
    
    | @@ -0,0 +1,37 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            FROM python:3.10.11-slim-bullseye
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            RUN mkdir -p /app
         | 
| 4 | 
            +
            WORKDIR /app
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            ENV DEBIAN_FRONTEND=noninteractive
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            RUN apt-get update && \
         | 
| 9 | 
            +
                apt-get install -yq build-essential espeak-ng cmake wget && \
         | 
| 10 | 
            +
                apt-get clean && \
         | 
| 11 | 
            +
                apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
         | 
| 12 | 
            +
                rm -rf /var/lib/apt/lists/*
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            RUN pip install --upgrade pip --no-cache-dir && \
         | 
| 15 | 
            +
                pip install MarkupSafe==2.1.2 numpy==1.23.3 cython six==1.16.0  safetensors==0.3.2 --no-cache-dir
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
         | 
| 18 | 
            +
                tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
         | 
| 19 | 
            +
                cd openjtalk-0.3.0.dev2 && \
         | 
| 20 | 
            +
                rm -rf ./pyopenjtalk/open_jtalk_dic_utf_8-1.11 && \
         | 
| 21 | 
            +
                python setup.py install && \
         | 
| 22 | 
            +
                cd ../ && \
         | 
| 23 | 
            +
                rm -f openjtalk-0.3.0.dev2.tar.gz && \
         | 
| 24 | 
            +
                rm -rf openjtalk-0.3.0.dev2
         | 
| 25 | 
            +
             | 
| 26 | 
            +
            RUN pip install torch --index-url https://download.pytorch.org/whl/cu117 --no-cache-dir
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            COPY requirements.txt /app
         | 
| 29 | 
            +
            RUN pip install -r requirements.txt --no-cache-dir
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            RUN pip install gunicorn --no-cache-dir
         | 
| 32 | 
            +
             | 
| 33 | 
            +
            COPY . /app
         | 
| 34 | 
            +
             | 
| 35 | 
            +
            EXPOSE 23456
         | 
| 36 | 
            +
             | 
| 37 | 
            +
            CMD ["gunicorn", "-c", "gunicorn_config.py", "app:app"]
         | 
    	
        README_zh.md
    CHANGED
    
    | @@ -63,7 +63,7 @@ | |
| 63 |  | 
| 64 |  | 
| 65 | 
             
            - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164`
         | 
| 66 | 
            -
            - `https://artrajz-vits-simple-api.hf.space/voice/vits?text | 
| 67 | 
             
            - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=Difficult the first time, easy the second.&id=4`
         | 
| 68 | 
             
            - 激动:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=111`
         | 
| 69 | 
             
            - 小声:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=2077`
         | 
| @@ -495,14 +495,15 @@ def voice_dimensional_emotion(upload_path): | |
| 495 |  | 
| 496 | 
             
            | Name          | Parameter | Is must | Default | Type  | Instruction                                                  |
         | 
| 497 | 
             
            | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
         | 
| 498 | 
            -
            | 合成文本      | text      | true    |         | str   | | 
| 499 | 
            -
            | 角色id        | id        | false   | 0       | int   | | 
| 500 | 
             
            | 音频格式      | format    | false   | wav     | str   | 支持wav,ogg,silk,mp3,flac                                    |
         | 
| 501 | 
             
            | 文本语言      | lang      | false   | auto    | str   | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
         | 
| 502 | 
            -
            | 语音长度/语速 | length    | false   | 1.0     | float |  | 
| 503 | 
            -
            | 噪声          | noise     | false   | 0. | 
| 504 | 
            -
            |  | 
| 505 | 
             
            | 分段阈值      | max       | false   | 50      | int   | 按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。 |
         | 
|  | |
| 506 |  | 
| 507 | 
             
            ## VITS 语音转换
         | 
| 508 |  | 
| @@ -516,12 +517,12 @@ def voice_dimensional_emotion(upload_path): | |
| 516 |  | 
| 517 | 
             
            | Name          | Parameter | Is must | Default | Type  | Instruction                                      |
         | 
| 518 | 
             
            | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------ |
         | 
| 519 | 
            -
            | 上传音频      | upload    | true    |         | file  | | 
| 520 | 
            -
            | 目标角色id    | id        | true    |         | int   | | 
| 521 | 
             
            | 音频格式      | format    | true    |         | str   | wav,ogg,silk                                     |
         | 
| 522 | 
             
            | 语音长度/语速 | length    | true    |         | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
         | 
| 523 | 
            -
            | 噪声          | noise     | true    |         | float | | 
| 524 | 
            -
            |  | 
| 525 |  | 
| 526 | 
             
            ## Dimensional emotion
         | 
| 527 |  | 
| @@ -533,13 +534,13 @@ def voice_dimensional_emotion(upload_path): | |
| 533 |  | 
| 534 | 
             
            | Name          | Parameter | Is must | Default | Type  | Instruction                                                  |
         | 
| 535 | 
             
            | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
         | 
| 536 | 
            -
            |  | 
| 537 | 
            -
            | 角色id        | id        | false   | 0       | int   | | 
| 538 | 
             
            | 音频格式      | format    | false   | wav     | str   | 支持wav,ogg,silk,mp3,flac                                    |
         | 
| 539 | 
             
            | 文本语言      | lang      | false   | auto    | str   | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
         | 
| 540 | 
             
            | 语音长度/语速 | length    | false   | 1.0     | float | 调节语音长度,相当于调节语速,该数值越大语速越慢             |
         | 
| 541 | 
            -
            | 噪声          | noise     | false   | 0. | 
| 542 | 
            -
            |  | 
| 543 | 
             
            | 分段阈值      | max       | false   | 50      | int   | 按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。 |
         | 
| 544 | 
             
            | 维度情感      | emotion   | false   | 0       | int   | 范围取决于npy情感参考文件,如[innnky](https://huggingface.co/spaces/innnky/nene-emotion/tree/main)的all_emotions.npy模型范围是0-5457 |
         | 
| 545 |  | 
| @@ -623,4 +624,5 @@ def voice_dimensional_emotion(upload_path): | |
| 623 | 
             
            - MoeGoe:https://github.com/CjangCjengh/MoeGoe
         | 
| 624 | 
             
            - emotional-vits:https://github.com/innnky/emotional-vits
         | 
| 625 | 
             
            - vits-uma-genshin-honkai:https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
         | 
|  | |
| 626 |  | 
|  | |
| 63 |  | 
| 64 |  | 
| 65 | 
             
            - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164`
         | 
| 66 | 
            +
            - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=我觉得1%2B1≠3&id=164&lang=zh`(get中一些字符需要转义不然会被过滤掉)
         | 
| 67 | 
             
            - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=Difficult the first time, easy the second.&id=4`
         | 
| 68 | 
             
            - 激动:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=111`
         | 
| 69 | 
             
            - 小声:`https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=2077`
         | 
|  | |
| 495 |  | 
| 496 | 
             
            | Name          | Parameter | Is must | Default | Type  | Instruction                                                  |
         | 
| 497 | 
             
            | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
         | 
| 498 | 
            +
            | 合成文本      | text      | true    |         | str   | 需要合成语音的文本。                                         |
         | 
| 499 | 
            +
            | 角色id        | id        | false   | 0       | int   | 即说话人id。                                                 |
         | 
| 500 | 
             
            | 音频格式      | format    | false   | wav     | str   | 支持wav,ogg,silk,mp3,flac                                    |
         | 
| 501 | 
             
            | 文本语言      | lang      | false   | auto    | str   | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
         | 
| 502 | 
            +
            | 语音长度/语速 | length    | false   | 1.0     | float | 调节语音长度,相当于调节语速,该数值越大语速越慢。           |
         | 
| 503 | 
            +
            | 噪声          | noise     | false   | 0.33    | float | 样本噪声,控制合成的随机性。                                 |
         | 
| 504 | 
            +
            | sdp噪声       | noisew    | false   | 0.4     | float | 随机时长预测器噪声,控制音素发音长度。                       |
         | 
| 505 | 
             
            | 分段阈值      | max       | false   | 50      | int   | 按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。 |
         | 
| 506 | 
            +
            | 流式响应      | streaming | false   | false   | bool  | 流式合成语音,更快的首包响应。                               |
         | 
| 507 |  | 
| 508 | 
             
            ## VITS 语音转换
         | 
| 509 |  | 
|  | |
| 517 |  | 
| 518 | 
             
            | Name          | Parameter | Is must | Default | Type  | Instruction                                      |
         | 
| 519 | 
             
            | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------ |
         | 
| 520 | 
            +
            | 上传音频      | upload    | true    |         | file  | 需要转换说话人的音频文件。                       |
         | 
| 521 | 
            +
            | 目标角色id    | id        | true    |         | int   | 目标说话人id。                                   |
         | 
| 522 | 
             
            | 音频格式      | format    | true    |         | str   | wav,ogg,silk                                     |
         | 
| 523 | 
             
            | 语音长度/语速 | length    | true    |         | float | 调节语音长度,相当于调节语速,该数值越大语速越慢 |
         | 
| 524 | 
            +
            | 噪声          | noise     | true    |         | float | 样本噪声,控制合成的随机性。                     |
         | 
| 525 | 
            +
            | sdp噪声       | noisew    | true    |         | float | 随机时长预测器噪声,控制音素发音长度。           |
         | 
| 526 |  | 
| 527 | 
             
            ## Dimensional emotion
         | 
| 528 |  | 
|  | |
| 534 |  | 
| 535 | 
             
            | Name          | Parameter | Is must | Default | Type  | Instruction                                                  |
         | 
| 536 | 
             
            | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
         | 
| 537 | 
            +
            | 合成文本      | text      | true    |         | str   | 需要合成语音的文本。                                         |
         | 
| 538 | 
            +
            | 角色id        | id        | false   | 0       | int   | 即说话人id。                                                 |
         | 
| 539 | 
             
            | 音频格式      | format    | false   | wav     | str   | 支持wav,ogg,silk,mp3,flac                                    |
         | 
| 540 | 
             
            | 文本语言      | lang      | false   | auto    | str   | auto为自动识别语言模式,也是默认模式。lang=mix时,文本应该用[ZH] 或 [JA] 包裹。方言无法自动识别。 |
         | 
| 541 | 
             
            | 语音长度/语速 | length    | false   | 1.0     | float | 调节语音长度,相当于调节语速,该数值越大语速越慢             |
         | 
| 542 | 
            +
            | 噪声          | noise     | false   | 0.33    | float | 样本噪声,控制合成的随机性。                                 |
         | 
| 543 | 
            +
            | sdp噪声       | noisew    | false   | 0.4     | float | 随机时长预测器噪声,控制音素发音长度。                       |
         | 
| 544 | 
             
            | 分段阈值      | max       | false   | 50      | int   | 按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。 |
         | 
| 545 | 
             
            | 维度情感      | emotion   | false   | 0       | int   | 范围取决于npy情感参考文件,如[innnky](https://huggingface.co/spaces/innnky/nene-emotion/tree/main)的all_emotions.npy模型范围是0-5457 |
         | 
| 546 |  | 
|  | |
| 624 | 
             
            - MoeGoe:https://github.com/CjangCjengh/MoeGoe
         | 
| 625 | 
             
            - emotional-vits:https://github.com/innnky/emotional-vits
         | 
| 626 | 
             
            - vits-uma-genshin-honkai:https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
         | 
| 627 | 
            +
            - vits_chinese:https://github.com/PlayVoice/vits_chinese
         | 
| 628 |  | 
    	
        app.py
    CHANGED
    
    | @@ -1,8 +1,7 @@ | |
| 1 | 
             
            import os
         | 
| 2 | 
            -
            import logging
         | 
| 3 | 
             
            import time
         | 
| 4 | 
            -
            import logzero
         | 
| 5 | 
             
            import uuid
         | 
|  | |
| 6 | 
             
            from flask import Flask, request, send_file, jsonify, make_response, render_template
         | 
| 7 | 
             
            from werkzeug.utils import secure_filename
         | 
| 8 | 
             
            from flask_apscheduler import APScheduler
         | 
| @@ -19,24 +18,15 @@ scheduler.init_app(app) | |
| 19 | 
             
            if app.config.get("CLEAN_INTERVAL_SECONDS", 3600) > 0:
         | 
| 20 | 
             
                scheduler.start()
         | 
| 21 |  | 
| 22 | 
            -
             | 
| 23 | 
            -
             | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
             | 
| 27 | 
            -
            logging.basicConfig(level=level_dict[level])
         | 
| 28 | 
            -
            logging.getLogger('numba').setLevel(logging.WARNING)
         | 
| 29 | 
            -
            logging.getLogger("langid.langid").setLevel(logging.INFO)
         | 
| 30 | 
            -
            logging.getLogger("apscheduler.scheduler").setLevel(logging.INFO)
         | 
| 31 |  | 
|  | |
| 32 | 
             
            tts = merge_model(app.config["MODEL_LIST"])
         | 
| 33 |  | 
| 34 | 
            -
            if not os.path.exists(app.config['UPLOAD_FOLDER']):
         | 
| 35 | 
            -
                os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
         | 
| 36 | 
            -
             | 
| 37 | 
            -
            if not os.path.exists(app.config['CACHE_PATH']):
         | 
| 38 | 
            -
                os.makedirs(app.config['CACHE_PATH'], exist_ok=True)
         | 
| 39 | 
            -
             | 
| 40 |  | 
| 41 | 
             
            def require_api_key(func):
         | 
| 42 | 
             
                @wraps(func)
         | 
| @@ -57,7 +47,10 @@ def require_api_key(func): | |
| 57 | 
             
            def index():
         | 
| 58 | 
             
                kwargs = {
         | 
| 59 | 
             
                    "speakers": tts.voice_speakers,
         | 
| 60 | 
            -
                    "speakers_count": tts.speakers_count
         | 
|  | |
|  | |
|  | |
| 61 | 
             
                }
         | 
| 62 | 
             
                return render_template("index.html", **kwargs)
         | 
| 63 |  | 
| @@ -362,25 +355,18 @@ def ssml(): | |
| 362 | 
             
                    return make_response(jsonify({"status": "error", "message": f"parameter error"}), 400)
         | 
| 363 |  | 
| 364 | 
             
                logger.debug(ssml)
         | 
| 365 | 
            -
             | 
| 366 | 
             
                fname = f"{str(uuid.uuid1())}.{format}"
         | 
| 367 | 
             
                file_type = f"audio/{format}"
         | 
| 368 |  | 
| 369 | 
             
                t1 = time.time()
         | 
| 370 | 
            -
                audio | 
| 371 | 
             
                t2 = time.time()
         | 
| 372 | 
             
                if app.config.get("SAVE_AUDIO", False):
         | 
| 373 | 
             
                    logger.debug(f"[ssml] {fname}")
         | 
| 374 | 
             
                logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
         | 
| 375 |  | 
| 376 | 
            -
                 | 
| 377 | 
            -
                    audio = tts.generate_audio_chunks(audio)
         | 
| 378 | 
            -
                    response = make_response(audio)
         | 
| 379 | 
            -
                    response.headers['Content-Disposition'] = f'attachment; filename={fname}'
         | 
| 380 | 
            -
                    response.headers['Content-Type'] = file_type
         | 
| 381 | 
            -
                    return response
         | 
| 382 | 
            -
                else:
         | 
| 383 | 
            -
                    return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
         | 
| 384 |  | 
| 385 |  | 
| 386 | 
             
            @app.route('/voice/dimension-emotion', methods=["POST"])
         | 
|  | |
| 1 | 
             
            import os
         | 
|  | |
| 2 | 
             
            import time
         | 
|  | |
| 3 | 
             
            import uuid
         | 
| 4 | 
            +
            from logger import logger
         | 
| 5 | 
             
            from flask import Flask, request, send_file, jsonify, make_response, render_template
         | 
| 6 | 
             
            from werkzeug.utils import secure_filename
         | 
| 7 | 
             
            from flask_apscheduler import APScheduler
         | 
|  | |
| 18 | 
             
            if app.config.get("CLEAN_INTERVAL_SECONDS", 3600) > 0:
         | 
| 19 | 
             
                scheduler.start()
         | 
| 20 |  | 
| 21 | 
            +
            for path in (app.config['LOGS_PATH'], app.config['UPLOAD_FOLDER'], app.config['CACHE_PATH']):
         | 
| 22 | 
            +
                try:
         | 
| 23 | 
            +
                    os.makedirs(path, exist_ok=True)
         | 
| 24 | 
            +
                except Exception as e:
         | 
| 25 | 
            +
                    logger.error(f"Unable to create directory {path}: {str(e)}")
         | 
|  | |
|  | |
|  | |
|  | |
| 26 |  | 
| 27 | 
            +
            # load model
         | 
| 28 | 
             
            tts = merge_model(app.config["MODEL_LIST"])
         | 
| 29 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 30 |  | 
| 31 | 
             
            def require_api_key(func):
         | 
| 32 | 
             
                @wraps(func)
         | 
|  | |
| 47 | 
             
            def index():
         | 
| 48 | 
             
                kwargs = {
         | 
| 49 | 
             
                    "speakers": tts.voice_speakers,
         | 
| 50 | 
            +
                    "speakers_count": tts.speakers_count,
         | 
| 51 | 
            +
                    "vits_speakers_count":tts._vits_speakers_count,
         | 
| 52 | 
            +
                    "w2v2_speakers_count":tts._w2v2_speakers_count,
         | 
| 53 | 
            +
                    "w2v2_emotion_count":tts._w2v2_emotion_count
         | 
| 54 | 
             
                }
         | 
| 55 | 
             
                return render_template("index.html", **kwargs)
         | 
| 56 |  | 
|  | |
| 355 | 
             
                    return make_response(jsonify({"status": "error", "message": f"parameter error"}), 400)
         | 
| 356 |  | 
| 357 | 
             
                logger.debug(ssml)
         | 
| 358 | 
            +
                voice_tasks, format = tts.parse_ssml(ssml)
         | 
| 359 | 
             
                fname = f"{str(uuid.uuid1())}.{format}"
         | 
| 360 | 
             
                file_type = f"audio/{format}"
         | 
| 361 |  | 
| 362 | 
             
                t1 = time.time()
         | 
| 363 | 
            +
                audio = tts.create_ssml_infer_task(voice_tasks, format, fname)
         | 
| 364 | 
             
                t2 = time.time()
         | 
| 365 | 
             
                if app.config.get("SAVE_AUDIO", False):
         | 
| 366 | 
             
                    logger.debug(f"[ssml] {fname}")
         | 
| 367 | 
             
                logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
         | 
| 368 |  | 
| 369 | 
            +
                return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 370 |  | 
| 371 |  | 
| 372 | 
             
            @app.route('/voice/dimension-emotion', methods=["POST"])
         | 
    	
        config.py
    CHANGED
    
    | @@ -12,7 +12,7 @@ DEBUG = False | |
| 12 | 
             
            PORT = 7860
         | 
| 13 |  | 
| 14 | 
             
            # Absolute path of vits-simple-api
         | 
| 15 | 
            -
            ABS_PATH = os.path. | 
| 16 |  | 
| 17 | 
             
            # Upload path
         | 
| 18 | 
             
            UPLOAD_FOLDER = ABS_PATH + "/upload"
         | 
| @@ -20,6 +20,12 @@ UPLOAD_FOLDER = ABS_PATH + "/upload" | |
| 20 | 
             
            # Cache path
         | 
| 21 | 
             
            CACHE_PATH = ABS_PATH + "/cache"
         | 
| 22 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 23 | 
             
            # If CLEAN_INTERVAL_SECONDS <= 0, the cleaning task will not be executed.
         | 
| 24 | 
             
            CLEAN_INTERVAL_SECONDS = 3600
         | 
| 25 |  | 
| @@ -39,7 +45,7 @@ API_KEY = "api-key" | |
| 39 | 
             
            LOGGING_LEVEL = "DEBUG"
         | 
| 40 |  | 
| 41 | 
             
            # Language identification library. Optional fastlid, langid
         | 
| 42 | 
            -
            LANGUAGE_IDENTIFICATION_LIBRARY = " | 
| 43 |  | 
| 44 | 
             
            # To use the english_cleaner, you need to install espeak and provide the path of libespeak-ng.dll as input here.
         | 
| 45 | 
             
            # If ESPEAK_LIBRARY is set to empty, it will be read from the environment variable.
         | 
| @@ -48,7 +54,7 @@ ESPEAK_LIBRARY = "" | |
| 48 |  | 
| 49 | 
             
            # Fill in the model path here
         | 
| 50 | 
             
            MODEL_LIST = [
         | 
| 51 | 
            -
             | 
| 52 | 
             
                [ABS_PATH + "/Model/Nene_Nanami_Rong_Tang/1374_epochs.pth", ABS_PATH + "/Model/Nene_Nanami_Rong_Tang/config.json"],
         | 
| 53 | 
             
                [ABS_PATH + "/Model/vctk/pretrained_vctk.pth", ABS_PATH + "/Model/vctk/vctk_base.json"],
         | 
| 54 | 
             
                [ABS_PATH + "/Model/paimon/paimon6k_390000.pth", ABS_PATH + "/Model/paimon/paimon6k.json"],
         | 
| @@ -73,7 +79,7 @@ HUBERT_SOFT_MODEL = ABS_PATH + "/Model/hubert-soft-0d54a1f4.pt" | |
| 73 | 
             
            DIMENSIONAL_EMOTION_NPY = ABS_PATH + "/Model/npy"
         | 
| 74 |  | 
| 75 | 
             
            # w2v2-vits: Need to have both `model.onnx` and `model.yaml` files in the same path.
         | 
| 76 | 
            -
            DIMENSIONAL_EMOTION_MODEL = ABS_PATH + "/Model/model.yaml"
         | 
| 77 |  | 
| 78 | 
             
            """
         | 
| 79 | 
             
            Default parameter
         | 
|  | |
| 12 | 
             
            PORT = 7860
         | 
| 13 |  | 
| 14 | 
             
            # Absolute path of vits-simple-api
         | 
| 15 | 
            +
            ABS_PATH = os.path.dirname(os.path.realpath(__file__))
         | 
| 16 |  | 
| 17 | 
             
            # Upload path
         | 
| 18 | 
             
            UPLOAD_FOLDER = ABS_PATH + "/upload"
         | 
|  | |
| 20 | 
             
            # Cahce path
         | 
| 21 | 
             
            CACHE_PATH = ABS_PATH + "/cache"
         | 
| 22 |  | 
| 23 | 
            +
            # Logs path
         | 
| 24 | 
            +
            LOGS_PATH = ABS_PATH + "/logs"
         | 
| 25 | 
            +
             | 
| 26 | 
            +
            # Set the number of backup log files to keep. 
         | 
| 27 | 
            +
            LOGS_BACKUPCOUNT = 30
         | 
| 28 | 
            +
             | 
| 29 | 
             
            # If CLEAN_INTERVAL_SECONDS <= 0, the cleaning task will not be executed.
         | 
| 30 | 
             
            CLEAN_INTERVAL_SECONDS = 3600
         | 
| 31 |  | 
|  | |
| 45 | 
             
            LOGGING_LEVEL = "DEBUG"
         | 
| 46 |  | 
| 47 | 
             
            # Language identification library. Optional fastlid, langid
         | 
| 48 | 
            +
            LANGUAGE_IDENTIFICATION_LIBRARY = "fastlid"
         | 
| 49 |  | 
| 50 | 
             
            # To use the english_cleaner, you need to install espeak and provide the path of libespeak-ng.dll as input here.
         | 
| 51 | 
             
            # If ESPEAK_LIBRARY is set to empty, it will be read from the environment variable.
         | 
|  | |
| 54 |  | 
| 55 | 
             
            # Fill in the model path here
         | 
| 56 | 
             
            MODEL_LIST = [
         | 
| 57 | 
            +
                    # VITS
         | 
| 58 | 
             
                [ABS_PATH + "/Model/Nene_Nanami_Rong_Tang/1374_epochs.pth", ABS_PATH + "/Model/Nene_Nanami_Rong_Tang/config.json"],
         | 
| 59 | 
             
                [ABS_PATH + "/Model/vctk/pretrained_vctk.pth", ABS_PATH + "/Model/vctk/vctk_base.json"],
         | 
| 60 | 
             
                [ABS_PATH + "/Model/paimon/paimon6k_390000.pth", ABS_PATH + "/Model/paimon/paimon6k.json"],
         | 
|  | |
| 79 | 
             
            DIMENSIONAL_EMOTION_NPY = ABS_PATH + "/Model/npy"
         | 
| 80 |  | 
| 81 | 
             
            # w2v2-vits: Need to have both `model.onnx` and `model.yaml` files in the same path.
         | 
| 82 | 
            +
            # DIMENSIONAL_EMOTION_MODEL = ABS_PATH + "/Model/model.yaml"
         | 
| 83 |  | 
| 84 | 
             
            """
         | 
| 85 | 
             
            Default parameter
         | 
    	
        docker-compose-gpu.yaml
    ADDED
    
    | @@ -0,0 +1,15 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version: '3.4'
         | 
| 2 | 
            +
            services:
         | 
| 3 | 
            +
              vits:
         | 
| 4 | 
            +
                image: artrajz/vits-simple-api:latest-gpu
         | 
| 5 | 
            +
                restart: always
         | 
| 6 | 
            +
                ports:
         | 
| 7 | 
            +
                  - 23456:23456
         | 
| 8 | 
            +
                environment:
         | 
| 9 | 
            +
                  LANG: 'C.UTF-8'
         | 
| 10 | 
            +
                  TZ: Asia/Shanghai #timezone
         | 
| 11 | 
            +
                volumes:
         | 
| 12 | 
            +
                  - ./Model:/app/Model # 挂载模型文件夹
         | 
| 13 | 
            +
                  - ./config.py:/app/config.py # 挂载配置文件
         | 
| 14 | 
            +
                  - ./logs:/app/logs # logging logs
         | 
| 15 | 
            +
                  - ./gunicorn_config.py:/app/gunicorn_config.py # gunicorn configuration
         | 
    	
        docker-compose.yaml
    CHANGED
    
    | @@ -10,4 +10,6 @@ services: | |
| 10 | 
             
                  TZ: Asia/Shanghai #timezone
         | 
| 11 | 
             
                volumes:
         | 
| 12 | 
             
                  - ./Model:/app/Model # 挂载模型文件夹
         | 
| 13 | 
            -
                  - ./config.py:/app/config.py # 挂载配置文件
         | 
|  | |
|  | 
|  | |
| 10 | 
             
                  TZ: Asia/Shanghai #timezone
         | 
| 11 | 
             
                volumes:
         | 
| 12 | 
             
                  - ./Model:/app/Model # 挂载模型文件夹
         | 
| 13 | 
            +
                  - ./config.py:/app/config.py # 挂载配置文件
         | 
| 14 | 
            +
                  - ./logs:/app/logs # logging logs
         | 
| 15 | 
            +
                  - ./gunicorn_config.py:/app/gunicorn_config.py # gunicorn configuration
         | 
    	
        gunicorn_config.py
    ADDED
    
    | @@ -0,0 +1,4 @@ | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import multiprocessing
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            bind = "0.0.0.0:23456"
         | 
| 4 | 
            +
            workers = multiprocessing.cpu_count()
         | 
    	
        logger.py
    ADDED
    
    | @@ -0,0 +1,42 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import os
         | 
| 2 | 
            +
            import sys
         | 
| 3 | 
            +
            import logging
         | 
| 4 | 
            +
            import logzero
         | 
| 5 | 
            +
            import config
         | 
| 6 | 
            +
            from logging.handlers import TimedRotatingFileHandler
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            logzero.loglevel(logging.WARNING)
         | 
| 9 | 
            +
            logger = logging.getLogger("vits-simple-api")
         | 
| 10 | 
            +
            level = getattr(config, "LOGGING_LEVEL", "DEBUG")
         | 
| 11 | 
            +
            level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
         | 
| 12 | 
            +
                          'CRITICAL': logging.CRITICAL}
         | 
| 13 | 
            +
            logging.basicConfig(level=level_dict[level])
         | 
| 14 | 
            +
            logging.getLogger('numba').setLevel(logging.WARNING)
         | 
| 15 | 
            +
            logging.getLogger("langid.langid").setLevel(logging.INFO)
         | 
| 16 | 
            +
            logging.getLogger("apscheduler.scheduler").setLevel(logging.INFO)
         | 
| 17 | 
            +
             | 
| 18 | 
            +
            os.makedirs(config.LOGS_PATH, exist_ok=True)
         | 
| 19 | 
            +
            log_file = os.path.join(config.LOGS_PATH, 'latest.log')
         | 
| 20 | 
            +
            backup_count = getattr(config, "LOGS_BACKUPCOUNT", 30)
         | 
| 21 | 
            +
            handler = TimedRotatingFileHandler(log_file, when="midnight", interval=1, backupCount=backup_count, encoding='utf-8')
         | 
| 22 | 
            +
            handler.suffix = "%Y-%m-%d.log"
         | 
| 23 | 
            +
            formatter = logging.Formatter('%(levelname)s:%(name)s %(message)s')
         | 
| 24 | 
            +
            handler.setFormatter(formatter)
         | 
| 25 | 
            +
            logger.addHandler(handler)
         | 
| 26 | 
            +
             | 
| 27 | 
            +
            logging.getLogger("werkzeug").addHandler(handler)
         | 
| 28 | 
            +
            logging.getLogger("apscheduler.scheduler").addHandler(handler)
         | 
| 29 | 
            +
             | 
| 30 | 
            +
             | 
| 31 | 
            +
            # Custom function to handle uncaught exceptions
         | 
| 32 | 
            +
            def handle_exception(exc_type, exc_value, exc_traceback):
         | 
| 33 | 
            +
                # For a keyboard interrupt, defer to the default excepthook and return without logging
         | 
| 34 | 
            +
                if issubclass(exc_type, KeyboardInterrupt):
         | 
| 35 | 
            +
                    sys.__excepthook__(exc_type, exc_value, exc_traceback)
         | 
| 36 | 
            +
                    return
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                logger.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))
         | 
| 39 | 
            +
             | 
| 40 | 
            +
             | 
| 41 | 
            +
            # Set the global exception handler in Python
         | 
| 42 | 
            +
            sys.excepthook = handle_exception
         | 
    	
        requirements.txt
    CHANGED
    
    | @@ -27,4 +27,5 @@ fasttext | |
| 27 | 
             
            fastlid
         | 
| 28 | 
             
            langid
         | 
| 29 | 
             
            phonemizer==3.2.1
         | 
| 30 | 
            -
            transformers
         | 
|  | 
|  | |
| 27 | 
             
            fastlid
         | 
| 28 | 
             
            langid
         | 
| 29 | 
             
            phonemizer==3.2.1
         | 
| 30 | 
            +
            transformers
         | 
| 31 | 
            +
            pydantic==1.10.6
         | 
    	
        static/css/style.css
    ADDED
    
    | @@ -0,0 +1,84 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            .main-container {
         | 
| 2 | 
            +
                position: relative;
         | 
| 3 | 
            +
                width: 100%;
         | 
| 4 | 
            +
                min-height: 300px;
         | 
| 5 | 
            +
            }
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            .container {
         | 
| 8 | 
            +
                width: 300px;
         | 
| 9 | 
            +
                position: relative;
         | 
| 10 | 
            +
            }
         | 
| 11 | 
            +
             | 
| 12 | 
            +
             | 
| 13 | 
            +
            /*tabs*/
         | 
| 14 | 
            +
            .tabs {
         | 
| 15 | 
            +
                display: flex;
         | 
| 16 | 
            +
                left: 0;
         | 
| 17 | 
            +
            }
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            .tab-button {
         | 
| 20 | 
            +
                display: inline-block;
         | 
| 21 | 
            +
                background-color: transparent;
         | 
| 22 | 
            +
                padding: 5px 10px;
         | 
| 23 | 
            +
                cursor: pointer;
         | 
| 24 | 
            +
                margin-bottom: -2px;
         | 
| 25 | 
            +
                border-top: 2px solid transparent;
         | 
| 26 | 
            +
                border-left: 2px solid transparent;
         | 
| 27 | 
            +
                border-right: 2px solid transparent;
         | 
| 28 | 
            +
                border-bottom: 0px;
         | 
| 29 | 
            +
                border-top-left-radius: 0.5rem;
         | 
| 30 | 
            +
                border-top-right-radius: 0.5rem;
         | 
| 31 | 
            +
                color: gray;
         | 
| 32 | 
            +
            }
         | 
| 33 | 
            +
             | 
| 34 | 
            +
            .tab-button.active {
         | 
| 35 | 
            +
                background-color: white;
         | 
| 36 | 
            +
                border-top: 2px solid #dee2e6;
         | 
| 37 | 
            +
                border-left: 2px solid #dee2e6;
         | 
| 38 | 
            +
                border-right: 2px solid #dee2e6;
         | 
| 39 | 
            +
                color: black;
         | 
| 40 | 
            +
            }
         | 
| 41 | 
            +
             | 
| 42 | 
            +
            /*content*/
         | 
| 43 | 
            +
             | 
| 44 | 
            +
            .content {
         | 
| 45 | 
            +
                border: gray;
         | 
| 46 | 
            +
                border-left-width: 2px;
         | 
| 47 | 
            +
            }
         | 
| 48 | 
            +
             | 
| 49 | 
            +
            .content-pane {
         | 
| 50 | 
            +
                display: none;
         | 
| 51 | 
            +
                padding: 20px;
         | 
| 52 | 
            +
            }
         | 
| 53 | 
            +
             | 
| 54 | 
            +
            .content-pane.active {
         | 
| 55 | 
            +
                display: flex;
         | 
| 56 | 
            +
                -ms-flex-wrap: wrap;
         | 
| 57 | 
            +
                flex-wrap: wrap;
         | 
| 58 | 
            +
            }
         | 
| 59 | 
            +
             | 
| 60 | 
            +
            *, :before, :after {
         | 
| 61 | 
            +
                box-sizing: border-box;
         | 
| 62 | 
            +
                border-width: 0;
         | 
| 63 | 
            +
                border-style: solid;
         | 
| 64 | 
            +
                border-color: #e5e7eb;
         | 
| 65 | 
            +
            }
         | 
| 66 | 
            +
             | 
| 67 | 
            +
             | 
| 68 | 
            +
            .flex {
         | 
| 69 | 
            +
                display: flex;
         | 
| 70 | 
            +
            }
         | 
| 71 | 
            +
             | 
| 72 | 
            +
            .border-transparent {
         | 
| 73 | 
            +
                border-color: transparent;
         | 
| 74 | 
            +
            }
         | 
| 75 | 
            +
             | 
| 76 | 
            +
            .border-b-2 {
         | 
| 77 | 
            +
                border-bottom: 2px solid #dee2e6;
         | 
| 78 | 
            +
            }
         | 
| 79 | 
            +
             | 
| 80 | 
            +
            .border-lr-2 {
         | 
| 81 | 
            +
                border-left: 2px solid #dee2e6;
         | 
| 82 | 
            +
                border-right: 2px solid #dee2e6;
         | 
| 83 | 
            +
            }
         | 
| 84 | 
            +
             | 
    	
        templates/index.html
    CHANGED
    
    | @@ -4,126 +4,230 @@ | |
| 4 | 
             
                <meta charset="UTF-8"/>
         | 
| 5 | 
             
                <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
         | 
| 6 | 
             
                <title>vits-simple-api</title>
         | 
| 7 | 
            -
             | 
| 8 | 
             
                <link rel="stylesheet" href="/static/css/bootstrap.min.css"/>
         | 
| 9 | 
             
            </head>
         | 
| 10 | 
             
            <body>
         | 
| 11 | 
            -
            <main  | 
| 12 | 
            -
                < | 
| 13 | 
            -
                    < | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
|  | |
|  | |
|  | |
| 16 |  | 
| 17 | 
            -
                <div>
         | 
| 18 | 
            -
                    <label>文档:</label>
         | 
| 19 | 
            -
                    <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
         | 
| 20 | 
            -
                       style="text-decoration: none; color: black"> https://github.com/Artrajz/vits-simple-api </a>
         | 
| 21 | 
            -
                </div>
         | 
| 22 | 
            -
                <div>
         | 
| 23 | 
            -
                    <label>返回speakers(json):</label>
         | 
| 24 | 
            -
                    <a id="speakersLink" href="https://artrajz-vits-simple-api.hf.space/voice/speakers" target="_blank"
         | 
| 25 | 
            -
                       style="text-decoration: none; color: black">
         | 
| 26 | 
            -
                        https://artrajz-vits-simple-api.hf.space/voice/speakers
         | 
| 27 | 
            -
                    </a>
         | 
| 28 | 
            -
                </div>
         | 
| 29 | 
            -
                <div>
         | 
| 30 | 
            -
                    <label>简单调用api:</label>
         | 
| 31 | 
            -
                    <a id="vitsLink" href="https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164"
         | 
| 32 | 
            -
                       style="text-decoration: none; color: black">
         | 
| 33 | 
            -
                        https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164
         | 
| 34 | 
            -
                    </a>
         | 
| 35 | 
            -
                </div>
         | 
| 36 |  | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
                         | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 46 | 
            -
             | 
| 47 | 
            -
             | 
| 48 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 49 | 
             
                                        <option value="{{ speaker["id"] }}" selected>{{ speaker["id"] }} | {{ speaker["name"] }}
         | 
| 50 | 
             
                                            | {{ speaker["lang"] }}</option>
         | 
| 51 | 
             
                                    {% else %}
         | 
| 52 | 
             
                                        <option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
         | 
| 53 | 
             
                                            | {{ speaker["lang"] }}</option>
         | 
| 54 | 
             
                                    {% endif %}
         | 
| 55 | 
            -
             | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 60 | 
            -
             | 
| 61 | 
            -
             | 
| 62 | 
            -
             | 
| 63 | 
            -
             | 
| 64 | 
            -
             | 
| 65 | 
            -
             | 
| 66 | 
            -
             | 
| 67 | 
            -
             | 
| 68 | 
            -
             | 
| 69 | 
            -
             | 
| 70 | 
            -
             | 
| 71 | 
            -
             | 
| 72 | 
            -
             | 
| 73 | 
            -
             | 
| 74 | 
            -
             | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
                                     | 
| 78 | 
            -
                                    < | 
| 79 | 
            -
             | 
| 80 | 
            -
             | 
| 81 | 
            -
             | 
| 82 | 
            -
             | 
| 83 | 
            -
             | 
| 84 | 
            -
                                 | 
| 85 | 
            -
             | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 88 | 
            -
             | 
| 89 | 
            -
             | 
| 90 | 
            -
             | 
| 91 | 
            -
             | 
| 92 | 
            -
             | 
| 93 | 
            -
             | 
| 94 | 
            -
             | 
| 95 | 
            -
             | 
| 96 | 
            -
             | 
| 97 | 
            -
             | 
| 98 | 
            -
             | 
| 99 | 
            -
             | 
| 100 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 101 | 
             
                            </div>
         | 
| 102 | 
            -
             | 
| 103 | 
            -
             | 
| 104 | 
            -
             | 
| 105 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 106 | 
             
                            </div>
         | 
| 107 | 
            -
                        </ | 
| 108 | 
             
                    </div>
         | 
| 109 | 
            -
                </div>
         | 
| 110 |  | 
| 111 | 
            -
             | 
| 112 | 
            -
             | 
| 113 | 
            -
             | 
| 114 | 
            -
                         | 
| 115 | 
            -
                         | 
| 116 | 
            -
             | 
| 117 | 
            -
             | 
| 118 | 
            -
             | 
| 119 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 120 | 
             
                    </div>
         | 
| 121 | 
            -
             | 
| 122 | 
            -
             | 
| 123 | 
            -
                <div>方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd</div>
         | 
| 124 | 
            -
                <br/>
         | 
| 125 | 
            -
             | 
| 126 | 
            -
                <h2>所有模型均为网络搜集,感谢模型原作者的付出!</h2>
         | 
| 127 | 
             
                <p>
         | 
| 128 | 
             
                    Nene_Nanami_Rong_Tang:
         | 
| 129 | 
             
                    <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
         | 
| @@ -164,6 +268,8 @@ | |
| 164 | 
             
                    vits_chinese:
         | 
| 165 | 
             
                    <a href="https://github.com/PlayVoice/vits_chinese" rel="noreferrer" target="_blank">PlayVoice/vits_chinese</a>
         | 
| 166 | 
             
                </p>
         | 
|  | |
|  | |
| 167 |  | 
| 168 | 
             
            </main>
         | 
| 169 |  | 
| @@ -171,6 +277,10 @@ | |
| 171 | 
             
            <script src="/static/js/bootstrap.bundle.min.js"></script>
         | 
| 172 |  | 
| 173 | 
             
            <script>
         | 
|  | |
|  | |
|  | |
|  | |
| 174 | 
             
                function getProtocol() {
         | 
| 175 | 
             
                    return 'https:' == location.protocol ? "https://" : "http://";
         | 
| 176 | 
             
                }
         | 
| @@ -181,12 +291,21 @@ | |
| 181 | 
             
                }
         | 
| 182 |  | 
| 183 | 
             
                var baseUrl = getProtocol() + getUrl();
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 184 |  | 
| 185 | 
             
                setBaseUrl();
         | 
| 186 |  | 
| 187 | 
             
                function setBaseUrl() {
         | 
| 188 | 
            -
                    var text = document.getElementById("inputText").value;
         | 
| 189 | 
            -
                    var id = document.getElementById("inputId").value;
         | 
| 190 |  | 
| 191 | 
             
                    var vitsLink = document.getElementById("vitsLink");
         | 
| 192 | 
             
                    var speakersLink = document.getElementById("speakersLink");
         | 
| @@ -202,17 +321,22 @@ | |
| 202 | 
             
                }
         | 
| 203 |  | 
| 204 | 
             
                function getLink() {
         | 
| 205 | 
            -
                    var text = document.getElementById("inputText").value;
         | 
| 206 | 
            -
                    var id = document.getElementById("inputId").value;
         | 
| 207 | 
            -
                    var format = document.getElementById("inputFormat").value;
         | 
| 208 | 
            -
                    var lang = document.getElementById("inputLang").value;
         | 
| 209 | 
            -
                    var length = document.getElementById("inputLength").value;
         | 
| 210 | 
            -
                    var noise = document.getElementById("inputNoise").value;
         | 
| 211 | 
            -
                    var noisew = document.getElementById("inputNoisew").value;
         | 
| 212 | 
            -
                    var max = document.getElementById("inputMax").value;
         | 
| 213 | 
            -
                    var streaming = document.getElementById('streaming');
         | 
| 214 |  | 
| 215 | 
            -
                     | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 216 | 
             
                    if (format != "") {
         | 
| 217 | 
             
                        url += "&format=" + format;
         | 
| 218 | 
             
                    }
         | 
| @@ -231,6 +355,7 @@ | |
| 231 | 
             
                    if (max != "") {
         | 
| 232 | 
             
                        url += "&max=" + max;
         | 
| 233 | 
             
                    }
         | 
|  | |
| 234 | 
             
                    if (streaming.checked) {
         | 
| 235 | 
             
                        url += '&streaming=true';
         | 
| 236 | 
             
                    }
         | 
| @@ -245,16 +370,37 @@ | |
| 245 | 
             
                }
         | 
| 246 |  | 
| 247 | 
             
                function setAudioSource() {
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 248 | 
             
                    var url = getLink();
         | 
| 249 | 
            -
                    var audioPlayer = document.getElementById("audioPlayer");
         | 
| 250 | 
             
                    audioPlayer.src = url;
         | 
| 251 | 
             
                    audioPlayer.play();
         | 
| 252 | 
             
                }
         | 
| 253 |  | 
| 254 | 
            -
                 | 
| 255 | 
            -
             | 
| 256 | 
            -
                     | 
| 257 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 258 | 
             
            </script>
         | 
| 259 | 
             
            </body>
         | 
| 260 | 
             
            </html>
         | 
|  | |
| 4 | 
             
                <meta charset="UTF-8"/>
         | 
| 5 | 
             
                <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
         | 
| 6 | 
             
                <title>vits-simple-api</title>
         | 
| 7 | 
            +
                <link rel="stylesheet" href="/static/css/style.css">
         | 
| 8 | 
             
                <link rel="stylesheet" href="/static/css/bootstrap.min.css"/>
         | 
| 9 | 
             
            </head>
         | 
| 10 | 
             
            <body>
         | 
| 11 | 
            +
            <main class="main-container">
         | 
| 12 | 
            +
                <div class="container flex flex-wrap mx-auto">
         | 
| 13 | 
            +
                    <div class="text-center d-flex align-items-center w-100" style="height: 100px;" id="component-1">
         | 
| 14 | 
            +
                        <h1 class="w-100">
         | 
| 15 | 
            +
                            <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
         | 
| 16 | 
            +
                               style="text-decoration: none; color: black"> vits-simple-api </a>
         | 
| 17 | 
            +
                        </h1>
         | 
| 18 | 
            +
                    </div>
         | 
| 19 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 20 |  | 
| 21 | 
            +
                    <div class="tabs w-100 border-b-2" id="component-2">
         | 
| 22 | 
            +
                        <button class="tab-button px-4 pb-2 pt-2 active " onclick="showContent(0)">VITS</button>
         | 
| 23 | 
            +
                        <button class="tab-button px-4 pb-2 pt-2" onclick="showContent(1)">W2V2-VITS</button>
         | 
| 24 | 
            +
                    </div>
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                    <div class="content w-100 border-lr-2 border-b-2" id="component-3">
         | 
| 27 | 
            +
                        <div class="content-pane active w-100 flex-wrap">
         | 
| 28 | 
            +
                            <form class="w-100">
         | 
| 29 | 
            +
                                <div class="form-group">
         | 
| 30 | 
            +
                                    <label>text</label>
         | 
| 31 | 
            +
                                    <textarea class="form-control" id="inputText1" rows="3"
         | 
| 32 | 
            +
                                              oninput="updateLink()">你好,こんにちは</textarea>
         | 
| 33 | 
            +
                                </div>
         | 
| 34 | 
            +
                                <div class="form-group">
         | 
| 35 | 
            +
                                    <label>id</label>
         | 
| 36 | 
            +
                                    <select class="form-control" id="inputId1" oninput="updateLink()">
         | 
| 37 | 
            +
                                        {% for speaker in speakers["VITS"] %}
         | 
| 38 | 
            +
                                            {% if speaker["name"] == "雷电将军(雷神)" %}
         | 
| 39 | 
             
                                        <option value="{{ speaker["id"] }}" selected>{{ speaker["id"] }} | {{ speaker["name"] }}
         | 
| 40 | 
             
                                            | {{ speaker["lang"] }}</option>
         | 
| 41 | 
             
                                    {% else %}
         | 
| 42 | 
             
                                        <option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
         | 
| 43 | 
             
                                            | {{ speaker["lang"] }}</option>
         | 
| 44 | 
             
                                    {% endif %}
         | 
| 45 | 
            +
                                        {% endfor %}
         | 
| 46 | 
            +
                                    </select>
         | 
| 47 | 
            +
                                </div>
         | 
| 48 | 
            +
                            </form>
         | 
| 49 | 
            +
                            <form class="w-100">
         | 
| 50 | 
            +
                                <div class="row">
         | 
| 51 | 
            +
                                    <div class="col-md-4 form-group">
         | 
| 52 | 
            +
                                        <label data-toggle="tooltip" data-placement="top"
         | 
| 53 | 
            +
                                               title="默认为wav">format</label>
         | 
| 54 | 
            +
                                        <select class="form-control" id="inputFormat1" oninput="updateLink()">
         | 
| 55 | 
            +
                                            <option></option>
         | 
| 56 | 
            +
                                            <option>wav</option>
         | 
| 57 | 
            +
                                            <option>mp3</option>
         | 
| 58 | 
            +
                                            <option>ogg</option>
         | 
| 59 | 
            +
                                            <option>silk</option>
         | 
| 60 | 
            +
                                        </select>
         | 
| 61 | 
            +
                                    </div>
         | 
| 62 | 
            +
                                    <div class="col-md-4 form-group">
         | 
| 63 | 
            +
                                        <label data-toggle="tooltip" data-placement="top"
         | 
| 64 | 
            +
                                               title="自动识别语言auto:可识别的语言根据不同speaker而不同,方言无法自动识别。方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd">lang</label>
         | 
| 65 | 
            +
                                        <input type="text" class="form-control" id="inputLang1" oninput="updateLink()" value=""
         | 
| 66 | 
            +
                                               placeholder="auto"/>
         | 
| 67 | 
            +
                                    </div>
         | 
| 68 | 
            +
                                    <div class="col-md-4 form-group">
         | 
| 69 | 
            +
                                        <label data-toggle="tooltip" data-placement="top"
         | 
| 70 | 
            +
                                               title="调节语音长度,相当于调节语速,该数值越大语速越慢。">length</label>
         | 
| 71 | 
            +
                                        <input type="number" class="form-control" id="inputLength1" oninput="updateLink()" value=""
         | 
| 72 | 
            +
                                               placeholder="1" min="0" step="0.001"/>
         | 
| 73 | 
            +
                                    </div>
         | 
| 74 | 
            +
                                </div>
         | 
| 75 | 
            +
                                <div class="row">
         | 
| 76 | 
            +
                                    <div class="col-md-4 form-group">
         | 
| 77 | 
            +
                                        <label data-toggle="tooltip" data-placement="top"
         | 
| 78 | 
            +
                                               title="样本噪声,控制合成的随机性。">noise</label>
         | 
| 79 | 
            +
                                        <input type="number" class="form-control" id="inputNoise1" oninput="updateLink()" value=""
         | 
| 80 | 
            +
                                               placeholder="0.33" min="0" step="0.001"/>
         | 
| 81 | 
            +
                                    </div>
         | 
| 82 | 
            +
                                    <div class="col-md-4 form-group">
         | 
| 83 | 
            +
                                        <label data-toggle="tooltip" data-placement="top"
         | 
| 84 | 
            +
                                               title="随机时长预测器噪声,控制音素发音长度。">noisew</label>
         | 
| 85 | 
            +
                                        <input type="number" class="form-control" id="inputNoisew1" oninput="updateLink()" value=""
         | 
| 86 | 
            +
                                               placeholder="0.4" min="0" step="0.001"/>
         | 
| 87 | 
            +
                                    </div>
         | 
| 88 | 
            +
                                    <div class="col-md-4 form-group">
         | 
| 89 | 
            +
                                        <label data-toggle="tooltip" data-placement="top"
         | 
| 90 | 
            +
                                               title="按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。">max</label>
         | 
| 91 | 
            +
                                        <input type="number" class="form-control" id="inputMax1" oninput="updateLink()" value=""
         | 
| 92 | 
            +
                                               placeholder="50" step="1"/>
         | 
| 93 | 
            +
                                    </div>
         | 
| 94 | 
            +
                                </div>
         | 
| 95 | 
            +
                            </form>
         | 
| 96 | 
            +
             | 
| 97 | 
            +
             | 
| 98 | 
            +
                            <div class="flex flex-wrap  w-100"
         | 
| 99 | 
            +
                                 style="justify-content: center; align-items: center; height: 80px; margin-top: 20px; margin-bottom: 20px; border: 1px solid rgba(0,0,0,.125); border-radius: 0.25rem;">
         | 
| 100 | 
            +
                                <button type="button" class="btn btn-outline-secondary" onclick="setAudioSource()"
         | 
| 101 | 
            +
                                        style="margin-right: 10px">
         | 
| 102 | 
            +
                                    播放器生成
         | 
| 103 | 
            +
                                </button>
         | 
| 104 | 
            +
                                <audio id="audioPlayer1" controls>
         | 
| 105 | 
            +
                                    <source src="" type="audio/mp3"/>
         | 
| 106 | 
            +
                                    Your browser does not support the audio element.
         | 
| 107 | 
            +
                                </audio>
         | 
| 108 | 
            +
                                <div class="form-group form-check">
         | 
| 109 | 
            +
                                    <input type="checkbox" id="streaming1" onchange="updateLink()">
         | 
| 110 | 
            +
                                    <label class="form-check-label" data-toggle="tooltip" data-placement="top"
         | 
| 111 | 
            +
                                           title="按照max分段推理文本,推理好一段即输出,无需等待所有文本都推理完毕">流式响应</label>
         | 
| 112 | 
            +
                                </div>
         | 
| 113 | 
             
                            </div>
         | 
| 114 | 
            +
                        </div>
         | 
| 115 | 
            +
                        <div class="content-pane">
         | 
| 116 | 
            +
                            <form class="w-100">
         | 
| 117 | 
            +
                                <div class="form-group">
         | 
| 118 | 
            +
                                    <label>text</label>
         | 
| 119 | 
            +
                                    <textarea class="form-control" id="inputText2" rows="3"
         | 
| 120 | 
            +
                                              oninput="updateLink()">你好,こんにちは</textarea>
         | 
| 121 | 
            +
                                </div>
         | 
| 122 | 
            +
                                <div class="form-group">
         | 
| 123 | 
            +
                                    <label>id</label>
         | 
| 124 | 
            +
                                    <select class="form-control" id="inputId2" oninput="updateLink()">
         | 
| 125 | 
            +
                                        {% for speaker in speakers["W2V2-VITS"] %}
         | 
| 126 | 
            +
                                            <option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
         | 
| 127 | 
            +
                                                | {{ speaker["lang"] }}</option>
         | 
| 128 | 
            +
                                        {% endfor %}
         | 
| 129 | 
            +
                                    </select>
         | 
| 130 | 
            +
                                </div>
         | 
| 131 | 
            +
                                <div class="form-group mb-3">
         | 
| 132 | 
            +
                                    <label data-toggle="tooltip" data-placement="top"
         | 
| 133 | 
            +
                                           title="情感嵌入,{% if w2v2_emotion_count > 0 %}
         | 
| 134 | 
            +
                                                可输入范围是0-{{ w2v2_emotion_count-1 }}
         | 
| 135 | 
            +
                                            {% else %}
         | 
| 136 | 
            +
                                                未加载emotion
         | 
| 137 | 
            +
                                            {% endif %}">emotion</label>
         | 
| 138 | 
            +
                                    <input type="number" class="form-control" min="0" max="{{ w2v2_emotion_count-1 }}" step="1"
         | 
| 139 | 
            +
                                           id="emotion" value="0" oninput="updateLink()">
         | 
| 140 | 
            +
                                </div>
         | 
| 141 | 
            +
                            </form>
         | 
| 142 | 
            +
             | 
| 143 | 
            +
             | 
| 144 | 
            +
                            <form class="w-100">
         | 
| 145 | 
            +
                                <div class="row">
         | 
| 146 | 
            +
                                    <div class="col-md-4 form-group">
         | 
| 147 | 
            +
                                        <label data-toggle="tooltip" data-placement="top"
         | 
| 148 | 
            +
                                               title="默认为wav">format</label>
         | 
| 149 | 
            +
                                        <select class="form-control" id="inputFormat2" oninput="updateLink()">
         | 
| 150 | 
            +
                                            <option></option>
         | 
| 151 | 
            +
                                            <option>wav</option>
         | 
| 152 | 
            +
                                            <option>mp3</option>
         | 
| 153 | 
            +
                                            <option>ogg</option>
         | 
| 154 | 
            +
                                            <option>silk</option>
         | 
| 155 | 
            +
                                        </select>
         | 
| 156 | 
            +
                                    </div>
         | 
| 157 | 
            +
                                    <div class="col-md-4 form-group">
         | 
| 158 | 
            +
                                        <label data-toggle="tooltip" data-placement="top"
         | 
| 159 | 
            +
                                               title="自动识别语言auto:可识别的语言根据不同speaker而不同,方言无法自动识别。方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd">lang</label>
         | 
| 160 | 
            +
                                        <input type="text" class="form-control" id="inputLang2" oninput="updateLink()" value=""
         | 
| 161 | 
            +
                                               placeholder="auto"/>
         | 
| 162 | 
            +
                                    </div>
         | 
| 163 | 
            +
                                    <div class="col-md-4 form-group">
         | 
| 164 | 
            +
                                        <label data-toggle="tooltip" data-placement="top"
         | 
| 165 | 
            +
                                               title="调节语音长度,相当于调节语速,该数值越大语速越慢。">length</label>
         | 
| 166 | 
            +
                                        <input type="number" class="form-control" id="inputLength2" oninput="updateLink()" value=""
         | 
| 167 | 
            +
                                               placeholder="1" min="0" step="0.001"/>
         | 
| 168 | 
            +
                                    </div>
         | 
| 169 | 
            +
                                </div>
         | 
| 170 | 
            +
                                <div class="row">
         | 
| 171 | 
            +
                                    <div class="col-md-4 form-group">
         | 
| 172 | 
            +
                                        <label data-toggle="tooltip" data-placement="top"
         | 
| 173 | 
            +
                                               title="样本噪声,控制合成的随机性。">noise</label>
         | 
| 174 | 
            +
                                        <input type="number" class="form-control" id="inputNoise2" oninput="updateLink()" value=""
         | 
| 175 | 
            +
                                               placeholder="0.33" min="0" step="0.001"/>
         | 
| 176 | 
            +
                                    </div>
         | 
| 177 | 
            +
                                    <div class="col-md-4 form-group">
         | 
| 178 | 
            +
                                        <label data-toggle="tooltip" data-placement="top"
         | 
| 179 | 
            +
                                               title="随机时长预测器噪声,控制音素发音长度。">noisew</label>
         | 
| 180 | 
            +
                                        <input type="number" class="form-control" id="inputNoisew2" oninput="updateLink()" value=""
         | 
| 181 | 
            +
                                               placeholder="0.4" min="0" step="0.001"/>
         | 
| 182 | 
            +
                                    </div>
         | 
| 183 | 
            +
                                    <div class="col-md-4 form-group">
         | 
| 184 | 
            +
                                        <label data-toggle="tooltip" data-placement="top"
         | 
| 185 | 
            +
                                               title="按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。">max</label>
         | 
| 186 | 
            +
                                        <input type="number" class="form-control" id="inputMax2" oninput="updateLink()" value=""
         | 
| 187 | 
            +
                                               placeholder="50" step="1"/>
         | 
| 188 | 
            +
                                    </div>
         | 
| 189 | 
            +
                                </div>
         | 
| 190 | 
            +
                            </form>
         | 
| 191 | 
            +
             | 
| 192 | 
            +
                            <div class="flex flex-wrap  w-100"
         | 
| 193 | 
            +
                                 style="justify-content: center; align-items: center; height: 80px; margin-top: 20px; margin-bottom: 20px; border: 1px solid rgba(0,0,0,.125); border-radius: 0.25rem;">
         | 
| 194 | 
            +
                                <button type="button" class="btn btn-outline-secondary" onclick="setAudioSource()"
         | 
| 195 | 
            +
                                        style="margin-right: 10px">
         | 
| 196 | 
            +
                                    播放器生成
         | 
| 197 | 
            +
                                </button>
         | 
| 198 | 
            +
                                <audio id="audioPlayer2" controls>
         | 
| 199 | 
            +
                                    <source src="" type="audio/mp3"/>
         | 
| 200 | 
            +
                                    Your browser does not support the audio element.
         | 
| 201 | 
            +
                                </audio>
         | 
| 202 | 
            +
                                <div class="form-group form-check">
         | 
| 203 | 
            +
                                    <input type="checkbox" id="streaming2" onchange="updateLink()">
         | 
| 204 | 
            +
                                    <label class="form-check-label">流式响应</label>
         | 
| 205 | 
            +
                                </div>
         | 
| 206 | 
             
                            </div>
         | 
| 207 | 
            +
                        </div>
         | 
| 208 | 
             
                    </div>
         | 
|  | |
| 209 |  | 
| 210 | 
            +
                    <div class="mt-2">
         | 
| 211 | 
            +
                        {% if speakers_count == 0 %}
         | 
| 212 | 
            +
                            <div style="color: red;">未加载任何模型</div>
         | 
| 213 | 
            +
                        {% endif %}
         | 
| 214 | 
            +
                        <div>
         | 
| 215 | 
            +
                            <label>返回speakers(json):</label>
         | 
| 216 | 
            +
                            <a id="speakersLink" href="https://artrajz-vits-simple-api.hf.space/voice/speakers" target="_blank"
         | 
| 217 | 
            +
                               style="text-decoration: none; color: black">
         | 
| 218 | 
            +
                                https://artrajz-vits-simple-api.hf.space/voice/speakers
         | 
| 219 | 
            +
                            </a>
         | 
| 220 | 
            +
                        </div>
         | 
| 221 | 
            +
                        <div>
         | 
| 222 | 
            +
                            <label>API调用:</label>
         | 
| 223 | 
            +
                            <a id="vitsLink" href="https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164"
         | 
| 224 | 
            +
                               style="text-decoration: none; color: black">
         | 
| 225 | 
            +
                                https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164
         | 
| 226 | 
            +
                            </a>
         | 
| 227 | 
            +
                        </div>
         | 
| 228 | 
             
                    </div>
         | 
| 229 | 
            +
            		<h2>所有模型均为网络搜集,感谢模型原作者的付出!</h2>
         | 
| 230 | 
            +
            		<h2>请严格遵循模型原作者使用协议!</h2>
         | 
|  | |
|  | |
|  | |
|  | |
| 231 | 
             
                <p>
         | 
| 232 | 
             
                    Nene_Nanami_Rong_Tang:
         | 
| 233 | 
             
                    <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
         | 
|  | |
| 268 | 
             
                    vits_chinese:
         | 
| 269 | 
             
                    <a href="https://github.com/PlayVoice/vits_chinese" rel="noreferrer" target="_blank">PlayVoice/vits_chinese</a>
         | 
| 270 | 
             
                </p>
         | 
| 271 | 
            +
                </div>
         | 
| 272 | 
            +
                <br/>
         | 
| 273 |  | 
| 274 | 
             
            </main>
         | 
| 275 |  | 
|  | |
| 277 | 
             
            <script src="/static/js/bootstrap.bundle.min.js"></script>
         | 
| 278 |  | 
| 279 | 
             
            <script>
         | 
| 280 | 
            +
                $(function () {
         | 
| 281 | 
            +
                    $('[data-toggle="tooltip"]').tooltip()
         | 
| 282 | 
            +
                })
         | 
| 283 | 
            +
             | 
| 284 | 
             
                function getProtocol() {
         | 
| 285 | 
             
                    return 'https:' == location.protocol ? "https://" : "http://";
         | 
| 286 | 
             
                }
         | 
|  | |
| 291 | 
             
                }
         | 
| 292 |  | 
| 293 | 
             
                var baseUrl = getProtocol() + getUrl();
         | 
| 294 | 
            +
                var modelType = 1;
         | 
| 295 | 
            +
                var vitsStatus = false;
         | 
| 296 | 
            +
                var w2v2Status = false;
         | 
| 297 | 
            +
                {% if vits_speakers_count > 0 %}
         | 
| 298 | 
            +
                    vitsStatus = true;
         | 
| 299 | 
            +
                {% endif %}
         | 
| 300 | 
            +
                {% if w2v2_speakers_count > 0 %}
         | 
| 301 | 
            +
                    w2v2Status = true;
         | 
| 302 | 
            +
                {% endif %}
         | 
| 303 |  | 
| 304 | 
             
                setBaseUrl();
         | 
| 305 |  | 
| 306 | 
             
                function setBaseUrl() {
         | 
| 307 | 
            +
                    var text = document.getElementById("inputText" + modelType).value;
         | 
| 308 | 
            +
                    var id = document.getElementById("inputId" + modelType).value;
         | 
| 309 |  | 
| 310 | 
             
                    var vitsLink = document.getElementById("vitsLink");
         | 
| 311 | 
             
                    var speakersLink = document.getElementById("speakersLink");
         | 
|  | |
| 321 | 
             
                }
         | 
| 322 |  | 
| 323 | 
             
                function getLink() {
         | 
| 324 | 
            +
                    var text = document.getElementById("inputText" + modelType).value;
         | 
| 325 | 
            +
                    var id = document.getElementById("inputId" + modelType).value;
         | 
| 326 | 
            +
                    var format = document.getElementById("inputFormat" + modelType).value;
         | 
| 327 | 
            +
                    var lang = document.getElementById("inputLang" + modelType).value;
         | 
| 328 | 
            +
                    var length = document.getElementById("inputLength" + modelType).value;
         | 
| 329 | 
            +
                    var noise = document.getElementById("inputNoise" + modelType).value;
         | 
| 330 | 
            +
                    var noisew = document.getElementById("inputNoisew" + modelType).value;
         | 
| 331 | 
            +
                    var max = document.getElementById("inputMax" + modelType).value;
         | 
| 332 | 
            +
                    var streaming = document.getElementById('streaming' + modelType);
         | 
| 333 |  | 
| 334 | 
            +
                    if (modelType == 1) {
         | 
| 335 | 
            +
                        var url = baseUrl + "/voice/vits?text=" + text + "&id=" + id;
         | 
| 336 | 
            +
                    } else if (modelType == 2) {
         | 
| 337 | 
            +
                        var emotion = document.getElementById('emotion').value;
         | 
| 338 | 
            +
                        var url = baseUrl + "/voice/w2v2-vits?text=" + text + "&id=" + id + "&emotion=" + emotion;
         | 
| 339 | 
            +
                    }
         | 
| 340 | 
             
                    if (format != "") {
         | 
| 341 | 
             
                        url += "&format=" + format;
         | 
| 342 | 
             
                    }
         | 
|  | |
| 355 | 
             
                    if (max != "") {
         | 
| 356 | 
             
                        url += "&max=" + max;
         | 
| 357 | 
             
                    }
         | 
| 358 | 
            +
             | 
| 359 | 
             
                    if (streaming.checked) {
         | 
| 360 | 
             
                        url += '&streaming=true';
         | 
| 361 | 
             
                    }
         | 
|  | |
| 370 | 
             
                }
         | 
| 371 |  | 
| 372 | 
             
                function setAudioSource() {
         | 
| 373 | 
            +
                    if (modelType==1 && !vitsStatus){
         | 
| 374 | 
            +
                        alert("未加载VITS模型");
         | 
| 375 | 
            +
                        return;
         | 
| 376 | 
            +
                    }
         | 
| 377 | 
            +
                    if (modelType==2 && !w2v2Status){
         | 
| 378 | 
            +
                        alert("未加载W2V2-VITS模型");
         | 
| 379 | 
            +
                        return;
         | 
| 380 | 
            +
                    }
         | 
| 381 | 
             
                    var url = getLink();
         | 
| 382 | 
            +
                    var audioPlayer = document.getElementById("audioPlayer" + modelType);
         | 
| 383 | 
             
                    audioPlayer.src = url;
         | 
| 384 | 
             
                    audioPlayer.play();
         | 
| 385 | 
             
                }
         | 
| 386 |  | 
| 387 | 
            +
                function showContent(index) {
         | 
| 388 | 
            +
                    const panes = document.querySelectorAll(".content-pane");
         | 
| 389 | 
            +
                    const buttons = document.querySelectorAll(".tab-button");
         | 
| 390 | 
            +
                    modelType = index + 1;
         | 
| 391 | 
            +
             | 
| 392 | 
            +
                    for (let i = 0; i < panes.length; i++) {
         | 
| 393 | 
            +
                        if (i === index) {
         | 
| 394 | 
            +
                            panes[i].classList.add("active");
         | 
| 395 | 
            +
                            buttons[i].classList.add("active");
         | 
| 396 | 
            +
             | 
| 397 | 
            +
                        } else {
         | 
| 398 | 
            +
                            panes[i].classList.remove("active");
         | 
| 399 | 
            +
                            buttons[i].classList.remove("active");
         | 
| 400 | 
            +
                        }
         | 
| 401 | 
            +
                    }
         | 
| 402 | 
            +
                    updateLink();
         | 
| 403 | 
            +
                }
         | 
| 404 | 
             
            </script>
         | 
| 405 | 
             
            </body>
         | 
| 406 | 
             
            </html>
         | 
    	
        text/cleaners.py
    CHANGED
    
    | @@ -186,6 +186,21 @@ def cjke_cleaners2(text): | |
| 186 |  | 
| 187 |  | 
| 188 | 
             
            def cje_cleaners(text):
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 189 | 
             
                from text.mandarin import chinese_to_ipa
         | 
| 190 | 
             
                from text.japanese import japanese_to_ipa2
         | 
| 191 | 
             
                from text.english import english_to_ipa2
         | 
|  | |
| 186 |  | 
| 187 |  | 
| 188 | 
             
            def cje_cleaners(text):
         | 
| 189 | 
            +
                from text.mandarin import chinese_to_lazy_ipa
         | 
| 190 | 
            +
                from text.japanese import japanese_to_ipa
         | 
| 191 | 
            +
                from text.english import english_to_ipa2
         | 
| 192 | 
            +
                text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
         | 
| 193 | 
            +
                    'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn') + ' ', text)
         | 
| 194 | 
            +
                text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
         | 
| 195 | 
            +
                    'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz') + ' ', text)
         | 
| 196 | 
            +
                text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
         | 
| 197 | 
            +
                    'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u') + ' ', text)
         | 
| 198 | 
            +
                text = re.sub(r'\s+$', '', text)
         | 
| 199 | 
            +
                text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
         | 
| 200 | 
            +
                return text
         | 
| 201 | 
            +
             | 
| 202 | 
            +
             | 
| 203 | 
            +
            def cje_cleaners2(text):
         | 
| 204 | 
             
                from text.mandarin import chinese_to_ipa
         | 
| 205 | 
             
                from text.japanese import japanese_to_ipa2
         | 
| 206 | 
             
                from text.english import english_to_ipa2
         | 
    	
        text/mandarin.py
    CHANGED
    
    | @@ -1,5 +1,4 @@ | |
| 1 | 
            -
            import  | 
| 2 | 
            -
            import sys
         | 
| 3 | 
             
            import re
         | 
| 4 | 
             
            from pypinyin import lazy_pinyin, BOPOMOFO
         | 
| 5 | 
             
            import jieba
         | 
| @@ -7,7 +6,7 @@ import cn2an | |
| 7 | 
             
            import logging
         | 
| 8 |  | 
| 9 | 
             
            logging.getLogger('jieba').setLevel(logging.WARNING)
         | 
| 10 | 
            -
            jieba.set_dictionary( | 
| 11 | 
             
            jieba.initialize()
         | 
| 12 |  | 
| 13 | 
             
            # List of (Latin alphabet, bopomofo) pairs:
         | 
|  | |
| 1 | 
            +
            import config
         | 
|  | |
| 2 | 
             
            import re
         | 
| 3 | 
             
            from pypinyin import lazy_pinyin, BOPOMOFO
         | 
| 4 | 
             
            import jieba
         | 
|  | |
| 6 | 
             
            import logging
         | 
| 7 |  | 
| 8 | 
             
            logging.getLogger('jieba').setLevel(logging.WARNING)
         | 
| 9 | 
            +
            jieba.set_dictionary(config.ABS_PATH + '/jieba/dict.txt')
         | 
| 10 | 
             
            jieba.initialize()
         | 
| 11 |  | 
| 12 | 
             
            # List of (Latin alphabet, bopomofo) pairs:
         | 
    	
        utils/merge.py
    CHANGED
    
    | @@ -19,12 +19,13 @@ lang_dict = { | |
| 19 | 
             
                "cjke_cleaners": ["zh", "ja", "ko", "en"],
         | 
| 20 | 
             
                "cjke_cleaners2": ["zh", "ja", "ko", "en"],
         | 
| 21 | 
             
                "cje_cleaners": ["zh", "ja", "en"],
         | 
|  | |
| 22 | 
             
                "thai_cleaners": ["th"],
         | 
| 23 | 
             
                "shanghainese_cleaners": ["sh"],
         | 
| 24 | 
             
                "chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
         | 
| 25 | 
             
                                             "ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
         | 
| 26 | 
             
                                             "YB"],
         | 
| 27 | 
            -
                "bert_chinese_cleaners":["zh"],
         | 
| 28 | 
             
            }
         | 
| 29 |  | 
| 30 |  | 
| @@ -109,11 +110,16 @@ def merge_model(merging_model): | |
| 109 | 
             
                for obj_id, i in enumerate(vits_list):
         | 
| 110 | 
             
                    obj = vits(model=i[0], config=i[1], model_type="vits")
         | 
| 111 | 
             
                    lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
         | 
| 112 | 
            -
             | 
| 113 | 
            -
             | 
| 114 | 
            -
             | 
| 115 | 
            -
             | 
| 116 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 117 |  | 
| 118 | 
             
                # merge hubert-vits
         | 
| 119 | 
             
                if len(hubert_vits_list) != 0:
         | 
| @@ -136,6 +142,7 @@ def merge_model(merging_model): | |
| 136 | 
             
                        new_id += 1
         | 
| 137 |  | 
| 138 | 
             
                # merge w2v2-vits
         | 
|  | |
| 139 | 
             
                if len(w2v2_vits_list) != 0:
         | 
| 140 | 
             
                    if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) == None or check_is_none(config.DIMENSIONAL_EMOTION_NPY):
         | 
| 141 | 
             
                        raise ValueError(f"Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
         | 
| @@ -156,7 +163,8 @@ def merge_model(merging_model): | |
| 156 |  | 
| 157 | 
             
                voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj}
         | 
| 158 | 
             
                voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers}
         | 
| 159 | 
            -
             | 
| 160 | 
            -
                 | 
|  | |
| 161 |  | 
| 162 | 
             
                return tts
         | 
|  | |
| 19 | 
             
                "cjke_cleaners": ["zh", "ja", "ko", "en"],
         | 
| 20 | 
             
                "cjke_cleaners2": ["zh", "ja", "ko", "en"],
         | 
| 21 | 
             
                "cje_cleaners": ["zh", "ja", "en"],
         | 
| 22 | 
            +
                "cje_cleaners2": ["zh", "ja", "en"],
         | 
| 23 | 
             
                "thai_cleaners": ["th"],
         | 
| 24 | 
             
                "shanghainese_cleaners": ["sh"],
         | 
| 25 | 
             
                "chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
         | 
| 26 | 
             
                                             "ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
         | 
| 27 | 
             
                                             "YB"],
         | 
| 28 | 
            +
                "bert_chinese_cleaners": ["zh"],
         | 
| 29 | 
             
            }
         | 
| 30 |  | 
| 31 |  | 
|  | |
| 110 | 
             
                for obj_id, i in enumerate(vits_list):
         | 
| 111 | 
             
                    obj = vits(model=i[0], config=i[1], model_type="vits")
         | 
| 112 | 
             
                    lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
         | 
| 113 | 
            +
                    if isinstance(obj.get_speakers(), list):
         | 
| 114 | 
            +
                        for id, name in enumerate(obj.get_speakers()):
         | 
| 115 | 
            +
                            vits_obj.append([int(id), obj, obj_id])
         | 
| 116 | 
            +
                            vits_speakers.append({"id": new_id, "name": name, "lang": lang})
         | 
| 117 | 
            +
                            new_id += 1
         | 
| 118 | 
            +
                    else:
         | 
| 119 | 
            +
                        for id, (name, _) in enumerate(obj.get_speakers().items()):
         | 
| 120 | 
            +
                            vits_obj.append([int(id), obj, obj_id])
         | 
| 121 | 
            +
                            vits_speakers.append({"id": new_id, "name": name, "lang": lang})
         | 
| 122 | 
            +
                            new_id += 1
         | 
| 123 |  | 
| 124 | 
             
                # merge hubert-vits
         | 
| 125 | 
             
                if len(hubert_vits_list) != 0:
         | 
|  | |
| 142 | 
             
                        new_id += 1
         | 
| 143 |  | 
| 144 | 
             
                # merge w2v2-vits
         | 
| 145 | 
            +
                emotion_reference = None
         | 
| 146 | 
             
                if len(w2v2_vits_list) != 0:
         | 
| 147 | 
             
                    if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) == None or check_is_none(config.DIMENSIONAL_EMOTION_NPY):
         | 
| 148 | 
             
                        raise ValueError(f"Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
         | 
|  | |
| 163 |  | 
| 164 | 
             
                voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj}
         | 
| 165 | 
             
                voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers}
         | 
| 166 | 
            +
                w2v2_emotion_count = len(emotion_reference) if emotion_reference is not None else 0
         | 
| 167 | 
            +
                
         | 
| 168 | 
            +
                tts = TTS(voice_obj, voice_speakers, w2v2_emotion_count=w2v2_emotion_count)
         | 
| 169 |  | 
| 170 | 
             
                return tts
         | 
    	
        utils/nlp.py
    CHANGED
    
    | @@ -1,13 +1,7 @@ | |
| 1 | 
             
            import regex as re
         | 
| 2 | 
            -
            import logging
         | 
| 3 | 
             
            import config
         | 
| 4 | 
             
            from .utils import check_is_none
         | 
| 5 | 
            -
             | 
| 6 | 
            -
            logger = logging.getLogger("vits-simple-api")
         | 
| 7 | 
            -
            level = getattr(config, "LOGGING_LEVEL", "DEBUG")
         | 
| 8 | 
            -
            level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
         | 
| 9 | 
            -
                          'CRITICAL': logging.CRITICAL}
         | 
| 10 | 
            -
            logger.setLevel(level_dict[level])
         | 
| 11 |  | 
| 12 |  | 
| 13 | 
             
            def clasify_lang(text, speaker_lang):
         | 
|  | |
| 1 | 
             
            import regex as re
         | 
|  | |
| 2 | 
             
            import config
         | 
| 3 | 
             
            from .utils import check_is_none
         | 
| 4 | 
            +
            from logger import logger
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 5 |  | 
| 6 |  | 
| 7 | 
             
            def clasify_lang(text, speaker_lang):
         | 
    	
        vits-simple-api-installer-latest.sh
    CHANGED
    
    | @@ -12,7 +12,32 @@ if [ ! -f config.py ]; then | |
| 12 | 
             
                wget -O $INSTALL_DIR/config.py https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/config.py
         | 
| 13 | 
             
            fi
         | 
| 14 |  | 
| 15 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 16 |  | 
| 17 | 
             
            echo -e "${YELLOW}Pulling the image might take a while, so why not grab a cup of java first?\n${PLAIN}"
         | 
| 18 |  | 
|  | |
| 12 | 
             
                wget -O $INSTALL_DIR/config.py https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/config.py
         | 
| 13 | 
             
            fi
         | 
| 14 |  | 
| 15 | 
            +
            if [ ! -f gunicorn_config.py ]; then
         | 
| 16 | 
            +
                echo -e "${YELLOW}download config.py\n${PLAIN}"
         | 
| 17 | 
            +
                wget -O $INSTALL_DIR/gunicorn_config.py https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/gunicorn_config.py
         | 
| 18 | 
            +
            fi
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            while true; do
         | 
| 21 | 
            +
                echo -e "${GREEN}Which version of docker-compose.yaml do you want to download?"
         | 
| 22 | 
            +
                echo -e "1. docker-compose.yaml (CPU version)"
         | 
| 23 | 
            +
                echo -e "2. docker-compose-gpu.yaml (GPU version)"
         | 
| 24 | 
            +
                read -p "Enter your choice (1 or 2): " choice
         | 
| 25 | 
            +
                case $choice in
         | 
| 26 | 
            +
                    1)
         | 
| 27 | 
            +
                        echo -e "${YELLOW}Downloading docker-compose.yaml (CPU version)\n${PLAIN}"
         | 
| 28 | 
            +
                        wget -O $INSTALL_DIR/docker-compose.yaml https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/docker-compose.yaml
         | 
| 29 | 
            +
                        break
         | 
| 30 | 
            +
                        ;;
         | 
| 31 | 
            +
                    2)
         | 
| 32 | 
            +
                        echo -e "${YELLOW}Downloading docker-compose-gpu.yaml (GPU version)\n${PLAIN}"
         | 
| 33 | 
            +
                        wget -O $INSTALL_DIR/docker-compose.yaml https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/docker-compose-gpu.yaml
         | 
| 34 | 
            +
                        break
         | 
| 35 | 
            +
                        ;;
         | 
| 36 | 
            +
                    *)
         | 
| 37 | 
            +
                        echo -e "${RED}Invalid choice. Please enter 1 or 2.${PLAIN}"
         | 
| 38 | 
            +
                        ;;
         | 
| 39 | 
            +
                esac
         | 
| 40 | 
            +
            done
         | 
| 41 |  | 
| 42 | 
             
            echo -e "${YELLOW}Pulling the image might take a while, so why not grab a cup of java first?\n${PLAIN}"
         | 
| 43 |  | 
    	
        voice.py
    CHANGED
    
    | @@ -6,7 +6,6 @@ import numpy as np | |
| 6 | 
             
            import torch
         | 
| 7 | 
             
            import xml.etree.ElementTree as ET
         | 
| 8 | 
             
            import config
         | 
| 9 | 
            -
            import logging
         | 
| 10 | 
             
            import soundfile as sf
         | 
| 11 | 
             
            from torch import no_grad, LongTensor, inference_mode, FloatTensor
         | 
| 12 | 
             
            from io import BytesIO
         | 
| @@ -16,6 +15,7 @@ from mel_processing import spectrogram_torch | |
| 16 | 
             
            from text import text_to_sequence
         | 
| 17 | 
             
            from models import SynthesizerTrn
         | 
| 18 | 
             
            from utils import utils
         | 
|  | |
| 19 |  | 
| 20 | 
             
            # torch.set_num_threads(1) # 设置torch线程为1
         | 
| 21 | 
             
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         | 
| @@ -251,7 +251,7 @@ class vits: | |
| 251 |  | 
| 252 |  | 
| 253 | 
             
            class TTS:
         | 
| 254 | 
            -
                def __init__(self, voice_obj, voice_speakers):
         | 
| 255 | 
             
                    self._voice_obj = voice_obj
         | 
| 256 | 
             
                    self._voice_speakers = voice_speakers
         | 
| 257 | 
             
                    self._strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
         | 
| @@ -259,10 +259,11 @@ class TTS: | |
| 259 | 
             
                    self._vits_speakers_count = len(self._voice_speakers["VITS"])
         | 
| 260 | 
             
                    self._hubert_speakers_count = len(self._voice_speakers["HUBERT-VITS"])
         | 
| 261 | 
             
                    self._w2v2_speakers_count = len(self._voice_speakers["W2V2-VITS"])
         | 
|  | |
| 262 | 
             
                    self.dem = None
         | 
| 263 |  | 
| 264 | 
             
                    # Initialization information
         | 
| 265 | 
            -
                    self.logger =  | 
| 266 | 
             
                    self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
         | 
| 267 | 
             
                    self.logger.info(f'device:{device} device.type:{device.type}')
         | 
| 268 |  | 
| @@ -420,9 +421,7 @@ class TTS: | |
| 420 |  | 
| 421 | 
             
                    return voice_tasks, format
         | 
| 422 |  | 
| 423 | 
            -
                def create_ssml_infer_task(self,  | 
| 424 | 
            -
                    voice_tasks, format = self.parse_ssml(ssml)
         | 
| 425 | 
            -
             | 
| 426 | 
             
                    audios = []
         | 
| 427 | 
             
                    for voice in voice_tasks:
         | 
| 428 | 
             
                        if voice.get("break"):
         | 
| @@ -438,10 +437,10 @@ class TTS: | |
| 438 |  | 
| 439 | 
             
                    audio = np.concatenate(audios, axis=0)
         | 
| 440 | 
             
                    encoded_audio = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
         | 
| 441 | 
            -
                    if config | 
| 442 | 
             
                        path = f"{config.CACHE_PATH}/{fname}"
         | 
| 443 | 
             
                        utils.save_audio(encoded_audio.getvalue(), path)
         | 
| 444 | 
            -
                    return encoded_audio | 
| 445 |  | 
| 446 | 
             
                def vits_infer(self, voice, fname):
         | 
| 447 | 
             
                    format = voice.get("format", "wav")
         | 
| @@ -450,7 +449,7 @@ class TTS: | |
| 450 | 
             
                    sampling_rate = voice_obj.hps_ms.data.sampling_rate
         | 
| 451 | 
             
                    audio = voice_obj.get_audio(voice, auto_break=True)
         | 
| 452 | 
             
                    encoded_audio = self.encode(sampling_rate, audio, format)
         | 
| 453 | 
            -
                    if config | 
| 454 | 
             
                        path = f"{config.CACHE_PATH}/{fname}"
         | 
| 455 | 
             
                        utils.save_audio(encoded_audio.getvalue(), path)
         | 
| 456 | 
             
                    return encoded_audio
         | 
| @@ -466,9 +465,9 @@ class TTS: | |
| 466 | 
             
                        encoded_audio = self.encode(sampling_rate, chunk, format)
         | 
| 467 | 
             
                        for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio):
         | 
| 468 | 
             
                            yield encoded_audio_chunk
         | 
| 469 | 
            -
                        if config | 
| 470 | 
            -
                            audio.write(encoded_audio.getvalue()) | 
| 471 | 
            -
                    if config | 
| 472 | 
             
                        path = f"{config.CACHE_PATH}/{fname}"
         | 
| 473 | 
             
                        utils.save_audio(audio.getvalue(), path)
         | 
| 474 |  | 
| @@ -479,7 +478,7 @@ class TTS: | |
| 479 | 
             
                    sampling_rate = voice_obj.hps_ms.data.sampling_rate
         | 
| 480 | 
             
                    audio = voice_obj.get_audio(voice)
         | 
| 481 | 
             
                    encoded_audio = self.encode(sampling_rate, audio, format)
         | 
| 482 | 
            -
                    if config | 
| 483 | 
             
                        path = f"{config.CACHE_PATH}/{fname}"
         | 
| 484 | 
             
                        utils.save_audio(encoded_audio.getvalue(), path)
         | 
| 485 | 
             
                    return encoded_audio
         | 
| @@ -491,7 +490,7 @@ class TTS: | |
| 491 | 
             
                    sampling_rate = voice_obj.hps_ms.data.sampling_rate
         | 
| 492 | 
             
                    audio = voice_obj.get_audio(voice, auto_break=True)
         | 
| 493 | 
             
                    encoded_audio = self.encode(sampling_rate, audio, format)
         | 
| 494 | 
            -
                    if config | 
| 495 | 
             
                        path = f"{config.CACHE_PATH}/{fname}"
         | 
| 496 | 
             
                        utils.save_audio(encoded_audio.getvalue(), path)
         | 
| 497 | 
             
                    return encoded_audio
         | 
| @@ -515,7 +514,7 @@ class TTS: | |
| 515 |  | 
| 516 | 
             
                    audio = voice_obj.voice_conversion(voice)
         | 
| 517 | 
             
                    encoded_audio = self.encode(sampling_rate, audio, format)
         | 
| 518 | 
            -
                    if config | 
| 519 | 
             
                        path = f"{config.CACHE_PATH}/{fname}"
         | 
| 520 | 
             
                        utils.save_audio(encoded_audio.getvalue(), path)
         | 
| 521 | 
             
                    return encoded_audio
         | 
|  | |
| 6 | 
             
            import torch
         | 
| 7 | 
             
            import xml.etree.ElementTree as ET
         | 
| 8 | 
             
            import config
         | 
|  | |
| 9 | 
             
            import soundfile as sf
         | 
| 10 | 
             
            from torch import no_grad, LongTensor, inference_mode, FloatTensor
         | 
| 11 | 
             
            from io import BytesIO
         | 
|  | |
| 15 | 
             
            from text import text_to_sequence
         | 
| 16 | 
             
            from models import SynthesizerTrn
         | 
| 17 | 
             
            from utils import utils
         | 
| 18 | 
            +
            from logger import logger
         | 
| 19 |  | 
| 20 | 
             
            # torch.set_num_threads(1) # 设置torch线程为1
         | 
| 21 | 
             
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         | 
|  | |
| 251 |  | 
| 252 |  | 
| 253 | 
             
            class TTS:
         | 
| 254 | 
            +
                def __init__(self, voice_obj, voice_speakers, w2v2_emotion_count=0):
         | 
| 255 | 
             
                    self._voice_obj = voice_obj
         | 
| 256 | 
             
                    self._voice_speakers = voice_speakers
         | 
| 257 | 
             
                    self._strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
         | 
|  | |
| 259 | 
             
                    self._vits_speakers_count = len(self._voice_speakers["VITS"])
         | 
| 260 | 
             
                    self._hubert_speakers_count = len(self._voice_speakers["HUBERT-VITS"])
         | 
| 261 | 
             
                    self._w2v2_speakers_count = len(self._voice_speakers["W2V2-VITS"])
         | 
| 262 | 
            +
                    self._w2v2_emotion_count = w2v2_emotion_count
         | 
| 263 | 
             
                    self.dem = None
         | 
| 264 |  | 
| 265 | 
             
                    # Initialization information
         | 
| 266 | 
            +
                    self.logger = logger
         | 
| 267 | 
             
                    self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
         | 
| 268 | 
             
                    self.logger.info(f'device:{device} device.type:{device.type}')
         | 
| 269 |  | 
|  | |
| 421 |  | 
| 422 | 
             
                    return voice_tasks, format
         | 
| 423 |  | 
| 424 | 
            +
                def create_ssml_infer_task(self, voice_tasks, format, fname):
         | 
|  | |
|  | |
| 425 | 
             
                    audios = []
         | 
| 426 | 
             
                    for voice in voice_tasks:
         | 
| 427 | 
             
                        if voice.get("break"):
         | 
|  | |
| 437 |  | 
| 438 | 
             
                    audio = np.concatenate(audios, axis=0)
         | 
| 439 | 
             
                    encoded_audio = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
         | 
| 440 | 
            +
                    if getattr(config, "SAVE_AUDIO", False):
         | 
| 441 | 
             
                        path = f"{config.CACHE_PATH}/{fname}"
         | 
| 442 | 
             
                        utils.save_audio(encoded_audio.getvalue(), path)
         | 
| 443 | 
            +
                    return encoded_audio
         | 
| 444 |  | 
| 445 | 
             
                def vits_infer(self, voice, fname):
         | 
| 446 | 
             
                    format = voice.get("format", "wav")
         | 
|  | |
| 449 | 
             
                    sampling_rate = voice_obj.hps_ms.data.sampling_rate
         | 
| 450 | 
             
                    audio = voice_obj.get_audio(voice, auto_break=True)
         | 
| 451 | 
             
                    encoded_audio = self.encode(sampling_rate, audio, format)
         | 
| 452 | 
            +
                    if getattr(config, "SAVE_AUDIO", False):
         | 
| 453 | 
             
                        path = f"{config.CACHE_PATH}/{fname}"
         | 
| 454 | 
             
                        utils.save_audio(encoded_audio.getvalue(), path)
         | 
| 455 | 
             
                    return encoded_audio
         | 
|  | |
| 465 | 
             
                        encoded_audio = self.encode(sampling_rate, chunk, format)
         | 
| 466 | 
             
                        for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio):
         | 
| 467 | 
             
                            yield encoded_audio_chunk
         | 
| 468 | 
            +
                        if getattr(config, "SAVE_AUDIO", False):
         | 
| 469 | 
            +
                            audio.write(encoded_audio.getvalue())
         | 
| 470 | 
            +
                    if getattr(config, "SAVE_AUDIO", False):
         | 
| 471 | 
             
                        path = f"{config.CACHE_PATH}/{fname}"
         | 
| 472 | 
             
                        utils.save_audio(audio.getvalue(), path)
         | 
| 473 |  | 
|  | |
| 478 | 
             
                    sampling_rate = voice_obj.hps_ms.data.sampling_rate
         | 
| 479 | 
             
                    audio = voice_obj.get_audio(voice)
         | 
| 480 | 
             
                    encoded_audio = self.encode(sampling_rate, audio, format)
         | 
| 481 | 
            +
                    if getattr(config, "SAVE_AUDIO", False):
         | 
| 482 | 
             
                        path = f"{config.CACHE_PATH}/{fname}"
         | 
| 483 | 
             
                        utils.save_audio(encoded_audio.getvalue(), path)
         | 
| 484 | 
             
                    return encoded_audio
         | 
|  | |
| 490 | 
             
                    sampling_rate = voice_obj.hps_ms.data.sampling_rate
         | 
| 491 | 
             
                    audio = voice_obj.get_audio(voice, auto_break=True)
         | 
| 492 | 
             
                    encoded_audio = self.encode(sampling_rate, audio, format)
         | 
| 493 | 
            +
                    if getattr(config, "SAVE_AUDIO", False):
         | 
| 494 | 
             
                        path = f"{config.CACHE_PATH}/{fname}"
         | 
| 495 | 
             
                        utils.save_audio(encoded_audio.getvalue(), path)
         | 
| 496 | 
             
                    return encoded_audio
         | 
|  | |
| 514 |  | 
| 515 | 
             
                    audio = voice_obj.voice_conversion(voice)
         | 
| 516 | 
             
                    encoded_audio = self.encode(sampling_rate, audio, format)
         | 
| 517 | 
            +
                    if getattr(config, "SAVE_AUDIO", False):
         | 
| 518 | 
             
                        path = f"{config.CACHE_PATH}/{fname}"
         | 
| 519 | 
             
                        utils.save_audio(encoded_audio.getvalue(), path)
         | 
| 520 | 
             
                    return encoded_audio
         | 
 
			
