aroluo committed (verified)
Commit 64d4f68 · 1 Parent(s): ab144d7

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +15 -0
  2. .gitignore +174 -0
  3. LICENSE +201 -0
  4. README.md +342 -6
  5. cli/SparkTTS.py +236 -0
  6. cli/inference.py +116 -0
  7. example/infer.sh +47 -0
  8. example/prompt_audio.wav +3 -0
  9. requirements.txt +11 -0
  10. runtime/triton_trtllm/Dockerfile.server +5 -0
  11. runtime/triton_trtllm/README.md +94 -0
  12. runtime/triton_trtllm/client_grpc.py +831 -0
  13. runtime/triton_trtllm/client_http.py +165 -0
  14. runtime/triton_trtllm/docker-compose.yml +20 -0
  15. runtime/triton_trtllm/model_repo/audio_tokenizer/1/model.py +137 -0
  16. runtime/triton_trtllm/model_repo/audio_tokenizer/config.pbtxt +58 -0
  17. runtime/triton_trtllm/model_repo/spark_tts/1/model.py +404 -0
  18. runtime/triton_trtllm/model_repo/spark_tts/config.pbtxt +86 -0
  19. runtime/triton_trtllm/model_repo/tensorrt_llm/1/.gitkeep +0 -0
  20. runtime/triton_trtllm/model_repo/tensorrt_llm/config.pbtxt +857 -0
  21. runtime/triton_trtllm/model_repo/vocoder/1/model.py +106 -0
  22. runtime/triton_trtllm/model_repo/vocoder/config.pbtxt +53 -0
  23. runtime/triton_trtllm/run.sh +109 -0
  24. runtime/triton_trtllm/scripts/convert_checkpoint.py +335 -0
  25. runtime/triton_trtllm/scripts/fill_template.py +70 -0
  26. sparktts/models/audio_tokenizer.py +163 -0
  27. sparktts/models/bicodec.py +247 -0
  28. sparktts/modules/blocks/layers.py +73 -0
  29. sparktts/modules/blocks/samper.py +115 -0
  30. sparktts/modules/blocks/vocos.py +373 -0
  31. sparktts/modules/encoder_decoder/feat_decoder.py +115 -0
  32. sparktts/modules/encoder_decoder/feat_encoder.py +105 -0
  33. sparktts/modules/encoder_decoder/wave_generator.py +88 -0
  34. sparktts/modules/fsq/finite_scalar_quantization.py +251 -0
  35. sparktts/modules/fsq/residual_fsq.py +355 -0
  36. sparktts/modules/speaker/ecapa_tdnn.py +267 -0
  37. sparktts/modules/speaker/perceiver_encoder.py +360 -0
  38. sparktts/modules/speaker/pooling_layers.py +298 -0
  39. sparktts/modules/speaker/speaker_encoder.py +136 -0
  40. sparktts/modules/vq/factorized_vector_quantize.py +187 -0
  41. sparktts/utils/__init__.py +0 -0
  42. sparktts/utils/audio.py +271 -0
  43. sparktts/utils/file.py +221 -0
  44. sparktts/utils/parse_options.sh +97 -0
  45. sparktts/utils/token_parser.py +187 -0
  46. src/demos/trump/trump_en.wav +3 -0
  47. src/demos/zhongli/zhongli_en.wav +3 -0
  48. src/demos/余承东/yuchengdong_zh.wav +3 -0
  49. src/demos/刘德华/dehua_zh.wav +3 -0
  50. src/demos/哪吒/nezha_zh.wav +3 -0
.gitattributes CHANGED
@@ -33,3 +33,18 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ example/prompt_audio.wav filter=lfs diff=lfs merge=lfs -text
37
+ src/demos/trump/trump_en.wav filter=lfs diff=lfs merge=lfs -text
38
+ src/demos/zhongli/zhongli_en.wav filter=lfs diff=lfs merge=lfs -text
39
+ src/demos/余承东/yuchengdong_zh.wav filter=lfs diff=lfs merge=lfs -text
40
+ src/demos/刘德华/dehua_zh.wav filter=lfs diff=lfs merge=lfs -text
41
+ src/demos/哪吒/nezha_zh.wav filter=lfs diff=lfs merge=lfs -text
42
+ src/demos/徐志胜/zhisheng_zh.wav filter=lfs diff=lfs merge=lfs -text
43
+ src/demos/李靖/lijing_zh.wav filter=lfs diff=lfs merge=lfs -text
44
+ src/demos/杨澜/yanglan_zh.wav filter=lfs diff=lfs merge=lfs -text
45
+ src/demos/马云/mayun_zh.wav filter=lfs diff=lfs merge=lfs -text
46
+ src/demos/鲁豫/luyu_zh.wav filter=lfs diff=lfs merge=lfs -text
47
+ src/figures/infer_control.png filter=lfs diff=lfs merge=lfs -text
48
+ src/figures/infer_voice_cloning.png filter=lfs diff=lfs merge=lfs -text
49
+ src/logo/mobvoi.png filter=lfs diff=lfs merge=lfs -text
50
+ src/logo/SparkTTS.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,174 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ pretrained_models/
6
+ results/
7
+ demo/
8
+ # C extensions
9
+ *.so
10
+ .gradio/
11
+ # Distribution / packaging
12
+ .Python
13
+ build/
14
+ develop-eggs/
15
+ dist/
16
+ downloads/
17
+ eggs/
18
+ .eggs/
19
+ lib/
20
+ lib64/
21
+ parts/
22
+ sdist/
23
+ var/
24
+ wheels/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+ webui_test.py
31
+
32
+ # PyInstaller
33
+ # Usually these files are written by a python script from a template
34
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Installer logs
39
+ pip-log.txt
40
+ pip-delete-this-directory.txt
41
+
42
+ # Unit test / coverage reports
43
+ htmlcov/
44
+ .tox/
45
+ .nox/
46
+ .coverage
47
+ .coverage.*
48
+ .cache
49
+ nosetests.xml
50
+ coverage.xml
51
+ *.cover
52
+ *.py,cover
53
+ .hypothesis/
54
+ .pytest_cache/
55
+ cover/
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ .pybuilder/
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ # For a library or package, you might want to ignore these files since the code is
90
+ # intended to run in multiple environments; otherwise, check them in:
91
+ # .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # UV
101
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
102
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
103
+ # commonly ignored for libraries.
104
+ #uv.lock
105
+
106
+ # poetry
107
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
109
+ # commonly ignored for libraries.
110
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111
+ #poetry.lock
112
+
113
+ # pdm
114
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115
+ #pdm.lock
116
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117
+ # in version control.
118
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
119
+ .pdm.toml
120
+ .pdm-python
121
+ .pdm-build/
122
+
123
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
124
+ __pypackages__/
125
+
126
+ # Celery stuff
127
+ celerybeat-schedule
128
+ celerybeat.pid
129
+
130
+ # SageMath parsed files
131
+ *.sage.py
132
+
133
+ # Environments
134
+ .env
135
+ .venv
136
+ env/
137
+ venv/
138
+ ENV/
139
+ env.bak/
140
+ venv.bak/
141
+
142
+ # Spyder project settings
143
+ .spyderproject
144
+ .spyproject
145
+
146
+ # Rope project settings
147
+ .ropeproject
148
+
149
+ # mkdocs documentation
150
+ /site
151
+
152
+ # mypy
153
+ .mypy_cache/
154
+ .dmypy.json
155
+ dmypy.json
156
+
157
+ # Pyre type checker
158
+ .pyre/
159
+
160
+ # pytype static type analyzer
161
+ .pytype/
162
+
163
+ # Cython debug symbols
164
+ cython_debug/
165
+
166
+ # PyCharm
167
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
168
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
169
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
170
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
171
+ #.idea/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,12 +1,348 @@
1
  ---
2
  title: SPKTTS
3
- emoji: 💻
4
- colorFrom: yellow
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 5.25.2
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
  title: SPKTTS
3
+ app_file: webui.py
 
 
4
  sdk: gradio
5
  sdk_version: 5.25.2
 
 
6
  ---
7
+ <div align="center">
8
+ <h1>
9
+ Spark-TTS
10
+ </h1>
11
+ <p>
12
+ Official PyTorch code for inference of <br>
13
+ <b><em>Spark-TTS: An Efficient LLM-Based Text-to-Speech Model with Single-Stream Decoupled Speech Tokens</em></b>
14
+ </p>
15
+ <p>
16
+ <img src="src/logo/SparkTTS.jpg" alt="Spark-TTS Logo" style="width: 200px; height: 200px;">
17
+ </p>
18
+ <p>
19
+ <img src="src/logo/HKUST.jpg" alt="Institution 1" style="width: 200px; height: 60px;">
20
+ <img src="src/logo/mobvoi.jpg" alt="Institution 2" style="width: 200px; height: 60px;">
21
+ <img src="src/logo/SJU.jpg" alt="Institution 3" style="width: 200px; height: 60px;">
22
+ </p>
23
+ <p>
24
+ <img src="src/logo/NTU.jpg" alt="Institution 4" style="width: 200px; height: 60px;">
25
+ <img src="src/logo/NPU.jpg" alt="Institution 5" style="width: 200px; height: 60px;">
26
+ <img src="src/logo/SparkAudio2.jpg" alt="Institution 6" style="width: 200px; height: 60px;">
27
+ </p>
28
+ <p>
29
+ </p>
30
+ <a href="https://arxiv.org/pdf/2503.01710"><img src="https://img.shields.io/badge/Paper-ArXiv-red" alt="paper"></a>
31
+ <a href="https://sparkaudio.github.io/spark-tts/"><img src="https://img.shields.io/badge/Demo-Page-lightgrey" alt="version"></a>
32
+ <a href="https://huggingface.co/SparkAudio/Spark-TTS-0.5B"><img src="https://img.shields.io/badge/Hugging%20Face-Model%20Page-yellow" alt="Hugging Face"></a>
33
+ <a href="https://github.com/SparkAudio/Spark-TTS"><img src="https://img.shields.io/badge/Platform-linux-lightgrey" alt="version"></a>
34
+ <a href="https://github.com/SparkAudio/Spark-TTS"><img src="https://img.shields.io/badge/Python-3.12+-orange" alt="version"></a>
35
+ <a href="https://github.com/SparkAudio/Spark-TTS"><img src="https://img.shields.io/badge/PyTorch-2.5+-brightgreen" alt="python"></a>
36
+ <a href="https://github.com/SparkAudio/Spark-TTS"><img src="https://img.shields.io/badge/License-Apache%202.0-blue.svg" alt="mit"></a>
37
+ </div>
38
 
39
+
40
+ ## Spark-TTS 🔥
41
+
42
+ ### Overview
43
+
44
+ Spark-TTS is an advanced text-to-speech system that uses the power of large language models (LLMs) for highly accurate and natural-sounding voice synthesis. It is designed to be efficient, flexible, and powerful for both research and production use.
45
+
46
+ ### Key Features
47
+
48
+ - **Simplicity and Efficiency**: Built entirely on Qwen2.5, Spark-TTS eliminates the need for additional generation models like flow matching. Instead of relying on separate models to generate acoustic features, it directly reconstructs audio from the code predicted by the LLM. This approach streamlines the process, improving efficiency and reducing complexity.
49
+ - **High-Quality Voice Cloning**: Supports zero-shot voice cloning, which means it can replicate a speaker's voice even without specific training data for that voice. This is ideal for cross-lingual and code-switching scenarios, allowing for seamless transitions between languages and voices without requiring separate training for each one.
50
+ - **Bilingual Support**: Supports both Chinese and English, and is capable of zero-shot voice cloning for cross-lingual and code-switching scenarios, enabling the model to synthesize speech in multiple languages with high naturalness and accuracy.
51
+ - **Controllable Speech Generation**: Supports creating virtual speakers by adjusting parameters such as gender, pitch, and speaking rate.
52
+
53
+ ---
54
+
55
+ <table align="center">
56
+ <tr>
57
+ <td align="center"><b>Inference Overview of Voice Cloning</b><br><img src="src/figures/infer_voice_cloning.png" width="80%" /></td>
58
+ </tr>
59
+ <tr>
60
+ <td align="center"><b>Inference Overview of Controlled Generation</b><br><img src="src/figures/infer_control.png" width="80%" /></td>
61
+ </tr>
62
+ </table>
63
+
64
+
65
+ ## 🚀 News
66
+
67
+ - **[2025-03-04]** Our paper on this project has been published! You can read it here: [Spark-TTS](https://arxiv.org/pdf/2503.01710).
68
+
69
+ - **[2025-03-12]** Nvidia Triton Inference Serving is now supported. See the Runtime section below for more details.
70
+
71
+
72
+ ## Install
73
+ **Clone and Install**
74
+
75
+ Here are instructions for installing on Linux. If you're on Windows, please refer to the [Windows Installation Guide](https://github.com/SparkAudio/Spark-TTS/issues/5).
76
+ *(Thanks to [@AcTePuKc](https://github.com/AcTePuKc) for the detailed Windows instructions!)*
77
+
78
+
79
+ - Clone the repo
80
+ ``` sh
81
+ git clone https://github.com/SparkAudio/Spark-TTS.git
82
+ cd Spark-TTS
83
+ ```
84
+
85
+ - Install Conda: please see https://docs.conda.io/en/latest/miniconda.html
86
+ - Create Conda env:
87
+
88
+ ``` sh
89
+ conda create -n sparktts -y python=3.12
90
+ conda activate sparktts
91
+ pip install -r requirements.txt
92
+ # If you are in mainland China, you can set the mirror as follows:
93
+ pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
94
+ ```
95
+
96
+ **Model Download**
97
+
98
+ Download via python:
99
+ ```python
100
+ from huggingface_hub import snapshot_download
101
+
102
+ snapshot_download("SparkAudio/Spark-TTS-0.5B", local_dir="pretrained_models/Spark-TTS-0.5B")
103
+ ```
104
+
105
+ Download via git clone:
106
+ ```sh
107
+ mkdir -p pretrained_models
108
+
109
+ # Make sure you have git-lfs installed (https://git-lfs.com)
110
+ git lfs install
111
+
112
+ git clone https://huggingface.co/SparkAudio/Spark-TTS-0.5B pretrained_models/Spark-TTS-0.5B
113
+ ```
114
+
115
+ **Basic Usage**
116
+
117
+ You can simply run the demo with the following commands:
118
+ ``` sh
119
+ cd example
120
+ bash infer.sh
121
+ ```
122
+
123
+ Alternatively, you can directly execute the following command in the command line to perform inference:
124
+
125
+ ``` sh
126
+ python -m cli.inference \
127
+ --text "text to synthesis." \
128
+ --device 0 \
129
+ --save_dir "path/to/save/audio" \
130
+ --model_dir pretrained_models/Spark-TTS-0.5B \
131
+ --prompt_text "transcript of the prompt audio" \
132
+ --prompt_speech_path "path/to/prompt_audio"
133
+ ```
134
+
135
+ **Web UI Usage**
136
+
137
+ You can start the web UI by running `python webui.py --device 0`, which allows you to perform Voice Cloning and Voice Creation. Voice Cloning supports uploading reference audio or recording audio directly.
138
+
139
+
140
+ | **Voice Cloning** | **Voice Creation** |
141
+ |:-------------------:|:-------------------:|
142
+ | ![Image 1](src/figures/gradio_TTS.png) | ![Image 2](src/figures/gradio_control.png) |
143
+
144
+
145
+ **Optional Methods**
146
+
147
+ For additional CLI and Web UI methods, including alternative implementations and extended functionalities, you can refer to:
148
+
149
+ - [CLI and UI by AcTePuKc](https://github.com/SparkAudio/Spark-TTS/issues/10)
150
+
151
+
152
+ ## Runtime
153
+
154
+ **Nvidia Triton Inference Serving**
155
+
156
+ We now provide a reference for deploying Spark-TTS with Nvidia Triton and TensorRT-LLM. The table below presents benchmark results on a single L20 GPU, using 26 different prompt_audio/target_text pairs (totalling 169 seconds of audio):
157
+
158
+ | Model | Note | Concurrency | Avg Latency | RTF |
159
+ |-------|-----------|-----------------------|---------|--|
160
+ | Spark-TTS-0.5B | [Code Commit](https://github.com/SparkAudio/Spark-TTS/tree/4d769ff782a868524f29e0be851ca64f8b22ebf1/runtime/triton_trtllm) | 1 | 876.24 ms | 0.1362|
161
+ | Spark-TTS-0.5B | [Code Commit](https://github.com/SparkAudio/Spark-TTS/tree/4d769ff782a868524f29e0be851ca64f8b22ebf1/runtime/triton_trtllm) | 2 | 920.97 ms | 0.0737|
162
+ | Spark-TTS-0.5B | [Code Commit](https://github.com/SparkAudio/Spark-TTS/tree/4d769ff782a868524f29e0be851ca64f8b22ebf1/runtime/triton_trtllm) | 4 | 1611.51 ms | 0.0704|
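+ 
+ *RTF is the real-time factor: synthesis processing time divided by the duration of the generated audio. As a rough check, an RTF of 0.0737 over the 169-second test set corresponds to about 169 s × 0.0737 ≈ 12.5 s of processing time.*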
163
+
164
+
165
+ Please see the detailed instructions in [runtime/triton_trtllm/README.md](runtime/triton_trtllm/README.md) for more information.
166
+
167
+
168
+ ## **Demos**
169
+
170
+ Here are some demos generated by Spark-TTS using zero-shot voice cloning. For more demos, visit our [demo page](https://sparkaudio.github.io/spark-tts/).
171
+
172
+ ---
173
+
174
+ <table>
175
+ <tr>
176
+ <td align="center">
177
+
178
+ **Donald Trump**
179
+ </td>
180
+ <td align="center">
181
+
182
+ **Zhongli (Genshin Impact)**
183
+ </td>
184
+ </tr>
185
+
186
+ <tr>
187
+ <td align="center">
188
+
189
+ [Donald Trump](https://github.com/user-attachments/assets/fb225780-d9fe-44b2-9b2e-54390cb3d8fd)
190
+
191
+ </td>
192
+ <td align="center">
193
+
194
+ [Zhongli](https://github.com/user-attachments/assets/80eeb9c7-0443-4758-a1ce-55ac59e64bd6)
195
+
196
+ </td>
197
+ </tr>
198
+ </table>
199
+
200
+ ---
201
+
202
+ <table>
203
+
204
+ <tr>
205
+ <td align="center">
206
+
207
+ **陈鲁豫 Chen Luyu**
208
+ </td>
209
+ <td align="center">
210
+
211
+ **杨澜 Yang Lan**
212
+ </td>
213
+ </tr>
214
+
215
+ <tr>
216
+ <td align="center">
217
+
218
+ [陈鲁豫Chen_Luyu.webm](https://github.com/user-attachments/assets/5c6585ae-830d-47b1-992d-ee3691f48cf4)
219
+ </td>
220
+ <td align="center">
221
+
222
+ [Yang_Lan.webm](https://github.com/user-attachments/assets/2fb3d00c-abc3-410e-932f-46ba204fb1d7)
223
+ </td>
224
+ </tr>
225
+ </table>
226
+
227
+ ---
228
+
229
+
230
+ <table>
231
+ <tr>
232
+ <td align="center">
233
+
234
+ **余承东 Richard Yu**
235
+ </td>
236
+ <td align="center">
237
+
238
+ **马云 Jack Ma**
239
+ </td>
240
+ </tr>
241
+
242
+ <tr>
243
+ <td align="center">
244
+
245
+ [Yu_Chengdong.webm](https://github.com/user-attachments/assets/78feca02-84bb-4d3a-a770-0cfd02f1a8da)
246
+
247
+ </td>
248
+ <td align="center">
249
+
250
+ [Ma_Yun.webm](https://github.com/user-attachments/assets/2d54e2eb-cec4-4c2f-8c84-8fe587da321b)
251
+
252
+ </td>
253
+ </tr>
254
+ </table>
255
+
256
+ ---
257
+
258
+
259
+ <table>
260
+ <tr>
261
+ <td align="center">
262
+
263
+ **刘德华 Andy Lau**
264
+ </td>
265
+ <td align="center">
266
+
267
+ **徐志胜 Xu Zhisheng**
268
+ </td>
269
+ </tr>
270
+
271
+ <tr>
272
+ <td align="center">
273
+
274
+ [Liu_Dehua.webm](https://github.com/user-attachments/assets/195b5e97-1fee-4955-b954-6d10fa04f1d7)
275
+
276
+ </td>
277
+ <td align="center">
278
+
279
+ [Xu_Zhisheng.webm](https://github.com/user-attachments/assets/dd812af9-76bd-4e26-9988-9cdb9ccbb87b)
280
+
281
+ </td>
282
+ </tr>
283
+ </table>
284
+
285
+
286
+ ---
287
+
288
+ <table>
289
+ <tr>
290
+ <td align="center">
291
+
292
+ **哪吒 Nezha**
293
+ </td>
294
+ <td align="center">
295
+
296
+ **李靖 Li Jing**
297
+ </td>
298
+ </tr>
299
+
300
+ <tr>
301
+ <td align="center">
302
+
303
+ [Ne_Zha.webm](https://github.com/user-attachments/assets/8c608037-a17a-46d4-8588-4db34b49ed1d)
304
+ </td>
305
+ <td align="center">
306
+
307
+ [Li_Jing.webm](https://github.com/user-attachments/assets/aa8ba091-097c-4156-b4e3-6445da5ea101)
308
+
309
+ </td>
310
+ </tr>
311
+ </table>
312
+
313
+
314
+ ## To-Do List
315
+
316
+ - [x] Release the Spark-TTS paper.
317
+ - [ ] Release the training code.
318
+ - [ ] Release the training dataset, VoxBox.
319
+
320
+
321
+ ## Citation
322
+
323
+ ```
324
+ @misc{wang2025sparktts,
325
+ title={Spark-TTS: An Efficient LLM-Based Text-to-Speech Model with Single-Stream Decoupled Speech Tokens},
326
+ author={Xinsheng Wang and Mingqi Jiang and Ziyang Ma and Ziyu Zhang and Songxiang Liu and Linqin Li and Zheng Liang and Qixi Zheng and Rui Wang and Xiaoqin Feng and Weizhen Bian and Zhen Ye and Sitong Cheng and Ruibin Yuan and Zhixian Zhao and Xinfa Zhu and Jiahao Pan and Liumeng Xue and Pengcheng Zhu and Yunlin Chen and Zhifei Li and Xie Chen and Lei Xie and Yike Guo and Wei Xue},
327
+ year={2025},
328
+ eprint={2503.01710},
329
+ archivePrefix={arXiv},
330
+ primaryClass={cs.SD},
331
+ url={https://arxiv.org/abs/2503.01710},
332
+ }
333
+ ```
334
+
335
+
336
+ ## ⚠️ Usage Disclaimer
337
+
338
+ This project provides a zero-shot voice cloning TTS model intended for academic research, educational purposes, and legitimate applications, such as personalized speech synthesis, assistive technologies, and linguistic research.
339
+
340
+ Please note:
341
+
342
+ - Do not use this model for unauthorized voice cloning, impersonation, fraud, scams, deepfakes, or any illegal activities.
343
+
344
+ - Ensure compliance with local laws and regulations when using this model and uphold ethical standards.
345
+
346
+ - The developers assume no liability for any misuse of this model.
347
+
348
+ We advocate for the responsible development and use of AI and encourage the community to uphold safety and ethical principles in AI research and applications. If you have any concerns regarding ethics or misuse, please contact us.
cli/SparkTTS.py ADDED
@@ -0,0 +1,236 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import re
17
+ import torch
18
+ from typing import Tuple
19
+ from pathlib import Path
20
+ from transformers import AutoTokenizer, AutoModelForCausalLM
21
+
22
+ from sparktts.utils.file import load_config
23
+ from sparktts.models.audio_tokenizer import BiCodecTokenizer
24
+ from sparktts.utils.token_parser import LEVELS_MAP, GENDER_MAP, TASK_TOKEN_MAP
25
+
26
+
27
+ class SparkTTS:
28
+ """
29
+ Spark-TTS for text-to-speech generation.
30
+ """
31
+
32
+ def __init__(self, model_dir: Path, device: torch.device = torch.device("cuda:0")):
33
+ """
34
+ Initializes the SparkTTS model with the provided configurations and device.
35
+
36
+ Args:
37
+ model_dir (Path): Directory containing the model and config files.
38
+ device (torch.device): The device (CPU/GPU) to run the model on.
39
+ """
40
+ self.device = device
41
+ self.model_dir = model_dir
42
+ self.configs = load_config(f"{model_dir}/config.yaml")
43
+ self.sample_rate = self.configs["sample_rate"]
44
+ self._initialize_inference()
45
+
46
+ def _initialize_inference(self):
47
+ """Initializes the tokenizer, model, and audio tokenizer for inference."""
48
+ self.tokenizer = AutoTokenizer.from_pretrained(f"{self.model_dir}/LLM")
49
+ self.model = AutoModelForCausalLM.from_pretrained(f"{self.model_dir}/LLM")
50
+ self.audio_tokenizer = BiCodecTokenizer(self.model_dir, device=self.device)
51
+ self.model.to(self.device)
52
+
53
+ def process_prompt(
54
+ self,
55
+ text: str,
56
+ prompt_speech_path: Path,
57
+ prompt_text: str = None,
58
+ ) -> Tuple[str, torch.Tensor]:
59
+ """
60
+ Process input for voice cloning.
61
+
62
+ Args:
63
+ text (str): The text input to be converted to speech.
64
+ prompt_speech_path (Path): Path to the audio file used as a prompt.
65
+ prompt_text (str, optional): Transcript of the prompt audio.
66
+
67
+ Returns:
68
+ Tuple[str, torch.Tensor]: Input prompt; global tokens
69
+ """
70
+
71
+ global_token_ids, semantic_token_ids = self.audio_tokenizer.tokenize(
72
+ prompt_speech_path
73
+ )
74
+ global_tokens = "".join(
75
+ [f"<|bicodec_global_{i}|>" for i in global_token_ids.squeeze()]
76
+ )
77
+
78
+ # Prepare the input tokens for the model
79
+ if prompt_text is not None:
80
+ semantic_tokens = "".join(
81
+ [f"<|bicodec_semantic_{i}|>" for i in semantic_token_ids.squeeze()]
82
+ )
83
+ inputs = [
84
+ TASK_TOKEN_MAP["tts"],
85
+ "<|start_content|>",
86
+ prompt_text,
87
+ text,
88
+ "<|end_content|>",
89
+ "<|start_global_token|>",
90
+ global_tokens,
91
+ "<|end_global_token|>",
92
+ "<|start_semantic_token|>",
93
+ semantic_tokens,
94
+ ]
95
+ else:
96
+ inputs = [
97
+ TASK_TOKEN_MAP["tts"],
98
+ "<|start_content|>",
99
+ text,
100
+ "<|end_content|>",
101
+ "<|start_global_token|>",
102
+ global_tokens,
103
+ "<|end_global_token|>",
104
+ ]
105
+
106
+ inputs = "".join(inputs)
107
+
108
+ return inputs, global_token_ids
109
+
110
+ def process_prompt_control(
111
+ self,
112
+ gender: str,
113
+ pitch: str,
114
+ speed: str,
115
+ text: str,
116
+ ):
117
+ """
118
+ Process input for voice creation.
119
+
120
+ Args:
121
+ gender (str): female | male.
122
+ pitch (str): very_low | low | moderate | high | very_high
123
+ speed (str): very_low | low | moderate | high | very_high
124
+ text (str): The text input to be converted to speech.
125
+
126
+ Return:
127
+ str: Input prompt
128
+ """
129
+ assert gender in GENDER_MAP.keys()
130
+ assert pitch in LEVELS_MAP.keys()
131
+ assert speed in LEVELS_MAP.keys()
132
+
133
+ gender_id = GENDER_MAP[gender]
134
+ pitch_level_id = LEVELS_MAP[pitch]
135
+ speed_level_id = LEVELS_MAP[speed]
136
+
137
+ pitch_label_tokens = f"<|pitch_label_{pitch_level_id}|>"
138
+ speed_label_tokens = f"<|speed_label_{speed_level_id}|>"
139
+ gender_tokens = f"<|gender_{gender_id}|>"
140
+
141
+ attribute_tokens = "".join(
142
+ [gender_tokens, pitch_label_tokens, speed_label_tokens]
143
+ )
144
+
145
+ control_tts_inputs = [
146
+ TASK_TOKEN_MAP["controllable_tts"],
147
+ "<|start_content|>",
148
+ text,
149
+ "<|end_content|>",
150
+ "<|start_style_label|>",
151
+ attribute_tokens,
152
+ "<|end_style_label|>",
153
+ ]
154
+
155
+ return "".join(control_tts_inputs)
156
+
157
+ @torch.no_grad()
158
+ def inference(
159
+ self,
160
+ text: str,
161
+ prompt_speech_path: Path = None,
162
+ prompt_text: str = None,
163
+ gender: str = None,
164
+ pitch: str = None,
165
+ speed: str = None,
166
+ temperature: float = 0.8,
167
+ top_k: float = 50,
168
+ top_p: float = 0.95,
169
+ ) -> torch.Tensor:
170
+ """
171
+ Performs inference to generate speech from text, incorporating prompt audio and/or text.
172
+
173
+ Args:
174
+ text (str): The text input to be converted to speech.
175
+ prompt_speech_path (Path): Path to the audio file used as a prompt.
176
+ prompt_text (str, optional): Transcript of the prompt audio.
177
+ gender (str): female | male.
178
+ pitch (str): very_low | low | moderate | high | very_high
179
+ speed (str): very_low | low | moderate | high | very_high
180
+ temperature (float, optional): Sampling temperature for controlling randomness. Default is 0.8.
181
+ top_k (float, optional): Top-k sampling parameter. Default is 50.
182
+ top_p (float, optional): Top-p (nucleus) sampling parameter. Default is 0.95.
183
+
184
+ Returns:
185
+ torch.Tensor: Generated waveform as a tensor.
186
+ """
187
+ if gender is not None:
188
+ prompt = self.process_prompt_control(gender, pitch, speed, text)
189
+
190
+ else:
191
+ prompt, global_token_ids = self.process_prompt(
192
+ text, prompt_speech_path, prompt_text
193
+ )
194
+ model_inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
195
+
196
+ # Generate speech using the model
197
+ generated_ids = self.model.generate(
198
+ **model_inputs,
199
+ max_new_tokens=3000,
200
+ do_sample=True,
201
+ top_k=top_k,
202
+ top_p=top_p,
203
+ temperature=temperature,
204
+ )
205
+
206
+ # Trim the output tokens to remove the input tokens
207
+ generated_ids = [
208
+ output_ids[len(input_ids) :]
209
+ for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
210
+ ]
211
+
212
+ # Decode the generated tokens into text
213
+ predicts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
214
+
215
+ # Extract semantic token IDs from the generated text
216
+ pred_semantic_ids = (
217
+ torch.tensor([int(token) for token in re.findall(r"bicodec_semantic_(\d+)", predicts)])
218
+ .long()
219
+ .unsqueeze(0)
220
+ )
221
+
222
+ if gender is not None:
223
+ global_token_ids = (
224
+ torch.tensor([int(token) for token in re.findall(r"bicodec_global_(\d+)", predicts)])
225
+ .long()
226
+ .unsqueeze(0)
227
+ .unsqueeze(0)
228
+ )
229
+
230
+ # Convert semantic tokens back to waveform
231
+ wav = self.audio_tokenizer.detokenize(
232
+ global_token_ids.to(self.device).squeeze(0),
233
+ pred_semantic_ids.to(self.device),
234
+ )
235
+
236
+ return wav
cli/inference.py ADDED
@@ -0,0 +1,116 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import os
18
+ import argparse
19
+ import torch
20
+ import soundfile as sf
21
+ import logging
22
+ from datetime import datetime
23
+ import platform
24
+
25
+ from cli.SparkTTS import SparkTTS
26
+
27
+
28
+ def parse_args():
29
+ """Parse command-line arguments."""
30
+ parser = argparse.ArgumentParser(description="Run TTS inference.")
31
+
32
+ parser.add_argument(
33
+ "--model_dir",
34
+ type=str,
35
+ default="pretrained_models/Spark-TTS-0.5B",
36
+ help="Path to the model directory",
37
+ )
38
+ parser.add_argument(
39
+ "--save_dir",
40
+ type=str,
41
+ default="example/results",
42
+ help="Directory to save generated audio files",
43
+ )
44
+ parser.add_argument("--device", type=int, default=0, help="CUDA device number")
45
+ parser.add_argument(
46
+ "--text", type=str, required=True, help="Text for TTS generation"
47
+ )
48
+ parser.add_argument("--prompt_text", type=str, help="Transcript of prompt audio")
49
+ parser.add_argument(
50
+ "--prompt_speech_path",
51
+ type=str,
52
+ help="Path to the prompt audio file",
53
+ )
54
+ parser.add_argument("--gender", choices=["male", "female"])
55
+ parser.add_argument(
56
+ "--pitch", choices=["very_low", "low", "moderate", "high", "very_high"]
57
+ )
58
+ parser.add_argument(
59
+ "--speed", choices=["very_low", "low", "moderate", "high", "very_high"]
60
+ )
61
+ return parser.parse_args()
62
+
63
+
64
+ def run_tts(args):
65
+ """Perform TTS inference and save the generated audio."""
66
+ logging.info(f"Using model from: {args.model_dir}")
67
+ logging.info(f"Saving audio to: {args.save_dir}")
68
+
69
+ # Ensure the save directory exists
70
+ os.makedirs(args.save_dir, exist_ok=True)
71
+
72
+ # Convert device argument to torch.device
73
+ if platform.system() == "Darwin" and torch.backends.mps.is_available():
74
+ # macOS with MPS support (Apple Silicon)
75
+ device = torch.device(f"mps:{args.device}")
76
+ logging.info(f"Using MPS device: {device}")
77
+ elif torch.cuda.is_available():
78
+ # System with CUDA support
79
+ device = torch.device(f"cuda:{args.device}")
80
+ logging.info(f"Using CUDA device: {device}")
81
+ else:
82
+ # Fall back to CPU
83
+ device = torch.device("cpu")
84
+ logging.info("GPU acceleration not available, using CPU")
85
+
86
+ # Initialize the model
87
+ model = SparkTTS(args.model_dir, device)
88
+
89
+ # Generate unique filename using timestamp
90
+ timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
91
+ save_path = os.path.join(args.save_dir, f"{timestamp}.wav")
92
+
93
+ logging.info("Starting inference...")
94
+
95
+ # Perform inference and save the output audio
96
+ with torch.no_grad():
97
+ wav = model.inference(
98
+ args.text,
99
+ args.prompt_speech_path,
100
+ prompt_text=args.prompt_text,
101
+ gender=args.gender,
102
+ pitch=args.pitch,
103
+ speed=args.speed,
104
+ )
105
+ sf.write(save_path, wav, samplerate=16000)
106
+
107
+ logging.info(f"Audio saved at: {save_path}")
108
+
109
+
110
+ if __name__ == "__main__":
111
+ logging.basicConfig(
112
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
113
+ )
114
+
115
+ args = parse_args()
116
+ run_tts(args)
example/infer.sh ADDED
@@ -0,0 +1,47 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright (c) 2025 SparkAudio
4
+ # 2025 Xinsheng Wang ([email protected])
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+
19
+ # Get the absolute path of the script's directory
20
+ script_dir=$(dirname "$(realpath "$0")")
21
+
22
+ # Get the root directory
23
+ root_dir=$(dirname "$script_dir")
24
+
25
+ # Set default parameters
26
+ device=0
27
+ save_dir='example/results'
28
+ model_dir="pretrained_models/Spark-TTS-0.5B"
29
+ text="身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。"
30
+ prompt_text="吃燕窝就选燕之屋,本节目由26年专注高品质燕窝的燕之屋冠名播出。豆奶牛奶换着喝,营养更均衡,本节目由豆本豆豆奶特约播出。"
31
+ prompt_speech_path="example/prompt_audio.wav"
32
+
33
+ # Change directory to the root directory
34
+ cd "$root_dir" || exit
35
+
36
+ source sparktts/utils/parse_options.sh
37
+
38
+ # Run inference
39
+ python -m cli.inference \
40
+ --text "${text}" \
41
+ --device "${device}" \
42
+ --save_dir "${save_dir}" \
43
+ --model_dir "${model_dir}" \
44
+ --prompt_text "${prompt_text}" \
45
+ --prompt_speech_path "${prompt_speech_path}"
46
+
47
+
example/prompt_audio.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:335e7f7789b231cd90d9670292d561ecfe6a6bdd5e737a7bc6c29730741852de
3
+ size 318550
requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ einops==0.8.1
2
+ einx==0.3.0
3
+ numpy==2.2.3
4
+ omegaconf==2.3.0
5
+ packaging==24.2
6
+ safetensors==0.5.2
7
+ soundfile==0.12.1
8
+ soxr==0.5.0.post1
9
+ tqdm==4.66.5
10
+ transformers==4.46.2
11
+ gradio==5.18.0
runtime/triton_trtllm/Dockerfile.server ADDED
@@ -0,0 +1,5 @@
1
+ FROM nvcr.io/nvidia/tritonserver:25.02-trtllm-python-py3
2
+ RUN apt-get update && apt-get install -y cmake
3
+ RUN git clone https://github.com/pytorch/audio.git && cd audio && git checkout c670ad8 && PATH=/usr/local/cuda/bin:$PATH python3 setup.py develop
4
+ RUN pip install einx==0.3.0 omegaconf==2.3.0 soundfile==0.12.1 soxr==0.5.0.post1 gradio tritonclient librosa
5
+ WORKDIR /workspace
runtime/triton_trtllm/README.md ADDED
@@ -0,0 +1,94 @@
1
+ ## Nvidia Triton Inference Serving Best Practice for Spark TTS
2
+
3
+ ### Quick Start
4
+ Directly launch the service using docker compose.
5
+ ```sh
6
+ docker compose up
7
+ ```
8
+
9
+ ### Build Image
10
+ Build the docker image from scratch.
11
+ ```sh
12
+ docker build . -f Dockerfile.server -t soar97/triton-spark-tts:25.02
13
+ ```
14
+
15
+ ### Create Docker Container
16
+ ```sh
17
+ your_mount_dir=/mnt:/mnt
18
+ docker run -it --name "spark-tts-server" --gpus all --net host -v $your_mount_dir --shm-size=2g soar97/triton-spark-tts:25.02
19
+ ```
20
+
21
+ ### Understanding `run.sh`
22
+
23
+ The `run.sh` script automates various steps using stages. You can run specific stages using:
24
+ ```sh
25
+ bash run.sh <start_stage> <stop_stage> [service_type]
26
+ ```
27
+ - `<start_stage>`: The stage to begin execution from (0-5).
28
+ - `<stop_stage>`: The stage to end execution at (0-5).
29
+ - `[service_type]`: Optional; specifies the service type ('streaming' or 'offline'). Required for stages 4 and 5; for other stages, defaults may apply based on the script logic.
30
+
31
+ Stages:
32
+ - **Stage 0**: Download Spark-TTS-0.5B model from HuggingFace.
33
+ - **Stage 1**: Convert HuggingFace checkpoint to TensorRT-LLM format and build TensorRT engines.
34
+ - **Stage 2**: Create the Triton model repository structure and configure model files (adjusts for streaming/offline).
35
+ - **Stage 3**: Launch the Triton Inference Server.
36
+ - **Stage 4**: Run the gRPC benchmark client.
37
+ - **Stage 5**: Run the single utterance client (gRPC for streaming, HTTP for offline).
38
+
39
+ ### Export Models to TensorRT-LLM and Launch Server
40
+ Inside the docker container, you can prepare the models and launch the Triton server by running stages 0 through 3. This involves downloading the original model, converting it to TensorRT-LLM format, building the optimized TensorRT engines, creating the necessary model repository structure for Triton, and finally starting the server.
41
+ ```sh
42
+ # This runs stages 0, 1, 2, and 3
43
+ bash run.sh 0 3
44
+ ```
45
+ *Note: Stage 2 prepares the model repository differently based on whether you intend to run streaming or offline inference later. You might need to re-run stage 2 if switching service types.*
46
+
47
+
48
+ ### Single Utterance Client
49
+ Run a single inference request. Specify `streaming` or `offline` as the third argument.
50
+
51
+ **Streaming Mode (gRPC):**
52
+ ```sh
53
+ bash run.sh 5 5 streaming
54
+ ```
55
+ This executes the `client_grpc.py` script with predefined example text and prompt audio in streaming mode.
56
+
57
+ **Offline Mode (HTTP):**
58
+ ```sh
59
+ bash run.sh 5 5 offline
60
+ ```
61
+
62
+ ### Benchmark using Dataset
63
+ Run the benchmark client against the running Triton server. Specify `streaming` or `offline` as the third argument.
64
+ ```sh
65
+ # Run benchmark in streaming mode
66
+ bash run.sh 4 4 streaming
67
+
68
+ # Run benchmark in offline mode
69
+ bash run.sh 4 4 offline
70
+
71
+ # You can also customize parameters like num_task directly in client_grpc.py or via args if supported
72
+ # Example from run.sh (streaming):
73
+ # python3 client_grpc.py \
74
+ # --server-addr localhost \
75
+ # --model-name spark_tts \
76
+ # --num-tasks 2 \
77
+ # --mode streaming \
78
+ # --log-dir ./log_concurrent_tasks_2_streaming_new
79
+
80
+ # Example customizing dataset (requires modifying client_grpc.py or adding args):
81
+ # python3 client_grpc.py --num-tasks 2 --huggingface-dataset yuekai/seed_tts --split-name wenetspeech4tts --mode [streaming|offline]
82
+ ```
83
+
84
+ ### Benchmark Results
85
+ Decoding on a single L20 GPU, using 26 different prompt_audio/target_text [pairs](https://huggingface.co/datasets/yuekai/seed_tts), total audio duration 169 secs.
86
+
87
+ | Mode | Note | Concurrency | Avg Latency | First Chunk Latency (P50) | RTF |
88
+ |-------|-----------|-----------------------|---------|----------------|-|
89
+ | Offline | [Code Commit](https://github.com/SparkAudio/Spark-TTS/tree/4d769ff782a868524f29e0be851ca64f8b22ebf1/runtime/triton_trtllm) | 1 | 876.24 ms |-| 0.1362|
90
+ | Offline | [Code Commit](https://github.com/SparkAudio/Spark-TTS/tree/4d769ff782a868524f29e0be851ca64f8b22ebf1/runtime/triton_trtllm) | 2 | 920.97 ms |-|0.0737|
91
+ | Offline | [Code Commit](https://github.com/SparkAudio/Spark-TTS/tree/4d769ff782a868524f29e0be851ca64f8b22ebf1/runtime/triton_trtllm) | 4 | 1611.51 ms |-| 0.0704|
92
+ | Streaming | [Code Commit](https://github.com/yuekaizhang/Spark-TTS/commit/0e978a327f99aa49f0735f86eb09372f16410d86) | 1 | 913.28 ms |210.42 ms| 0.1501 |
93
+ | Streaming | [Code Commit](https://github.com/yuekaizhang/Spark-TTS/commit/0e978a327f99aa49f0735f86eb09372f16410d86) | 2 | 1009.23 ms |226.08 ms |0.0862 |
94
+ | Streaming | [Code Commit](https://github.com/yuekaizhang/Spark-TTS/commit/0e978a327f99aa49f0735f86eb09372f16410d86) | 4 | 1793.86 ms |1017.70 ms| 0.0824 |
runtime/triton_trtllm/client_grpc.py ADDED
@@ -0,0 +1,831 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
3
+ # 2023 Nvidia (authors: Yuekai Zhang)
4
+ # 2023 Recurrent.ai (authors: Songtao Shi)
5
+ # See LICENSE for clarification regarding multiple authors
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ """
19
+ This script loads a dataset from the Hugging Face Hub and sends it to the server
+ for decoding, issuing multiple requests in parallel.
21
+
22
+ Usage:
23
+ num_task=2
24
+
25
+ # For offline F5-TTS
26
+ python3 client_grpc.py \
27
+ --server-addr localhost \
28
+ --model-name f5_tts \
29
+ --num-tasks $num_task \
30
+ --huggingface-dataset yuekai/seed_tts \
31
+ --split-name test_zh \
32
+ --log-dir ./log_concurrent_tasks_${num_task}
33
+
34
+ # For offline Spark-TTS-0.5B
35
+ python3 client_grpc.py \
36
+ --server-addr localhost \
37
+ --model-name spark_tts \
38
+ --num-tasks $num_task \
39
+ --huggingface-dataset yuekai/seed_tts \
40
+ --split-name wenetspeech4tts \
41
+ --log-dir ./log_concurrent_tasks_${num_task}
42
+ """
43
+
44
+ import argparse
45
+ import asyncio
46
+ import json
47
+ import queue # Added
48
+ import uuid # Added
49
+ import functools # Added
50
+
51
+ import os
52
+ import time
53
+ import types
54
+ from pathlib import Path
55
+
56
+ import numpy as np
57
+ import soundfile as sf
58
+ import tritonclient
59
+ import tritonclient.grpc.aio as grpcclient_aio # Renamed original import
60
+ import tritonclient.grpc as grpcclient_sync # Added sync client import
61
+ from tritonclient.utils import np_to_triton_dtype, InferenceServerException # Added InferenceServerException
62
+
63
+
64
+ # --- Added UserData and callback ---
65
+ class UserData:
66
+ def __init__(self):
67
+ self._completed_requests = queue.Queue()
68
+ self._first_chunk_time = None
69
+ self._start_time = None
70
+
71
+ def record_start_time(self):
72
+ self._start_time = time.time()
73
+
74
+ def get_first_chunk_latency(self):
75
+ if self._first_chunk_time and self._start_time:
76
+ return self._first_chunk_time - self._start_time
77
+ return None
78
+
79
+ def callback(user_data, result, error):
80
+ if user_data._first_chunk_time is None and not error:
81
+ user_data._first_chunk_time = time.time() # Record time of first successful chunk
82
+ if error:
83
+ user_data._completed_requests.put(error)
84
+ else:
85
+ user_data._completed_requests.put(result)
86
+ # --- End Added UserData and callback ---
87
+
88
+
89
+ def write_triton_stats(stats, summary_file):
90
+ with open(summary_file, "w") as summary_f:
91
+ model_stats = stats["model_stats"]
92
+ # Add a note explaining that the log comes from triton_client.get_inference_statistics().
+ summary_f.write(
+ "The log is parsed from triton_client.get_inference_statistics(), for better human readability. \n"
95
+ )
96
+ summary_f.write("To learn more about the log, please refer to: \n")
97
+ summary_f.write("1. https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md \n")
98
+ summary_f.write("2. https://github.com/triton-inference-server/server/issues/5374 \n\n")
99
+ summary_f.write(
100
+ "To better improve throughput, we always would like let requests wait in the queue for a while, and then execute them with a larger batch size. \n"
101
+ )
102
+ summary_f.write(
103
+ "However, there is a trade-off between the increased queue time and the increased batch size. \n"
104
+ )
105
+ summary_f.write(
106
+ "You may change 'max_queue_delay_microseconds' and 'preferred_batch_size' in the model configuration file to achieve this. \n"
107
+ )
108
+ summary_f.write(
109
+ "See https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#delayed-batching for more details. \n\n"
110
+ )
111
+ for model_state in model_stats:
112
+ if "last_inference" not in model_state:
113
+ continue
114
+ summary_f.write(f"model name is {model_state['name']} \n")
115
+ model_inference_stats = model_state["inference_stats"]
116
+ total_queue_time_s = int(model_inference_stats["queue"]["ns"]) / 1e9
117
+ total_infer_time_s = int(model_inference_stats["compute_infer"]["ns"]) / 1e9
118
+ total_input_time_s = int(model_inference_stats["compute_input"]["ns"]) / 1e9
119
+ total_output_time_s = int(model_inference_stats["compute_output"]["ns"]) / 1e9
120
+ summary_f.write(
121
+ f"queue time {total_queue_time_s:<5.2f} s, compute infer time {total_infer_time_s:<5.2f} s, compute input time {total_input_time_s:<5.2f} s, compute output time {total_output_time_s:<5.2f} s \n" # noqa
122
+ )
123
+ model_batch_stats = model_state["batch_stats"]
124
+ for batch in model_batch_stats:
125
+ batch_size = int(batch["batch_size"])
126
+ compute_input = batch["compute_input"]
127
+ compute_output = batch["compute_output"]
128
+ compute_infer = batch["compute_infer"]
129
+ batch_count = int(compute_infer["count"])
130
+ assert compute_infer["count"] == compute_output["count"] == compute_input["count"]
131
+ compute_infer_time_ms = int(compute_infer["ns"]) / 1e6
132
+ compute_input_time_ms = int(compute_input["ns"]) / 1e6
133
+ compute_output_time_ms = int(compute_output["ns"]) / 1e6
134
+ summary_f.write(
135
+ f"execuate inference with batch_size {batch_size:<2} total {batch_count:<5} times, total_infer_time {compute_infer_time_ms:<9.2f} ms, avg_infer_time {compute_infer_time_ms:<9.2f}/{batch_count:<5}={compute_infer_time_ms / batch_count:.2f} ms, avg_infer_time_per_sample {compute_infer_time_ms:<9.2f}/{batch_count:<5}/{batch_size}={compute_infer_time_ms / batch_count / batch_size:.2f} ms \n" # noqa
136
+ )
137
+ summary_f.write(
138
+ f"input {compute_input_time_ms:<9.2f} ms, avg {compute_input_time_ms / batch_count:.2f} ms, " # noqa
139
+ )
140
+ summary_f.write(
141
+ f"output {compute_output_time_ms:<9.2f} ms, avg {compute_output_time_ms / batch_count:.2f} ms \n" # noqa
142
+ )
143
+
144
+
145
+ def get_args():
146
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
147
+
148
+ parser.add_argument(
149
+ "--server-addr",
150
+ type=str,
151
+ default="localhost",
152
+ help="Address of the server",
153
+ )
154
+
155
+ parser.add_argument(
156
+ "--server-port",
157
+ type=int,
158
+ default=8001,
159
+ help="Grpc port of the triton server, default is 8001",
160
+ )
161
+
162
+ parser.add_argument(
163
+ "--reference-audio",
164
+ type=str,
165
+ default=None,
166
+ help="Path to a single audio file. It can't be specified at the same time with --manifest-dir",
167
+ )
168
+
169
+ parser.add_argument(
170
+ "--reference-text",
171
+ type=str,
172
+ default="",
173
+ help="",
174
+ )
175
+
176
+ parser.add_argument(
177
+ "--target-text",
178
+ type=str,
179
+ default="",
180
+ help="",
181
+ )
182
+
183
+ parser.add_argument(
184
+ "--huggingface-dataset",
185
+ type=str,
186
+ default="yuekai/seed_tts",
187
+ help="dataset name in huggingface dataset hub",
188
+ )
189
+
190
+ parser.add_argument(
191
+ "--split-name",
192
+ type=str,
193
+ default="wenetspeech4tts",
194
+ choices=["wenetspeech4tts", "test_zh", "test_en", "test_hard"],
195
+ help="dataset split name, default is 'test'",
196
+ )
197
+
198
+ parser.add_argument(
199
+ "--manifest-path",
200
+ type=str,
201
+ default=None,
202
+ help="Path to the manifest dir which includes wav.scp trans.txt files.",
203
+ )
204
+
205
+ parser.add_argument(
206
+ "--model-name",
207
+ type=str,
208
+ default="f5_tts",
209
+ choices=["f5_tts", "spark_tts"],
210
+ help="triton model_repo module name to request: transducer for k2, attention_rescoring for wenet offline, streaming_wenet for wenet streaming, infer_pipeline for paraformer large offline",
211
+ )
212
+
213
+ parser.add_argument(
214
+ "--num-tasks",
215
+ type=int,
216
+ default=1,
217
+ help="Number of concurrent tasks for sending",
218
+ )
219
+
220
+ parser.add_argument(
221
+ "--log-interval",
222
+ type=int,
223
+ default=5,
224
+ help="Controls how frequently we print the log.",
225
+ )
226
+
227
+ parser.add_argument(
228
+ "--compute-wer",
229
+ action="store_true",
230
+ default=False,
231
+ help="""True to compute WER.
232
+ """,
233
+ )
234
+
235
+ parser.add_argument(
236
+ "--log-dir",
237
+ type=str,
238
+ required=False,
239
+ default="./tmp",
240
+ help="log directory",
241
+ )
242
+
243
+ # --- Added arguments ---
244
+ parser.add_argument(
245
+ "--mode",
246
+ type=str,
247
+ default="offline",
248
+ choices=["offline", "streaming"],
249
+ help="Select offline or streaming benchmark mode."
250
+ )
251
+ parser.add_argument(
252
+ "--chunk-overlap-duration",
253
+ type=float,
254
+ default=0.1,
255
+ help="Chunk overlap duration for streaming reconstruction (in seconds)."
256
+ )
257
+ # --- End Added arguments ---
258
+
259
+ return parser.parse_args()
260
+
261
+
262
+ def load_audio(wav_path, target_sample_rate=16000):
263
+ assert target_sample_rate == 16000, "hard coding in server"
264
+ if isinstance(wav_path, dict):
265
+ waveform = wav_path["array"]
266
+ sample_rate = wav_path["sampling_rate"]
267
+ else:
268
+ waveform, sample_rate = sf.read(wav_path)
269
+ if sample_rate != target_sample_rate:
270
+ from scipy.signal import resample
271
+
272
+ num_samples = int(len(waveform) * (target_sample_rate / sample_rate))
273
+ waveform = resample(waveform, num_samples)
274
+ return waveform, target_sample_rate
275
+
276
+ def prepare_request_input_output(
277
+ protocol_client, # Can be grpcclient_aio or grpcclient_sync
278
+ waveform,
279
+ reference_text,
280
+ target_text,
281
+ sample_rate=16000,
282
+ padding_duration: int = None # Optional padding for offline mode
283
+ ):
284
+ """Prepares inputs for Triton inference (offline or streaming)."""
285
+ assert len(waveform.shape) == 1, "waveform should be 1D"
286
+ lengths = np.array([[len(waveform)]], dtype=np.int32)
287
+
288
+ # Apply padding only if padding_duration is provided (for offline)
289
+ if padding_duration:
290
+ duration = len(waveform) / sample_rate
291
+ # Estimate target duration based on text length ratio (crude estimation)
292
+ # Avoid division by zero if reference_text is empty
293
+ if reference_text:
294
+ estimated_target_duration = duration / len(reference_text) * len(target_text)
295
+ else:
296
+ estimated_target_duration = duration # Assume target duration similar to reference if no text
297
+
298
+ # Calculate required samples based on estimated total duration
299
+ required_total_samples = padding_duration * sample_rate * (
300
+ (int(estimated_target_duration + duration) // padding_duration) + 1
301
+ )
302
+ samples = np.zeros((1, required_total_samples), dtype=np.float32)
303
+ samples[0, : len(waveform)] = waveform
304
+ else:
305
+ # No padding for streaming or if padding_duration is None
306
+ samples = waveform.reshape(1, -1).astype(np.float32)
307
+
308
+ # Common input creation logic
309
+ inputs = [
310
+ protocol_client.InferInput("reference_wav", samples.shape, np_to_triton_dtype(samples.dtype)),
311
+ protocol_client.InferInput(
312
+ "reference_wav_len", lengths.shape, np_to_triton_dtype(lengths.dtype)
313
+ ),
314
+ protocol_client.InferInput("reference_text", [1, 1], "BYTES"),
315
+ protocol_client.InferInput("target_text", [1, 1], "BYTES"),
316
+ ]
317
+ inputs[0].set_data_from_numpy(samples)
318
+ inputs[1].set_data_from_numpy(lengths)
319
+
320
+ input_data_numpy = np.array([reference_text], dtype=object)
321
+ input_data_numpy = input_data_numpy.reshape((1, 1))
322
+ inputs[2].set_data_from_numpy(input_data_numpy)
323
+
324
+ input_data_numpy = np.array([target_text], dtype=object)
325
+ input_data_numpy = input_data_numpy.reshape((1, 1))
326
+ inputs[3].set_data_from_numpy(input_data_numpy)
327
+
328
+ outputs = [protocol_client.InferRequestedOutput("waveform")]
329
+
330
+ return inputs, outputs
331
+
332
+ def run_sync_streaming_inference(
333
+ sync_triton_client: tritonclient.grpc.InferenceServerClient,
334
+ model_name: str,
335
+ inputs: list,
336
+ outputs: list,
337
+ request_id: str,
338
+ user_data: UserData,
339
+ chunk_overlap_duration: float,
340
+ save_sample_rate: int,
341
+ audio_save_path: str,
342
+ ):
343
+ """Helper function to run the blocking sync streaming call."""
344
+ start_time_total = time.time()
345
+ user_data.record_start_time() # Record start time for first chunk latency calculation
346
+
347
+ # Establish stream
348
+ sync_triton_client.start_stream(callback=functools.partial(callback, user_data))
349
+
350
+ # Send request
351
+ sync_triton_client.async_stream_infer(
352
+ model_name,
353
+ inputs,
354
+ request_id=request_id,
355
+ outputs=outputs,
356
+ enable_empty_final_response=True,
357
+ )
358
+
359
+ # Process results
360
+ audios = []
361
+ while True:
362
+ try:
363
+ result = user_data._completed_requests.get(timeout=60)  # timeout (60 s is an arbitrary choice) so the queue.Empty handler below can fire
364
+ if isinstance(result, InferenceServerException):
365
+ print(f"Received InferenceServerException: {result}")
366
+ sync_triton_client.stop_stream()
367
+ return None, None, None # Indicate error
368
+ # Get response metadata
369
+ response = result.get_response()
370
+ final = response.parameters["triton_final_response"].bool_param
371
+ if final is True:
372
+ break
373
+
374
+ audio_chunk = result.as_numpy("waveform").reshape(-1)
375
+ if audio_chunk.size > 0: # Only append non-empty chunks
376
+ audios.append(audio_chunk)
377
+ else:
378
+ print("Warning: received empty audio chunk.")
379
+
380
+ except queue.Empty:
381
+ print(f"Timeout waiting for response for request id {request_id}")
382
+ sync_triton_client.stop_stream()
383
+ return None, None, None # Indicate error
384
+
385
+ sync_triton_client.stop_stream()
386
+ end_time_total = time.time()
387
+ total_request_latency = end_time_total - start_time_total
388
+ first_chunk_latency = user_data.get_first_chunk_latency()
389
+
390
+ # Reconstruct audio using cross-fade (from client_grpc_streaming.py)
391
+ actual_duration = 0
392
+ if audios:
393
+ cross_fade_samples = int(chunk_overlap_duration * save_sample_rate)
394
+ fade_out = np.linspace(1, 0, cross_fade_samples)
395
+ fade_in = np.linspace(0, 1, cross_fade_samples)
396
+ reconstructed_audio = None
397
+
398
+ # Simplified reconstruction based on client_grpc_streaming.py
399
+ if not audios:
400
+ print("Warning: No audio chunks received.")
401
+ reconstructed_audio = np.array([], dtype=np.float32) # Empty array
402
+ elif len(audios) == 1:
403
+ reconstructed_audio = audios[0]
404
+ else:
405
+ reconstructed_audio = audios[0][:-cross_fade_samples] # Start with first chunk minus overlap
406
+ for i in range(1, len(audios)):
407
+ # Cross-fade section
408
+ cross_faded_overlap = (audios[i][:cross_fade_samples] * fade_in +
409
+ audios[i - 1][-cross_fade_samples:] * fade_out)
410
+ # Middle section of the current chunk
411
+ middle_part = audios[i][cross_fade_samples:-cross_fade_samples]
412
+ # Concatenate
413
+ reconstructed_audio = np.concatenate([reconstructed_audio, cross_faded_overlap, middle_part])
414
+ # Add the last part of the final chunk
415
+ reconstructed_audio = np.concatenate([reconstructed_audio, audios[-1][-cross_fade_samples:]])
416
+
417
+ if reconstructed_audio is not None and reconstructed_audio.size > 0:
418
+ actual_duration = len(reconstructed_audio) / save_sample_rate
419
+ # Save reconstructed audio
420
+ os.makedirs(os.path.dirname(audio_save_path), exist_ok=True)
421
+ sf.write(audio_save_path, reconstructed_audio, save_sample_rate, "PCM_16")
422
+ else:
423
+ print("Warning: No audio chunks received or reconstructed.")
424
+ actual_duration = 0 # Set duration to 0 if no audio
425
+
426
+ else:
427
+ print("Warning: No audio chunks received.")
428
+ actual_duration = 0
429
+
430
+ return total_request_latency, first_chunk_latency, actual_duration
431
+
432
+
433
+ async def send_streaming(
434
+ manifest_item_list: list,
435
+ name: str,
436
+ server_url: str, # Changed from sync_triton_client
437
+ protocol_client: types.ModuleType,
438
+ log_interval: int,
439
+ model_name: str,
440
+ audio_save_dir: str = "./",
441
+ save_sample_rate: int = 16000,
442
+ chunk_overlap_duration: float = 0.1,
443
+ padding_duration: int = None,
444
+ ):
445
+ total_duration = 0.0
446
+ latency_data = []
447
+ task_id = int(name[5:])
448
+ sync_triton_client = None # Initialize client variable
449
+
450
+ try: # Wrap in try...finally to ensure client closing
451
+ print(f"{name}: Initializing sync client for streaming...")
452
+ sync_triton_client = grpcclient_sync.InferenceServerClient(url=server_url, verbose=False) # Create client here
453
+
454
+ print(f"{name}: Starting streaming processing for {len(manifest_item_list)} items.")
455
+ for i, item in enumerate(manifest_item_list):
456
+ if i % log_interval == 0:
457
+ print(f"{name}: Processing item {i}/{len(manifest_item_list)}")
458
+
459
+ try:
460
+ waveform, sample_rate = load_audio(item["audio_filepath"], target_sample_rate=16000)
461
+ reference_text, target_text = item["reference_text"], item["target_text"]
462
+
463
+ inputs, outputs = prepare_request_input_output(
464
+ protocol_client,
465
+ waveform,
466
+ reference_text,
467
+ target_text,
468
+ sample_rate,
469
+ padding_duration=padding_duration
470
+ )
471
+ request_id = str(uuid.uuid4())
472
+ user_data = UserData()
473
+
474
+ audio_save_path = os.path.join(audio_save_dir, f"{item['target_audio_path']}.wav")
475
+
476
+ total_request_latency, first_chunk_latency, actual_duration = await asyncio.to_thread(
477
+ run_sync_streaming_inference,
478
+ sync_triton_client,
479
+ model_name,
480
+ inputs,
481
+ outputs,
482
+ request_id,
483
+ user_data,
484
+ chunk_overlap_duration,
485
+ save_sample_rate,
486
+ audio_save_path
487
+ )
488
+
489
+ if total_request_latency is not None:
490
+ print(f"{name}: Item {i} - First Chunk Latency: {first_chunk_latency:.4f}s, Total Latency: {total_request_latency:.4f}s, Duration: {actual_duration:.4f}s")
491
+ latency_data.append((total_request_latency, first_chunk_latency, actual_duration))
492
+ total_duration += actual_duration
493
+ else:
494
+ print(f"{name}: Item {i} failed.")
495
+
496
+
497
+ except FileNotFoundError:
498
+ print(f"Error: Audio file not found for item {i}: {item['audio_filepath']}")
499
+ except Exception as e:
500
+ print(f"Error processing item {i} ({item['target_audio_path']}): {e}")
501
+ import traceback
502
+ traceback.print_exc()
503
+
504
+
505
+ finally: # Ensure client is closed
506
+ if sync_triton_client:
507
+ try:
508
+ print(f"{name}: Closing sync client...")
509
+ sync_triton_client.close()
510
+ except Exception as e:
511
+ print(f"{name}: Error closing sync client: {e}")
512
+
513
+
514
+ print(f"{name}: Finished streaming processing. Total duration synthesized: {total_duration:.4f}s")
515
+ return total_duration, latency_data
516
+
517
+ async def send(
518
+ manifest_item_list: list,
519
+ name: str,
520
+ triton_client: tritonclient.grpc.aio.InferenceServerClient,
521
+ protocol_client: types.ModuleType,
522
+ log_interval: int,
523
+ model_name: str,
524
+ padding_duration: int = None,
525
+ audio_save_dir: str = "./",
526
+ save_sample_rate: int = 16000,
527
+ ):
528
+ total_duration = 0.0
529
+ latency_data = []
530
+ task_id = int(name[5:])
531
+
532
+ print(f"manifest_item_list: {manifest_item_list}")
533
+ for i, item in enumerate(manifest_item_list):
534
+ if i % log_interval == 0:
535
+ print(f"{name}: {i}/{len(manifest_item_list)}")
536
+ waveform, sample_rate = load_audio(item["audio_filepath"], target_sample_rate=16000)
537
+ reference_text, target_text = item["reference_text"], item["target_text"]
538
+
539
+ inputs, outputs = prepare_request_input_output(
540
+ protocol_client,
541
+ waveform,
542
+ reference_text,
543
+ target_text,
544
+ sample_rate,
545
+ padding_duration=padding_duration
546
+ )
547
+ sequence_id = 100000000 + i + task_id * 10
548
+ start = time.time()
549
+ response = await triton_client.infer(model_name, inputs, request_id=str(sequence_id), outputs=outputs)
550
+
551
+ audio = response.as_numpy("waveform").reshape(-1)
552
+ actual_duration = len(audio) / save_sample_rate
553
+
554
+ end = time.time() - start
555
+
556
+ audio_save_path = os.path.join(audio_save_dir, f"{item['target_audio_path']}.wav")
557
+ sf.write(audio_save_path, audio, save_sample_rate, "PCM_16")
558
+
559
+ latency_data.append((end, actual_duration))
560
+ total_duration += actual_duration
561
+
562
+ return total_duration, latency_data
563
+
564
+
565
+ def load_manifests(manifest_path):
566
+ with open(manifest_path, "r") as f:
567
+ manifest_list = []
568
+ for line in f:
569
+ assert len(line.strip().split("|")) == 4
570
+ utt, prompt_text, prompt_wav, gt_text = line.strip().split("|")
571
+ utt = Path(utt).stem
572
+ # gt_wav = os.path.join(os.path.dirname(manifest_path), "wavs", utt + ".wav")
573
+ if not os.path.isabs(prompt_wav):
574
+ prompt_wav = os.path.join(os.path.dirname(manifest_path), prompt_wav)
575
+ manifest_list.append(
576
+ {
577
+ "audio_filepath": prompt_wav,
578
+ "reference_text": prompt_text,
579
+ "target_text": gt_text,
580
+ "target_audio_path": utt,
581
+ }
582
+ )
583
+ return manifest_list
584
+
585
+
586
+ def split_data(data, k):
587
+ n = len(data)
588
+ if n < k:
589
+ print(f"Warning: the length of the input list ({n}) is less than k ({k}). Setting k to {n}.")
590
+ k = n
591
+
592
+ quotient = n // k
593
+ remainder = n % k
594
+
595
+ result = []
596
+ start = 0
597
+ for i in range(k):
598
+ if i < remainder:
599
+ end = start + quotient + 1
600
+ else:
601
+ end = start + quotient
602
+
603
+ result.append(data[start:end])
604
+ start = end
605
+
606
+ return result
607
+
608
+ async def main():
609
+ args = get_args()
610
+ url = f"{args.server_addr}:{args.server_port}"
611
+
612
+ # --- Client Initialization based on mode ---
613
+ triton_client = None
614
+ protocol_client = None
615
+ if args.mode == "offline":
616
+ print("Initializing gRPC client for offline mode...")
617
+ # Use the async client for offline tasks
618
+ triton_client = grpcclient_aio.InferenceServerClient(url=url, verbose=False)
619
+ protocol_client = grpcclient_aio
620
+ elif args.mode == "streaming":
621
+ print("Initializing gRPC client for streaming mode...")
622
+ # Use the sync client for streaming tasks, handled via asyncio.to_thread
623
+ # We will create one sync client instance PER TASK inside send_streaming.
624
+ # triton_client = grpcclient_sync.InferenceServerClient(url=url, verbose=False) # REMOVED: Client created per task now
625
+ protocol_client = grpcclient_sync # protocol client for input prep
626
+ else:
627
+ raise ValueError(f"Invalid mode: {args.mode}")
628
+ # --- End Client Initialization ---
629
+
630
+ if args.reference_audio:
631
+ args.num_tasks = 1
632
+ args.log_interval = 1
633
+ manifest_item_list = [
634
+ {
635
+ "reference_text": args.reference_text,
636
+ "target_text": args.target_text,
637
+ "audio_filepath": args.reference_audio,
638
+ "target_audio_path": "test",
639
+ }
640
+ ]
641
+ elif args.huggingface_dataset:
642
+ import datasets
643
+
644
+ dataset = datasets.load_dataset(
645
+ args.huggingface_dataset,
646
+ split=args.split_name,
647
+ trust_remote_code=True,
648
+ )
649
+ manifest_item_list = []
650
+ for i in range(len(dataset)):
651
+ manifest_item_list.append(
652
+ {
653
+ "audio_filepath": dataset[i]["prompt_audio"],
654
+ "reference_text": dataset[i]["prompt_text"],
655
+ "target_audio_path": dataset[i]["id"],
656
+ "target_text": dataset[i]["target_text"],
657
+ }
658
+ )
659
+ else:
660
+ manifest_item_list = load_manifests(args.manifest_path)
661
+
662
+ num_tasks = min(args.num_tasks, len(manifest_item_list))
663
+ manifest_item_list = split_data(manifest_item_list, num_tasks)
664
+
665
+ os.makedirs(args.log_dir, exist_ok=True)
666
+ tasks = []
667
+ start_time = time.time()
668
+ for i in range(num_tasks):
669
+ # --- Task Creation based on mode ---
670
+ if args.mode == "offline":
671
+ task = asyncio.create_task(
672
+ send(
673
+ manifest_item_list[i],
674
+ name=f"task-{i}",
675
+ triton_client=triton_client,
676
+ protocol_client=protocol_client,
677
+ log_interval=args.log_interval,
678
+ model_name=args.model_name,
679
+ audio_save_dir=args.log_dir,
680
+ padding_duration=1,
681
+ save_sample_rate=24000 if args.model_name == "f5_tts" else 16000,
682
+ )
683
+ )
684
+ elif args.mode == "streaming":
685
+ task = asyncio.create_task(
686
+ send_streaming(
687
+ manifest_item_list[i],
688
+ name=f"task-{i}",
689
+ server_url=url, # Pass URL instead of client
690
+ protocol_client=protocol_client,
691
+ log_interval=args.log_interval,
692
+ model_name=args.model_name,
693
+ audio_save_dir=args.log_dir,
694
+ padding_duration=10,
695
+ save_sample_rate=24000 if args.model_name == "f5_tts" else 16000,
696
+ chunk_overlap_duration=args.chunk_overlap_duration,
697
+ )
698
+ )
699
+ # --- End Task Creation ---
700
+ tasks.append(task)
701
+
702
+ ans_list = await asyncio.gather(*tasks)
703
+
704
+ end_time = time.time()
705
+ elapsed = end_time - start_time
706
+
707
+ total_duration = 0.0
708
+ latency_data = []
709
+ for ans in ans_list:
710
+ if ans:
711
+ total_duration += ans[0]
712
+ latency_data.extend(ans[1]) # Use extend for list of lists
713
+ else:
714
+ print("Warning: A task returned None, possibly due to an error.")
715
+
716
+
717
+ if total_duration == 0:
718
+ print("Total synthesized duration is zero. Cannot calculate RTF or latency percentiles.")
719
+ rtf = float('inf')
720
+ else:
721
+ rtf = elapsed / total_duration
722
+
723
+ s = f"Mode: {args.mode}\n"
724
+ s += f"RTF: {rtf:.4f}\n"
725
+ s += f"total_duration: {total_duration:.3f} seconds\n"
726
+ s += f"({total_duration / 3600:.2f} hours)\n"
727
+ s += f"processing time: {elapsed:.3f} seconds ({elapsed / 3600:.2f} hours)\n"
728
+
729
+ # --- Statistics Reporting based on mode ---
730
+ if latency_data:
731
+ if args.mode == "offline":
732
+ # Original offline latency calculation
733
+ latency_list = [chunk_end for (chunk_end, chunk_duration) in latency_data]
734
+ if latency_list:
735
+ latency_ms = sum(latency_list) / float(len(latency_list)) * 1000.0
736
+ latency_variance = np.var(latency_list, dtype=np.float64) * 1000.0
737
+ s += f"latency_variance: {latency_variance:.2f}\n"
738
+ s += f"latency_50_percentile_ms: {np.percentile(latency_list, 50) * 1000.0:.2f}\n"
739
+ s += f"latency_90_percentile_ms: {np.percentile(latency_list, 90) * 1000.0:.2f}\n"
740
+ s += f"latency_95_percentile_ms: {np.percentile(latency_list, 95) * 1000.0:.2f}\n"
741
+ s += f"latency_99_percentile_ms: {np.percentile(latency_list, 99) * 1000.0:.2f}\n"
742
+ s += f"average_latency_ms: {latency_ms:.2f}\n"
743
+ else:
744
+ s += "No latency data collected for offline mode.\n"
745
+
746
+ elif args.mode == "streaming":
747
+ # Calculate stats for total request latency and first chunk latency
748
+ total_latency_list = [total for (total, first, duration) in latency_data if total is not None]
749
+ first_chunk_latency_list = [first for (total, first, duration) in latency_data if first is not None]
750
+
751
+ s += "\n--- Total Request Latency ---\n"
752
+ if total_latency_list:
753
+ avg_total_latency_ms = sum(total_latency_list) / len(total_latency_list) * 1000.0
754
+ variance_total_latency = np.var(total_latency_list, dtype=np.float64) * 1000.0
755
+ s += f"total_request_latency_variance: {variance_total_latency:.2f}\n"
756
+ s += f"total_request_latency_50_percentile_ms: {np.percentile(total_latency_list, 50) * 1000.0:.2f}\n"
757
+ s += f"total_request_latency_90_percentile_ms: {np.percentile(total_latency_list, 90) * 1000.0:.2f}\n"
758
+ s += f"total_request_latency_95_percentile_ms: {np.percentile(total_latency_list, 95) * 1000.0:.2f}\n"
759
+ s += f"total_request_latency_99_percentile_ms: {np.percentile(total_latency_list, 99) * 1000.0:.2f}\n"
760
+ s += f"average_total_request_latency_ms: {avg_total_latency_ms:.2f}\n"
761
+ else:
762
+ s += "No total request latency data collected.\n"
763
+
764
+ s += "\n--- First Chunk Latency ---\n"
765
+ if first_chunk_latency_list:
766
+ avg_first_chunk_latency_ms = sum(first_chunk_latency_list) / len(first_chunk_latency_list) * 1000.0
767
+ variance_first_chunk_latency = np.var(first_chunk_latency_list, dtype=np.float64) * 1000.0
768
+ s += f"first_chunk_latency_variance: {variance_first_chunk_latency:.2f}\n"
769
+ s += f"first_chunk_latency_50_percentile_ms: {np.percentile(first_chunk_latency_list, 50) * 1000.0:.2f}\n"
770
+ s += f"first_chunk_latency_90_percentile_ms: {np.percentile(first_chunk_latency_list, 90) * 1000.0:.2f}\n"
771
+ s += f"first_chunk_latency_95_percentile_ms: {np.percentile(first_chunk_latency_list, 95) * 1000.0:.2f}\n"
772
+ s += f"first_chunk_latency_99_percentile_ms: {np.percentile(first_chunk_latency_list, 99) * 1000.0:.2f}\n"
773
+ s += f"average_first_chunk_latency_ms: {avg_first_chunk_latency_ms:.2f}\n"
774
+ else:
775
+ s += "No first chunk latency data collected (check for errors or if all requests failed before first chunk).\n"
776
+ else:
777
+ s += "No latency data collected.\n"
778
+ # --- End Statistics Reporting ---
779
+
780
+ print(s)
781
+ if args.manifest_path:
782
+ name = Path(args.manifest_path).stem
783
+ elif args.split_name:
784
+ name = args.split_name
785
+ elif args.reference_audio:
786
+ name = Path(args.reference_audio).stem
787
+ else:
788
+ name = "results" # Default name if no manifest/split/audio provided
789
+ with open(f"{args.log_dir}/rtf-{name}.txt", "w") as f:
790
+ f.write(s)
791
+
792
+ # --- Statistics Fetching using temporary Async Client ---
793
+ # Use a separate async client for fetching stats regardless of mode
794
+ stats_client = None
795
+ try:
796
+ print("Initializing temporary async client for fetching stats...")
797
+ stats_client = grpcclient_aio.InferenceServerClient(url=url, verbose=False)
798
+ print("Fetching inference statistics...")
799
+ # Fetching for all models, filtering might be needed depending on server setup
800
+ stats = await stats_client.get_inference_statistics(model_name="", as_json=True)
801
+ print("Fetching model config...")
802
+ metadata = await stats_client.get_model_config(model_name=args.model_name, as_json=True)
803
+
804
+ write_triton_stats(stats, f"{args.log_dir}/stats_summary-{name}.txt")
805
+
806
+ with open(f"{args.log_dir}/model_config-{name}.json", "w") as f:
807
+ json.dump(metadata, f, indent=4)
808
+
809
+ except Exception as e:
810
+ print(f"Could not retrieve statistics or config: {e}")
811
+ finally:
812
+ if stats_client:
813
+ try:
814
+ print("Closing temporary async stats client...")
815
+ await stats_client.close()
816
+ except Exception as e:
817
+ print(f"Error closing async stats client: {e}")
818
+ # --- End Statistics Fetching ---
819
+
820
+
821
+ if __name__ == "__main__":
822
+ # asyncio.run(main()) # Use TaskGroup for better exception handling if needed
823
+ async def run_main():
824
+ try:
825
+ await main()
826
+ except Exception as e:
827
+ print(f"An error occurred in main: {e}")
828
+ import traceback
829
+ traceback.print_exc()
830
+
831
+ asyncio.run(run_main())
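For quick manual checks outside this benchmark harness, a single offline request can be issued with the synchronous gRPC client along the lines of the sketch below. It mirrors `prepare_request_input_output()` above; the server address, file path, and texts are placeholders, not values from this repository.

```python
# Minimal single-request sketch (offline mode) against the spark_tts pipeline.
# Mirrors prepare_request_input_output() above; URL, path, and texts are placeholders.
import numpy as np
import soundfile as sf
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

client = grpcclient.InferenceServerClient(url="localhost:8001")

waveform, sr = sf.read("prompt.wav")  # expected to be 16 kHz mono
samples = waveform.reshape(1, -1).astype(np.float32)
lengths = np.array([[samples.shape[1]]], dtype=np.int32)

inputs = [
    grpcclient.InferInput("reference_wav", samples.shape, np_to_triton_dtype(samples.dtype)),
    grpcclient.InferInput("reference_wav_len", lengths.shape, np_to_triton_dtype(lengths.dtype)),
    grpcclient.InferInput("reference_text", [1, 1], "BYTES"),
    grpcclient.InferInput("target_text", [1, 1], "BYTES"),
]
inputs[0].set_data_from_numpy(samples)
inputs[1].set_data_from_numpy(lengths)
inputs[2].set_data_from_numpy(np.array([["transcript of the prompt audio"]], dtype=object))
inputs[3].set_data_from_numpy(np.array([["text to synthesize"]], dtype=object))

result = client.infer("spark_tts", inputs,
                      outputs=[grpcclient.InferRequestedOutput("waveform")])
audio = result.as_numpy("waveform").reshape(-1)
sf.write("output.wav", audio, 16000, "PCM_16")
```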
runtime/triton_trtllm/client_http.py ADDED
@@ -0,0 +1,165 @@
1
+ # Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ import requests
27
+ import soundfile as sf
28
+ import json
29
+ import numpy as np
30
+ import argparse
31
+
32
+ def get_args():
33
+ parser = argparse.ArgumentParser(
34
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
35
+ )
36
+
37
+ parser.add_argument(
38
+ "--server-url",
39
+ type=str,
40
+ default="localhost:8000",
41
+ help="Address of the server",
42
+ )
43
+
44
+ parser.add_argument(
45
+ "--reference-audio",
46
+ type=str,
47
+ default="../../example/prompt_audio.wav",
48
+ help="Path to a single audio file. It can't be specified at the same time with --manifest-dir",
49
+ )
50
+
51
+ parser.add_argument(
52
+ "--reference-text",
53
+ type=str,
54
+ default="吃燕窝就选燕之屋,本节目由26年专注高品质燕窝的燕之屋冠名播出。豆奶牛奶换着喝,营养更均衡,本节目由豆本豆豆奶特约播出。",
55
+ help="",
56
+ )
57
+
58
+ parser.add_argument(
59
+ "--target-text",
60
+ type=str,
61
+ default="身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。",
62
+ help="",
63
+ )
64
+
65
+ parser.add_argument(
66
+ "--model-name",
67
+ type=str,
68
+ default="spark_tts",
69
+ choices=[
70
+ "f5_tts", "spark_tts"
71
+ ],
72
+ help="triton model_repo module name to request: transducer for k2, attention_rescoring for wenet offline, streaming_wenet for wenet streaming, infer_pipeline for paraformer large offline",
73
+ )
74
+
75
+ parser.add_argument(
76
+ "--output-audio",
77
+ type=str,
78
+ default="output.wav",
79
+ help="Path to save the output audio",
80
+ )
81
+ return parser.parse_args()
82
+
83
+ def prepare_request(
84
+ waveform,
85
+ reference_text,
86
+ target_text,
87
+ sample_rate=16000,
88
+ padding_duration: int = None,
89
+ audio_save_dir: str = "./",
90
+ ):
91
+ assert len(waveform.shape) == 1, "waveform should be 1D"
92
+ lengths = np.array([[len(waveform)]], dtype=np.int32)
93
+ if padding_duration:
+ # pad to the next multiple of padding_duration seconds
+ duration = len(waveform) / sample_rate
+ samples = np.zeros(
+ (
+ 1,
+ padding_duration
+ * sample_rate
+ * ((int(duration) // padding_duration) + 1),
101
+ ),
102
+ dtype=np.float32,
103
+ )
104
+
105
+ samples[0, : len(waveform)] = waveform
106
+ else:
107
+ samples = waveform
108
+
109
+ samples = samples.reshape(1, -1).astype(np.float32)
110
+
111
+ data = {
112
+ "inputs":[
113
+ {
114
+ "name": "reference_wav",
115
+ "shape": samples.shape,
116
+ "datatype": "FP32",
117
+ "data": samples.tolist()
118
+ },
119
+ {
120
+ "name": "reference_wav_len",
121
+ "shape": lengths.shape,
122
+ "datatype": "INT32",
123
+ "data": lengths.tolist(),
124
+ },
125
+ {
126
+ "name": "reference_text",
127
+ "shape": [1, 1],
128
+ "datatype": "BYTES",
129
+ "data": [reference_text]
130
+ },
131
+ {
132
+ "name": "target_text",
133
+ "shape": [1, 1],
134
+ "datatype": "BYTES",
135
+ "data": [target_text]
136
+ }
137
+ ]
138
+ }
139
+
140
+ return data
141
+
142
+ if __name__ == "__main__":
143
+ args = get_args()
144
+ server_url = args.server_url
145
+ if not server_url.startswith(("http://", "https://")):
146
+ server_url = f"http://{server_url}"
147
+
148
+ url = f"{server_url}/v2/models/{args.model_name}/infer"
149
+ waveform, sr = sf.read(args.reference_audio)
150
+ assert sr == 16000, "sample rate hardcoded in server"
151
+
152
+ samples = np.array(waveform, dtype=np.float32)
153
+ data = prepare_request(samples, args.reference_text, args.target_text)
154
+
155
+ rsp = requests.post(
156
+ url,
157
+ headers={"Content-Type": "application/json"},
158
+ json=data,
159
+ verify=False,
160
+ params={"request_id": '0'}
161
+ )
162
+ result = rsp.json()
163
+ audio = result["outputs"][0]["data"]
164
+ audio = np.array(audio, dtype=np.float32)
165
+ sf.write(args.output_audio, audio, 16000, "PCM_16")
runtime/triton_trtllm/docker-compose.yml ADDED
@@ -0,0 +1,20 @@
1
+ services:
2
+ tts:
3
+ image: soar97/triton-spark-tts:25.02
4
+ shm_size: '1gb'
5
+ ports:
6
+ - "8000:8000"
7
+ - "8001:8001"
8
+ - "8002:8002"
9
+ environment:
10
+ - PYTHONIOENCODING=utf-8
11
+ - MODEL_ID=${MODEL_ID}
12
+ deploy:
13
+ resources:
14
+ reservations:
15
+ devices:
16
+ - driver: nvidia
17
+ device_ids: ['0']
18
+ capabilities: [gpu]
19
+ command: >
20
+ /bin/bash -c "rm -rf Spark-TTS && git clone https://github.com/SparkAudio/Spark-TTS.git && cd Spark-TTS/runtime/triton_trtllm && bash run.sh 0 3"
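Because the container clones the repo and builds the TensorRT-LLM engine on first start, it can take a while before the server accepts requests. One way to wait is to poll Triton's standard KServe readiness endpoint on the mapped HTTP port (8000); a small polling sketch follows, with the base URL and timeout as assumptions.

```python
# Poll Triton's KServe readiness endpoint on the HTTP port mapped by docker-compose (8000).
# Base URL and timeout are assumptions; adjust to your deployment.
import time
import requests

def wait_until_ready(base_url: str = "http://localhost:8000", timeout_s: float = 600.0) -> bool:
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if requests.get(f"{base_url}/v2/health/ready", timeout=5).status_code == 200:
                return True
        except requests.RequestException:
            pass  # server still starting up (engine build can take several minutes)
        time.sleep(5)
    return False

print("server ready:", wait_until_ready())
```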
runtime/triton_trtllm/model_repo/audio_tokenizer/1/model.py ADDED
@@ -0,0 +1,137 @@
1
+ # Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+ import json
27
+ import torch
28
+ from torch.utils.dlpack import to_dlpack
29
+
30
+ import triton_python_backend_utils as pb_utils
31
+
32
+ import os
33
+ import numpy as np
34
+
35
+ from sparktts.models.audio_tokenizer import BiCodecTokenizer
36
+
37
+ class TritonPythonModel:
38
+ """Triton Python model for audio tokenization.
39
+
40
+ This model takes reference audio input and extracts semantic and global tokens
41
+ using BiCodec tokenizer.
42
+ """
43
+
44
+ def initialize(self, args):
45
+ """Initialize the model.
46
+
47
+ Args:
48
+ args: Dictionary containing model configuration
49
+ """
50
+ # Parse model parameters
51
+ parameters = json.loads(args['model_config'])['parameters']
52
+ model_params = {k: v["string_value"] for k, v in parameters.items()}
53
+
54
+ # Initialize tokenizer
55
+ self.device = torch.device("cuda")
56
+ self.audio_tokenizer = BiCodecTokenizer(model_params["model_dir"],
57
+ device=self.device)
58
+
59
+ def get_ref_clip(self, wav: np.ndarray) -> np.ndarray:
60
+ """Extract reference audio clip for speaker embedding.
61
+
62
+ Args:
63
+ wav: Input waveform array
64
+
65
+ Returns:
66
+ Reference clip of fixed duration
67
+ """
68
+ SAMPLE_RATE = 16000
69
+ REF_SEGMENT_DURATION = 6 # seconds
70
+ LATENT_HOP_LENGTH = 320
71
+
72
+ ref_segment_length = (
73
+ int(SAMPLE_RATE * REF_SEGMENT_DURATION)
74
+ // LATENT_HOP_LENGTH
75
+ * LATENT_HOP_LENGTH
76
+ )
77
+ wav_length = len(wav)
78
+
79
+ if ref_segment_length > wav_length:
80
+ # Repeat and truncate if input is too short
81
+ repeat_times = ref_segment_length // wav_length + 1
82
+ wav = np.tile(wav, repeat_times)
83
+
84
+ return wav[:ref_segment_length]
85
+
86
+ def execute(self, requests):
87
+ """Execute inference on the batched requests.
88
+
89
+ Args:
90
+ requests: List of inference requests
91
+
92
+ Returns:
93
+ List of inference responses containing tokenized outputs
94
+ """
95
+ reference_wav_list = []
96
+ reference_wav_ref_clip_list = []
97
+
98
+ # Process each request in batch
99
+ for request in requests:
100
+ # Extract input tensors
101
+ wav_array = pb_utils.get_input_tensor_by_name(
102
+ request, "reference_wav").as_numpy()
103
+ wav_len = pb_utils.get_input_tensor_by_name(
104
+ request, "reference_wav_len").as_numpy().item()
105
+
106
+ # Prepare inputs
107
+ wav = wav_array[:, :wav_len].squeeze(0)
108
+ reference_wav_list.append(wav)
109
+
110
+ wav_ref_clip = self.get_ref_clip(wav)
111
+ reference_wav_ref_clip_list.append(torch.from_numpy(wav_ref_clip))
112
+
113
+ # Batch process through tokenizer
114
+ ref_wav_clip_tensor = torch.stack(reference_wav_ref_clip_list, dim=0)
115
+ wav2vec2_features = self.audio_tokenizer.extract_wav2vec2_features(
116
+ reference_wav_list)
117
+
118
+ audio_tokenizer_input = {
119
+ "ref_wav": ref_wav_clip_tensor.to(self.device),
120
+ "feat": wav2vec2_features.to(self.device),
121
+ }
122
+ semantic_tokens, global_tokens = self.audio_tokenizer.model.tokenize(
123
+ audio_tokenizer_input)
124
+
125
+ # Prepare responses
126
+ responses = []
127
+ for i in range(len(requests)):
128
+ global_tokens_tensor = pb_utils.Tensor.from_dlpack(
129
+ "global_tokens", to_dlpack(global_tokens[i]))
130
+ semantic_tokens_tensor = pb_utils.Tensor.from_dlpack(
131
+ "semantic_tokens", to_dlpack(semantic_tokens[i]))
132
+
133
+ inference_response = pb_utils.InferenceResponse(
134
+ output_tensors=[global_tokens_tensor, semantic_tokens_tensor])
135
+ responses.append(inference_response)
136
+
137
+ return responses
runtime/triton_trtllm/model_repo/audio_tokenizer/config.pbtxt ADDED
@@ -0,0 +1,58 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "audio_tokenizer"
16
+ backend: "python"
17
+ max_batch_size: ${triton_max_batch_size}
18
+ dynamic_batching {
19
+ max_queue_delay_microseconds: ${max_queue_delay_microseconds}
20
+ }
21
+ parameters [
22
+ {
23
+ key: "model_dir",
24
+ value: {string_value:"${model_dir}"}
25
+ }
26
+ ]
27
+
28
+ input [
29
+ {
30
+ name: "reference_wav"
31
+ data_type: TYPE_FP32
32
+ dims: [-1]
33
+ },
34
+ {
35
+ name: "reference_wav_len"
36
+ data_type: TYPE_INT32
37
+ dims: [1]
38
+ }
39
+ ]
40
+ output [
41
+ {
42
+ name: "global_tokens"
43
+ data_type: TYPE_INT32
44
+ dims: [-1]
45
+ },
46
+ {
47
+ name: "semantic_tokens"
48
+ data_type: TYPE_INT32
49
+ dims: [-1]
50
+ }
51
+ ]
52
+
53
+ instance_group [
54
+ {
55
+ count: 1
56
+ kind: KIND_CPU
57
+ }
58
+ ]
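Since audio_tokenizer is a regular model in the served model_repo, it can also be queried on its own (assuming the server exposes it directly), which is handy for inspecting the global/semantic token IDs produced for a prompt. A sketch matching the input/output definitions above, with the server address and file path as placeholders:

```python
# Sketch: call the audio_tokenizer model directly to inspect its token outputs.
# Input/output names and dtypes follow the config.pbtxt above; URL and path are placeholders.
import numpy as np
import soundfile as sf
import tritonclient.grpc as grpcclient
from tritonclient.utils import np_to_triton_dtype

client = grpcclient.InferenceServerClient(url="localhost:8001")

wav, sr = sf.read("prompt.wav")  # 16 kHz mono, as assumed by the backend
samples = wav.reshape(1, -1).astype(np.float32)
lengths = np.array([[samples.shape[1]]], dtype=np.int32)

inputs = [
    grpcclient.InferInput("reference_wav", samples.shape, np_to_triton_dtype(samples.dtype)),
    grpcclient.InferInput("reference_wav_len", lengths.shape, np_to_triton_dtype(lengths.dtype)),
]
inputs[0].set_data_from_numpy(samples)
inputs[1].set_data_from_numpy(lengths)

result = client.infer(
    "audio_tokenizer",
    inputs,
    outputs=[
        grpcclient.InferRequestedOutput("global_tokens"),
        grpcclient.InferRequestedOutput("semantic_tokens"),
    ],
)
print("global_tokens shape:", result.as_numpy("global_tokens").shape)
print("semantic_tokens shape:", result.as_numpy("semantic_tokens").shape)
```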
runtime/triton_trtllm/model_repo/spark_tts/1/model.py ADDED
@@ -0,0 +1,404 @@
1
+ # Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+ import math
29
+ import os
30
+ import re
31
+ from typing import Dict, List, Tuple, Optional, Union
32
+
33
+ import numpy as np
34
+ import torch
35
+ from torch.utils.dlpack import from_dlpack, to_dlpack
36
+ import triton_python_backend_utils as pb_utils
37
+ from transformers import AutoTokenizer
38
+
39
+ from sparktts.utils.token_parser import TASK_TOKEN_MAP
40
+
41
+ def process_prompt(
42
+ text: str,
43
+ prompt_text: Optional[str] = None,
44
+ global_token_ids: torch.Tensor = None,
45
+ semantic_token_ids: torch.Tensor = None,
46
+ ) -> Tuple[str, torch.Tensor]:
47
+ """
48
+ Process input for voice cloning.
49
+
50
+ Args:
51
+ text: The text input to be converted to speech.
52
+ prompt_text: Transcript of the prompt audio.
53
+ global_token_ids: Global token IDs extracted from reference audio.
54
+ semantic_token_ids: Semantic token IDs extracted from reference audio.
55
+
56
+ Returns:
57
+ Tuple containing the formatted input prompt and global token IDs.
58
+ """
59
+ # Convert global tokens to string format
60
+ global_tokens = "".join(
61
+ [f"<|bicodec_global_{i}|>" for i in global_token_ids.squeeze()]
62
+ )
63
+
64
+
65
+ # Prepare the input tokens for the model
66
+ if prompt_text is not None:
67
+ # Include semantic tokens when prompt text is provided
68
+ semantic_tokens = "".join(
69
+ [f"<|bicodec_semantic_{i}|>" for i in semantic_token_ids.squeeze()]
70
+ )
71
+
72
+ inputs = [
73
+ TASK_TOKEN_MAP["tts"],
74
+ "<|start_content|>",
75
+ prompt_text,
76
+ text,
77
+ "<|end_content|>",
78
+ "<|start_global_token|>",
79
+ global_tokens,
80
+ "<|end_global_token|>",
81
+ "<|start_semantic_token|>",
82
+ semantic_tokens,
83
+ ]
84
+ else:
85
+ # Without prompt text, exclude semantic tokens
86
+ inputs = [
87
+ TASK_TOKEN_MAP["tts"],
88
+ "<|start_content|>",
89
+ text,
90
+ "<|end_content|>",
91
+ "<|start_global_token|>",
92
+ global_tokens,
93
+ "<|end_global_token|>",
94
+ ]
95
+
96
+ # Join all input components into a single string
97
+ inputs = "".join(inputs)
98
+ return inputs, global_token_ids
99
+
100
+
101
+ class TritonPythonModel:
102
+ """Triton Python model for Spark TTS.
103
+
104
+ This model orchestrates the end-to-end TTS pipeline by coordinating
105
+ between audio tokenizer, LLM, and vocoder components.
106
+ """
107
+
108
+ def initialize(self, args):
109
+ """Initialize the model.
110
+
111
+ Args:
112
+ args: Dictionary containing model configuration
113
+ """
114
+ self.logger = pb_utils.Logger
115
+ # Parse model parameters
116
+ self.model_config = json.loads(args['model_config'])
117
+ parameters = self.model_config['parameters']
118
+ model_params = {k: v["string_value"] for k, v in parameters.items()}
119
+ self.logger.log_info(f"model_params:{model_params}")
120
+ # streaming TTS parameters
121
+ assert (
122
+ float(model_params["audio_chunk_duration"]) >= 0.5
123
+ ), f"audio_chunk_duration at least 0.5 seconds"
124
+ self.audio_chunk_duration = float(model_params["audio_chunk_duration"])
125
+ self.max_audio_chunk_duration = float(model_params["max_audio_chunk_duration"])
126
+ assert (
127
+ float(model_params["audio_chunk_size_scale_factor"]) >= 1.0
128
+ ), "audio_chunk_size_scale_factor should be greater than 1, change it according to your actual rtf"
129
+ self.audio_chunk_size_scale_factor = float(model_params["audio_chunk_size_scale_factor"]) # scale speed
130
+ self.audio_chunk_overlap_duration = float(model_params["audio_chunk_overlap_duration"])
131
+ self.audio_tokenizer_frame_rate = int(model_params["audio_tokenizer_frame_rate"])
132
+
133
+ # Initialize tokenizer
134
+ llm_tokenizer_dir = model_params["llm_tokenizer_dir"]
135
+ self.tokenizer = AutoTokenizer.from_pretrained(llm_tokenizer_dir)
136
+ self.device = torch.device("cuda")
137
+ self.decoupled = pb_utils.using_decoupled_model_transaction_policy(self.model_config)
138
+
139
+ def forward_llm(self, input_ids):
140
+ """
141
+ Prepares the response from the language model based on the provided
142
+ inputs. Creates a `pb_utils.InferenceRequest` object with passed
143
+ `llm_request_inputs` to send to a decoupled TensorRTLLM model.
144
+ For each response from the language model:
145
+ - Checks for errors and raise an exception if any are found.
146
+ - Extracts the "output_ids" tensor from the response.
147
+ - Determines the finish reason based on the presence of the
148
+ end-of-sequence token or reaching the maximum length.
149
+ - Appends the generated token IDs to `output_ids`.
150
+ - If the finish reason is determined, decodes the output IDs to text
151
+ and prepares the final response.
152
+
153
+ The final response includes the generated text, finish reason,
154
+ completion tokens, prompt tokens, and total tokens.
155
+
156
+ Parameters
157
+ ----------
158
+ - llm_request_inputs (dict): A dictionary containing the inputs for the language model.
159
+
160
+ Returns
161
+ -------
162
+ - pb_utils.InferenceResponse: The response object containing the generated text and additional metadata.
163
+ """
164
+ # convert input_ids to numpy, with shape [1, sequence_length]
165
+ input_ids = input_ids.cpu().numpy()
166
+ max_tokens = 512
167
+ input_dict = {
168
+ "request_output_len": np.array([[max_tokens]], dtype=np.int32),
169
+ "end_id": np.array([[self.tokenizer.eos_token_id]], dtype=np.int32),
170
+ "pad_id": np.array([[self.tokenizer.pad_token_id]], dtype=np.int32),
171
+ "streaming": np.array([[self.decoupled]], dtype=np.bool_),
172
+ "runtime_top_p": np.array([[0.95]], dtype=np.float32),
173
+ "runtime_top_k": np.array([[50]], dtype=np.int32),
174
+ "temperature": np.array([[0.8]], dtype=np.float32),
175
+ "input_ids": input_ids,
176
+ "input_lengths": np.array([[input_ids.shape[1]]], dtype=np.int32),
177
+ }
178
+
179
+ # Convert inputs to Triton tensors
180
+ input_tensor_list = [
181
+ pb_utils.Tensor(k, v) for k, v in input_dict.items()
182
+ ]
183
+
184
+ # Create and execute inference request
185
+ llm_request = pb_utils.InferenceRequest(
186
+ model_name="tensorrt_llm",
187
+ requested_output_names=["output_ids", "sequence_length"],
188
+ inputs=input_tensor_list,
189
+ )
190
+
191
+ llm_responses = llm_request.exec(decoupled=self.decoupled)
192
+ if self.decoupled:
193
+ for llm_response in llm_responses:
194
+ if llm_response.has_error():
195
+ raise pb_utils.TritonModelException(llm_response.error().message())
196
+
197
+ # Extract and process output
198
+ output_ids = pb_utils.get_output_tensor_by_name(
199
+ llm_response, "output_ids").as_numpy()
200
+ seq_lens = pb_utils.get_output_tensor_by_name(
201
+ llm_response, "sequence_length").as_numpy()
202
+
203
+ # Get actual output IDs up to the sequence length
204
+ actual_output_ids = output_ids[0][0][:seq_lens[0][0]]
205
+
206
+ yield actual_output_ids
207
+ else:
208
+ llm_response = llm_responses
209
+ if llm_response.has_error():
210
+ raise pb_utils.TritonModelException(llm_response.error().message())
211
+
212
+ # Extract and process output
213
+ output_ids = pb_utils.get_output_tensor_by_name(
214
+ llm_response, "output_ids").as_numpy()
215
+ seq_lens = pb_utils.get_output_tensor_by_name(
216
+ llm_response, "sequence_length").as_numpy()
217
+
218
+ # Get actual output IDs up to the sequence length
219
+ actual_output_ids = output_ids[0][0][:seq_lens[0][0]]
220
+
221
+ yield actual_output_ids
222
+
223
+ def forward_audio_tokenizer(self, wav, wav_len):
224
+ """Forward pass through the audio tokenizer component.
225
+
226
+ Args:
227
+ wav: Input waveform tensor
228
+ wav_len: Waveform length tensor
229
+
230
+ Returns:
231
+ Tuple of global and semantic tokens
232
+ """
233
+ inference_request = pb_utils.InferenceRequest(
234
+ model_name='audio_tokenizer',
235
+ requested_output_names=['global_tokens', 'semantic_tokens'],
236
+ inputs=[wav, wav_len]
237
+ )
238
+
239
+ inference_response = inference_request.exec()
240
+ if inference_response.has_error():
241
+ raise pb_utils.TritonModelException(inference_response.error().message())
242
+
243
+ # Extract and convert output tensors
244
+ global_tokens = pb_utils.get_output_tensor_by_name(inference_response, 'global_tokens')
245
+ global_tokens = torch.utils.dlpack.from_dlpack(global_tokens.to_dlpack()).cpu()
246
+
247
+ semantic_tokens = pb_utils.get_output_tensor_by_name(inference_response, 'semantic_tokens')
248
+ semantic_tokens = torch.utils.dlpack.from_dlpack(semantic_tokens.to_dlpack()).cpu()
249
+
250
+ return global_tokens, semantic_tokens
251
+
252
+ def forward_vocoder(self, global_token_ids: torch.Tensor, pred_semantic_ids: torch.Tensor) -> torch.Tensor:
253
+ """Forward pass through the vocoder component.
254
+
255
+ Args:
256
+ global_token_ids: Global token IDs tensor
257
+ pred_semantic_ids: Predicted semantic token IDs tensor
258
+
259
+ Returns:
260
+ Generated waveform tensor
261
+ """
262
+ # Convert tensors to Triton format
263
+ global_token_ids_tensor = pb_utils.Tensor.from_dlpack("global_tokens", to_dlpack(global_token_ids))
264
+ pred_semantic_ids_tensor = pb_utils.Tensor.from_dlpack("semantic_tokens", to_dlpack(pred_semantic_ids))
265
+
266
+ # Create and execute inference request
267
+ inference_request = pb_utils.InferenceRequest(
268
+ model_name='vocoder',
269
+ requested_output_names=['waveform'],
270
+ inputs=[global_token_ids_tensor, pred_semantic_ids_tensor]
271
+ )
272
+
273
+ inference_response = inference_request.exec()
274
+ if inference_response.has_error():
275
+ raise pb_utils.TritonModelException(inference_response.error().message())
276
+
277
+ # Extract and convert output waveform
278
+ waveform = pb_utils.get_output_tensor_by_name(inference_response, 'waveform')
279
+ waveform = torch.utils.dlpack.from_dlpack(waveform.to_dlpack()).cpu()
280
+
281
+ return waveform
282
+
283
+ def token2wav(self, generated_token_ids, global_token_ids):
284
+ # Decode and extract semantic token IDs from generated text
285
+ predicted_text = self.tokenizer.batch_decode(
286
+ [generated_token_ids],
287
+ skip_special_tokens=True,
288
+ )[0]
289
+ pred_semantic_ids = (
290
+ torch.tensor(
291
+ [int(token) for token in re.findall(r"bicodec_semantic_(\d+)", predicted_text)]
292
+ )
293
+ .unsqueeze(0)
294
+ .to(torch.int32)
295
+ )
296
+
297
+ # Generate audio with vocoder
298
+ audio = self.forward_vocoder(
299
+ global_token_ids.to(self.device),
300
+ pred_semantic_ids.to(self.device),
301
+ )
302
+
303
+ return audio
304
+
305
+ def execute(self, requests):
306
+ """Execute inference on the batched requests.
307
+
308
+ Args:
309
+ requests: List of inference requests
310
+
311
+ Returns:
312
+ List of inference responses containing generated audio
313
+ """
314
+ responses = []
315
+
316
+ for request in requests:
317
+ # Extract input tensors
318
+ wav = pb_utils.get_input_tensor_by_name(request, "reference_wav")
319
+ wav_len = pb_utils.get_input_tensor_by_name(request, "reference_wav_len")
320
+
321
+ # Process reference audio through audio tokenizer
322
+ global_tokens, semantic_tokens = self.forward_audio_tokenizer(wav, wav_len)
323
+
324
+ # Extract text inputs
325
+ reference_text = pb_utils.get_input_tensor_by_name(request, "reference_text").as_numpy()
326
+ reference_text = reference_text[0][0].decode('utf-8')
327
+
328
+ target_text = pb_utils.get_input_tensor_by_name(request, "target_text").as_numpy()
329
+ target_text = target_text[0][0].decode('utf-8')
330
+
331
+ # Prepare prompt for LLM
332
+ prompt, global_token_ids = process_prompt(
333
+ text=target_text,
334
+ prompt_text=reference_text,
335
+ global_token_ids=global_tokens,
336
+ semantic_token_ids=semantic_tokens,
337
+ )
338
+
339
+
340
+ # Tokenize prompt for LLM
341
+ model_inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
342
+ input_ids = model_inputs.input_ids.to(torch.int32)
343
+
344
+ # Generate semantic tokens with LLM
345
+ generated_ids_iter = self.forward_llm(input_ids)
346
+
347
+ if self.decoupled:
348
+ response_sender = request.get_response_sender()
349
+ request_id = request.request_id()
350
+ semantic_token_ids_arr = []
351
+ max_chunk_size = math.ceil(self.max_audio_chunk_duration * self.audio_tokenizer_frame_rate)
352
+ chunk_size = math.ceil(self.audio_chunk_duration * self.audio_tokenizer_frame_rate)
353
+ overlap_chunk_size = math.ceil(self.audio_chunk_overlap_duration * self.audio_tokenizer_frame_rate)
354
+ self.logger.log_info(
355
+ f"[{request_id}] init chunk_size: {chunk_size} max_chunk_size: {max_chunk_size}"
356
+ )
357
+ for generated_ids in generated_ids_iter:
358
+ if generated_ids is None or len(generated_ids) == 0:
359
+ break
360
+
361
+ semantic_token_ids_arr.append(generated_ids)
362
+ if len(semantic_token_ids_arr) >= chunk_size:
363
+ chunk = semantic_token_ids_arr[:chunk_size]
364
+ generated_semantic_token_ids = np.hstack(chunk)
365
+ # Process each chunk
366
+ sub_tts_speech = self.token2wav(generated_semantic_token_ids, global_token_ids)
367
+ # Prepare response to send
368
+ audio_tensor = pb_utils.Tensor.from_dlpack("waveform", to_dlpack(sub_tts_speech))
369
+ inference_response = pb_utils.InferenceResponse(output_tensors=[audio_tensor])
370
+ response_sender.send(inference_response)
371
+
372
+ semantic_token_ids_arr = semantic_token_ids_arr[chunk_size - overlap_chunk_size:]
373
+ # increase chunk size for better speech quality
374
+ chunk_size = min(max_chunk_size, int(chunk_size * self.audio_chunk_size_scale_factor))
375
+ self.logger.log_info(f"[{request_id}] increase chunk_size: {chunk_size}")
376
+
377
+ if len(semantic_token_ids_arr) > 0: # flush the tokens left over at the end of the stream
378
+ generated_semantic_token_ids = np.hstack(semantic_token_ids_arr)
379
+ # Convert the remaining tokens to audio
380
+ sub_tts_speech = self.token2wav(generated_semantic_token_ids, global_token_ids)
381
+ # Prepare response to send
382
+ audio_tensor = pb_utils.Tensor.from_dlpack("waveform", to_dlpack(sub_tts_speech))
383
+ inference_response = pb_utils.InferenceResponse(output_tensors=[audio_tensor])
384
+ response_sender.send(inference_response)
385
+ self.logger.log_info(f"[{request_id}] last chunk len: {len(semantic_token_ids_arr)}")
386
+ else:
387
+ generated_ids = next(generated_ids_iter)
388
+ if generated_ids is None or len(generated_ids) == 0:
389
+ raise pb_utils.TritonModelException("Generated IDs is None or empty")
390
+
391
+ audio = self.token2wav(generated_ids, global_token_ids)
392
+
393
+ # Prepare response
394
+ audio_tensor = pb_utils.Tensor.from_dlpack("waveform", to_dlpack(audio))
395
+ inference_response = pb_utils.InferenceResponse(output_tensors=[audio_tensor])
396
+ responses.append(inference_response)
397
+
398
+ if self.decoupled:
399
+ response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
400
+ self.logger.log_info(f"[{request_id}] sent TRITONSERVER_RESPONSE_COMPLETE_FINAL flag")
401
+
402
+ if not self.decoupled:
403
+ return responses
404
+
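In decoupled mode the chunks sent above are decoded from token windows that overlap by `audio_chunk_overlap_duration`, so consecutive waveform chunks also overlap and a client is expected to blend them rather than simply concatenating. Below is a minimal client-side sketch of one way to do that with a linear crossfade; the 16 kHz rate, float32 chunks, and the assumption that every chunk is longer than the overlap are placeholders, and this is not the logic of the repo's `client_grpc.py`.

```python
# Hypothetical client-side stitching of streamed "waveform" chunks.
# Assumptions: 1-D float32 numpy chunks, 16 kHz output, each chunk longer
# than the overlap region; see client_grpc.py for the real client.
import numpy as np


def crossfade_concat(chunks, sample_rate=16000, overlap_duration=0.1):
    """Concatenate overlapping chunks, linearly crossfading the overlap."""
    overlap = int(sample_rate * overlap_duration)
    fade_in = np.linspace(0.0, 1.0, overlap, dtype=np.float32)
    fade_out = 1.0 - fade_in
    out = np.asarray(chunks[0], dtype=np.float32)
    for chunk in chunks[1:]:
        chunk = np.asarray(chunk, dtype=np.float32)
        head, tail = out[:-overlap], out[-overlap:]
        blended = tail * fade_out + chunk[:overlap] * fade_in
        out = np.concatenate([head, blended, chunk[overlap:]])
    return out
```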
runtime/triton_trtllm/model_repo/spark_tts/config.pbtxt ADDED
@@ -0,0 +1,86 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "spark_tts"
16
+ backend: "python"
17
+ max_batch_size: ${triton_max_batch_size}
18
+ dynamic_batching {
19
+ max_queue_delay_microseconds: ${max_queue_delay_microseconds}
20
+ }
21
+ model_transaction_policy {
22
+ decoupled: ${decoupled_mode}
23
+ }
24
+ parameters [
25
+ {
26
+ key: "llm_tokenizer_dir",
27
+ value: {string_value:"${llm_tokenizer_dir}"}
28
+ },
29
+ {
30
+ key: "audio_chunk_duration",
31
+ value: {string_value:"${audio_chunk_duration}"}
32
+ },
33
+ {
34
+ key: "audio_chunk_size_scale_factor",
35
+ value: {string_value:"${audio_chunk_size_scale_factor}"}
36
+ },
37
+ {
38
+ key: "max_audio_chunk_duration",
39
+ value: {string_value:"${max_audio_chunk_duration}"}
40
+ },
41
+ {
42
+ key: "audio_chunk_overlap_duration",
43
+ value: {string_value:"${audio_chunk_overlap_duration}"}
44
+ },
45
+ {
46
+ key: "audio_tokenizer_frame_rate",
47
+ value: {string_value:"50"}
48
+ }
49
+ ]
50
+
51
+ input [
52
+ {
53
+ name: "reference_wav"
54
+ data_type: TYPE_FP32
55
+ dims: [-1]
56
+ },
57
+ {
58
+ name: "reference_wav_len"
59
+ data_type: TYPE_INT32
60
+ dims: [1]
61
+ },
62
+ {
63
+ name: "reference_text"
64
+ data_type: TYPE_STRING
65
+ dims: [1]
66
+ },
67
+ {
68
+ name: "target_text"
69
+ data_type: TYPE_STRING
70
+ dims: [1]
71
+ }
72
+ ]
73
+ output [
74
+ {
75
+ name: "waveform"
76
+ data_type: TYPE_FP32
77
+ dims: [ -1 ]
78
+ }
79
+ ]
80
+
81
+ instance_group [
82
+ {
83
+ count: ${bls_instance_num}
84
+ kind: KIND_CPU
85
+ }
86
+ ]
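The config above is the single public entry point of the pipeline: four inputs (a float32 reference waveform, its length, and two string tensors) and one float32 `waveform` output. A minimal offline (non-decoupled) request could look like the sketch below; it uses the generic `tritonclient` HTTP API rather than the repo's `client_http.py`, and the server URL, audio path, and the assumption of a 16 kHz mono prompt are placeholders.

```python
# Hedged sketch of an offline request against the "spark_tts" BLS model.
# URL, file path, and 16 kHz mono input are assumptions; the repo's
# client_http.py / client_grpc.py are the reference clients.
import numpy as np
import soundfile as sf
import tritonclient.http as httpclient

waveform, sr = sf.read("example/prompt_audio.wav", dtype="float32")  # mono prompt

wav = waveform[np.newaxis, :]                               # [1, num_samples]
wav_len = np.array([[wav.shape[1]]], dtype=np.int32)        # [1, 1]
ref_text = np.array([["transcript of the prompt audio"]], dtype=object)
tgt_text = np.array([["text to synthesize"]], dtype=object)

inputs = [
    httpclient.InferInput("reference_wav", list(wav.shape), "FP32"),
    httpclient.InferInput("reference_wav_len", list(wav_len.shape), "INT32"),
    httpclient.InferInput("reference_text", list(ref_text.shape), "BYTES"),
    httpclient.InferInput("target_text", list(tgt_text.shape), "BYTES"),
]
for inp, arr in zip(inputs, [wav, wav_len, ref_text, tgt_text]):
    inp.set_data_from_numpy(arr)

client = httpclient.InferenceServerClient(url="localhost:8000")
result = client.infer("spark_tts", inputs)
audio = result.as_numpy("waveform")  # float32 samples produced by the vocoder
```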
runtime/triton_trtllm/model_repo/tensorrt_llm/1/.gitkeep ADDED
File without changes
runtime/triton_trtllm/model_repo/tensorrt_llm/config.pbtxt ADDED
@@ -0,0 +1,857 @@
1
+ # Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ name: "tensorrt_llm"
28
+ backend: "${triton_backend}"
29
+ max_batch_size: ${triton_max_batch_size}
30
+
31
+ model_transaction_policy {
32
+ decoupled: ${decoupled_mode}
33
+ }
34
+
35
+ dynamic_batching {
36
+ preferred_batch_size: [ ${triton_max_batch_size} ]
37
+ max_queue_delay_microseconds: ${max_queue_delay_microseconds}
38
+ default_queue_policy: { max_queue_size: ${max_queue_size} }
39
+ }
40
+
41
+ input [
42
+ {
43
+ name: "input_ids"
44
+ data_type: TYPE_INT32
45
+ dims: [ -1 ]
46
+ allow_ragged_batch: true
47
+ optional: true
48
+ },
49
+ {
50
+ name: "encoder_input_features"
51
+ data_type: ${encoder_input_features_data_type}
52
+ dims: [ -1, -1 ]
53
+ allow_ragged_batch: true
54
+ optional: true
55
+ },
56
+ {
57
+ name: "encoder_output_lengths"
58
+ data_type: TYPE_INT32
59
+ dims: [ 1 ]
60
+ reshape: { shape: [ ] }
61
+ optional: true
62
+ },
63
+ {
64
+ name: "input_lengths"
65
+ data_type: TYPE_INT32
66
+ dims: [ 1 ]
67
+ reshape: { shape: [ ] }
68
+ },
69
+ {
70
+ name: "request_output_len"
71
+ data_type: TYPE_INT32
72
+ dims: [ 1 ]
73
+ reshape: { shape: [ ] }
74
+ },
75
+ {
76
+ name: "num_return_sequences"
77
+ data_type: TYPE_INT32
78
+ dims: [ 1 ]
79
+ reshape: { shape: [ ] }
80
+ optional: true
81
+ },
82
+ {
83
+ name: "draft_input_ids"
84
+ data_type: TYPE_INT32
85
+ dims: [ -1 ]
86
+ optional: true
87
+ allow_ragged_batch: true
88
+ },
89
+ {
90
+ name: "decoder_input_ids"
91
+ data_type: TYPE_INT32
92
+ dims: [ -1 ]
93
+ optional: true
94
+ allow_ragged_batch: true
95
+ },
96
+ {
97
+ name: "decoder_input_lengths"
98
+ data_type: TYPE_INT32
99
+ dims: [ 1 ]
100
+ optional: true
101
+ reshape: { shape: [ ] }
102
+ },
103
+ {
104
+ name: "draft_logits"
105
+ data_type: ${logits_datatype}
106
+ dims: [ -1, -1 ]
107
+ optional: true
108
+ allow_ragged_batch: true
109
+ },
110
+ {
111
+ name: "draft_acceptance_threshold"
112
+ data_type: TYPE_FP32
113
+ dims: [ 1 ]
114
+ reshape: { shape: [ ] }
115
+ optional: true
116
+ },
117
+ {
118
+ name: "end_id"
119
+ data_type: TYPE_INT32
120
+ dims: [ 1 ]
121
+ reshape: { shape: [ ] }
122
+ optional: true
123
+ },
124
+ {
125
+ name: "pad_id"
126
+ data_type: TYPE_INT32
127
+ dims: [ 1 ]
128
+ reshape: { shape: [ ] }
129
+ optional: true
130
+ },
131
+ {
132
+ name: "stop_words_list"
133
+ data_type: TYPE_INT32
134
+ dims: [ 2, -1 ]
135
+ optional: true
136
+ allow_ragged_batch: true
137
+ },
138
+ {
139
+ name: "bad_words_list"
140
+ data_type: TYPE_INT32
141
+ dims: [ 2, -1 ]
142
+ optional: true
143
+ allow_ragged_batch: true
144
+ },
145
+ {
146
+ name: "embedding_bias"
147
+ data_type: TYPE_FP32
148
+ dims: [ -1 ]
149
+ optional: true
150
+ allow_ragged_batch: true
151
+ },
152
+ {
153
+ name: "beam_width"
154
+ data_type: TYPE_INT32
155
+ dims: [ 1 ]
156
+ reshape: { shape: [ ] }
157
+ optional: true
158
+ },
159
+ {
160
+ name: "temperature"
161
+ data_type: TYPE_FP32
162
+ dims: [ 1 ]
163
+ reshape: { shape: [ ] }
164
+ optional: true
165
+ },
166
+ {
167
+ name: "runtime_top_k"
168
+ data_type: TYPE_INT32
169
+ dims: [ 1 ]
170
+ reshape: { shape: [ ] }
171
+ optional: true
172
+ },
173
+ {
174
+ name: "runtime_top_p"
175
+ data_type: TYPE_FP32
176
+ dims: [ 1 ]
177
+ reshape: { shape: [ ] }
178
+ optional: true
179
+ },
180
+ {
181
+ name: "runtime_top_p_min"
182
+ data_type: TYPE_FP32
183
+ dims: [ 1 ]
184
+ reshape: { shape: [ ] }
185
+ optional: true
186
+ },
187
+ {
188
+ name: "runtime_top_p_decay"
189
+ data_type: TYPE_FP32
190
+ dims: [ 1 ]
191
+ reshape: { shape: [ ] }
192
+ optional: true
193
+ },
194
+ {
195
+ name: "runtime_top_p_reset_ids"
196
+ data_type: TYPE_INT32
197
+ dims: [ 1 ]
198
+ reshape: { shape: [ ] }
199
+ optional: true
200
+ },
201
+ {
202
+ name: "len_penalty"
203
+ data_type: TYPE_FP32
204
+ dims: [ 1 ]
205
+ reshape: { shape: [ ] }
206
+ optional: true
207
+ },
208
+ {
209
+ name: "early_stopping"
210
+ data_type: TYPE_BOOL
211
+ dims: [ 1 ]
212
+ reshape: { shape: [ ] }
213
+ optional: true
214
+ },
215
+ {
216
+ name: "repetition_penalty"
217
+ data_type: TYPE_FP32
218
+ dims: [ 1 ]
219
+ reshape: { shape: [ ] }
220
+ optional: true
221
+ },
222
+ {
223
+ name: "min_length"
224
+ data_type: TYPE_INT32
225
+ dims: [ 1 ]
226
+ reshape: { shape: [ ] }
227
+ optional: true
228
+ },
229
+ {
230
+ name: "beam_search_diversity_rate"
231
+ data_type: TYPE_FP32
232
+ dims: [ 1 ]
233
+ reshape: { shape: [ ] }
234
+ optional: true
235
+ },
236
+ {
237
+ name: "presence_penalty"
238
+ data_type: TYPE_FP32
239
+ dims: [ 1 ]
240
+ reshape: { shape: [ ] }
241
+ optional: true
242
+ },
243
+ {
244
+ name: "frequency_penalty"
245
+ data_type: TYPE_FP32
246
+ dims: [ 1 ]
247
+ reshape: { shape: [ ] }
248
+ optional: true
249
+ },
250
+ {
251
+ name: "random_seed"
252
+ data_type: TYPE_UINT64
253
+ dims: [ 1 ]
254
+ reshape: { shape: [ ] }
255
+ optional: true
256
+ },
257
+ {
258
+ name: "return_log_probs"
259
+ data_type: TYPE_BOOL
260
+ dims: [ 1 ]
261
+ reshape: { shape: [ ] }
262
+ optional: true
263
+ },
264
+ {
265
+ name: "return_context_logits"
266
+ data_type: TYPE_BOOL
267
+ dims: [ 1 ]
268
+ reshape: { shape: [ ] }
269
+ optional: true
270
+ },
271
+ {
272
+ name: "return_generation_logits"
273
+ data_type: TYPE_BOOL
274
+ dims: [ 1 ]
275
+ reshape: { shape: [ ] }
276
+ optional: true
277
+ },
278
+ {
279
+ name: "return_perf_metrics"
280
+ data_type: TYPE_BOOL
281
+ dims: [ 1 ]
282
+ reshape: { shape: [ ] }
283
+ optional: true
284
+ },
285
+ {
286
+ name: "exclude_input_in_output"
287
+ data_type: TYPE_BOOL
288
+ dims: [ 1 ]
289
+ reshape: { shape: [ ] }
290
+ optional: true
291
+ },
292
+ {
293
+ name: "stop"
294
+ data_type: TYPE_BOOL
295
+ dims: [ 1 ]
296
+ reshape: { shape: [ ] }
297
+ optional: true
298
+ },
299
+ {
300
+ name: "streaming"
301
+ data_type: TYPE_BOOL
302
+ dims: [ 1 ]
303
+ reshape: { shape: [ ] }
304
+ optional: true
305
+ },
306
+ {
307
+ name: "prompt_embedding_table"
308
+ data_type: TYPE_FP16
309
+ dims: [ -1, -1 ]
310
+ optional: true
311
+ allow_ragged_batch: true
312
+ },
313
+ {
314
+ name: "prompt_table_extra_ids"
315
+ data_type: TYPE_UINT64
316
+ dims: [ -1 ]
317
+ optional: true
318
+ allow_ragged_batch: true
319
+ },
320
+ {
321
+ name: "prompt_vocab_size"
322
+ data_type: TYPE_INT32
323
+ dims: [ 1 ]
324
+ reshape: { shape: [ ] }
325
+ optional: true
326
+ },
327
+ # cross_attention_mask shape `[bs, seq_len, num_images*num_tiles]`
328
+ {
329
+ name: "cross_attention_mask"
330
+ data_type: TYPE_BOOL
331
+ dims: [ -1, -1 ]
332
+ optional: true
333
+ allow_ragged_batch: true
334
+ },
335
+ # Mrope param when mrope is used
336
+ {
337
+ name: "mrope_rotary_cos_sin"
338
+ data_type: TYPE_FP32
339
+ dims: [ -1 ]
340
+ optional: true
341
+ },
342
+ {
343
+ name: "mrope_position_deltas"
344
+ data_type: TYPE_INT64
345
+ dims: [ 1 ]
346
+ optional: true
347
+ },
348
+ # the unique task ID for the given LoRA.
349
+ # To perform inference with a specific LoRA for the first time `lora_task_id` `lora_weights` and `lora_config` must all be given.
350
+ # The LoRA will be cached, so that subsequent requests for the same task only require `lora_task_id`.
351
+ # If the cache is full the oldest LoRA will be evicted to make space for new ones. An error is returned if `lora_task_id` is not cached.
352
+ {
353
+ name: "lora_task_id"
354
+ data_type: TYPE_UINT64
355
+ dims: [ 1 ]
356
+ reshape: { shape: [ ] }
357
+ optional: true
358
+ },
359
+ # weights for a lora adapter shape [ num_lora_modules_layers, D x Hi + Ho x D ]
360
+ # where the last dimension holds the in / out adapter weights for the associated module (e.g. attn_qkv) and model layer
361
+ # each of the in / out tensors are first flattened and then concatenated together in the format above.
362
+ # D=adapter_size (R value), Hi=hidden_size_in, Ho=hidden_size_out.
363
+ {
364
+ name: "lora_weights"
365
+ data_type: TYPE_FP16
366
+ dims: [ -1, -1 ]
367
+ optional: true
368
+ allow_ragged_batch: true
369
+ },
370
+ # module identifier (same size as the first dimension of lora_weights)
371
+ # See LoraModule::ModuleType for model id mapping
372
+ #
373
+ # "attn_qkv": 0 # combined qkv adapter
374
+ # "attn_q": 1 # q adapter
375
+ # "attn_k": 2 # k adapter
376
+ # "attn_v": 3 # v adapter
377
+ # "attn_dense": 4 # adapter for the dense layer in attention
378
+ # "mlp_h_to_4h": 5 # for llama2 adapter for gated mlp layer after attention / RMSNorm: up projection
379
+ # "mlp_4h_to_h": 6 # for llama2 adapter for gated mlp layer after attention / RMSNorm: down projection
380
+ # "mlp_gate": 7 # for llama2 adapter for gated mlp layer after attention / RMSNorm: gate
381
+ #
382
+ # last dim holds [ module_id, layer_idx, adapter_size (D aka R value) ]
383
+ {
384
+ name: "lora_config"
385
+ data_type: TYPE_INT32
386
+ dims: [ -1, 3 ]
387
+ optional: true
388
+ allow_ragged_batch: true
389
+ },
390
+ {
391
+ name: "context_phase_params"
392
+ data_type: TYPE_UINT8
393
+ dims: [ -1 ]
394
+ optional: true
395
+ allow_ragged_batch: true
396
+ },
397
+ # skip_cross_attn_blocks shape `[bs, 1]`, only used in mllama
398
+ {
399
+ name: "skip_cross_attn_blocks"
400
+ data_type: TYPE_BOOL
401
+ dims: [ 1 ]
402
+ optional: true
403
+ allow_ragged_batch: true
404
+ },
405
+ {
406
+ name: "retention_token_range_starts"
407
+ data_type: TYPE_INT32
408
+ dims: [ -1 ]
409
+ optional: true
410
+ allow_ragged_batch: true
411
+ },
412
+ {
413
+ name: "retention_token_range_ends"
414
+ data_type: TYPE_INT32
415
+ dims: [ -1 ]
416
+ optional: true
417
+ allow_ragged_batch: true
418
+ },
419
+ {
420
+ name: "retention_token_range_priorities"
421
+ data_type: TYPE_INT32
422
+ dims: [ -1 ]
423
+ optional: true
424
+ allow_ragged_batch: true
425
+ },
426
+ {
427
+ name: "retention_token_range_durations_ms"
428
+ data_type: TYPE_INT32
429
+ dims: [ -1 ]
430
+ optional: true
431
+ allow_ragged_batch: true
432
+ },
433
+ {
434
+ name: "retention_decode_priority"
435
+ data_type: TYPE_INT32
436
+ dims: [ 1 ]
437
+ optional: true
438
+ allow_ragged_batch: true
439
+ },
440
+ {
441
+ name: "retention_decode_duration_ms"
442
+ data_type: TYPE_INT32
443
+ dims: [ 1 ]
444
+ optional: true
445
+ allow_ragged_batch: true
446
+ },
447
+ {
448
+ name: "guided_decoding_guide_type"
449
+ data_type: TYPE_STRING
450
+ dims: [ 1 ]
451
+ optional: true
452
+ allow_ragged_batch: true
453
+ },
454
+ {
455
+ name: "guided_decoding_guide"
456
+ data_type: TYPE_STRING
457
+ dims: [ 1 ]
458
+ optional: true
459
+ allow_ragged_batch: true
460
+ },
461
+ {
462
+ name: "lookahead_window_size"
463
+ data_type: TYPE_INT32
464
+ dims: [ 1 ]
465
+ optional: true
466
+ allow_ragged_batch: true
467
+ },
468
+ {
469
+ name: "lookahead_ngram_size"
470
+ data_type: TYPE_INT32
471
+ dims: [ 1 ]
472
+ optional: true
473
+ allow_ragged_batch: true
474
+ },
475
+ {
476
+ name: "lookahead_verification_set_size"
477
+ data_type: TYPE_INT32
478
+ dims: [ 1 ]
479
+ optional: true
480
+ allow_ragged_batch: true
481
+ }
482
+ ]
483
+ output [
484
+ {
485
+ name: "output_ids"
486
+ data_type: TYPE_INT32
487
+ dims: [ -1, -1 ]
488
+ },
489
+ {
490
+ name: "sequence_length"
491
+ data_type: TYPE_INT32
492
+ dims: [ -1 ]
493
+ },
494
+ {
495
+ name: "cum_log_probs"
496
+ data_type: TYPE_FP32
497
+ dims: [ -1 ]
498
+ },
499
+ {
500
+ name: "output_log_probs"
501
+ data_type: TYPE_FP32
502
+ dims: [ -1, -1 ]
503
+ },
504
+ {
505
+ name: "context_logits"
506
+ data_type: ${logits_datatype}
507
+ dims: [ -1, -1 ]
508
+ },
509
+ {
510
+ name: "generation_logits"
511
+ data_type: ${logits_datatype}
512
+ dims: [ -1, -1, -1 ]
513
+ },
514
+ {
515
+ name: "batch_index"
516
+ data_type: TYPE_INT32
517
+ dims: [ 1 ]
518
+ },
519
+ {
520
+ name: "sequence_index"
521
+ data_type: TYPE_INT32
522
+ dims: [ 1 ]
523
+ },
524
+ {
525
+ name: "context_phase_params"
526
+ data_type: TYPE_UINT8
527
+ dims: [ -1 ]
528
+ },
529
+ {
530
+ name: "kv_cache_alloc_new_blocks"
531
+ data_type: TYPE_INT32
532
+ dims: [ 1 ]
533
+ },
534
+ {
535
+ name: "kv_cache_reused_blocks"
536
+ data_type: TYPE_INT32
537
+ dims: [ 1 ]
538
+ },
539
+ {
540
+ name: "kv_cache_alloc_total_blocks"
541
+ data_type: TYPE_INT32
542
+ dims: [ 1 ]
543
+ },
544
+ {
545
+ name: "arrival_time_ns"
546
+ data_type: TYPE_INT64
547
+ dims: [ 1 ]
548
+ },
549
+ {
550
+ name: "first_scheduled_time_ns"
551
+ data_type: TYPE_INT64
552
+ dims: [ 1 ]
553
+ },
554
+ {
555
+ name: "first_token_time_ns"
556
+ data_type: TYPE_INT64
557
+ dims: [ 1 ]
558
+ },
559
+ {
560
+ name: "last_token_time_ns"
561
+ data_type: TYPE_INT64
562
+ dims: [ 1 ]
563
+ },
564
+ {
565
+ name: "acceptance_rate"
566
+ data_type: TYPE_FP32
567
+ dims: [ 1 ]
568
+ },
569
+ {
570
+ name: "total_accepted_draft_tokens"
571
+ data_type: TYPE_INT32
572
+ dims: [ 1 ]
573
+ },
574
+ {
575
+ name: "total_draft_tokens"
576
+ data_type: TYPE_INT32
577
+ dims: [ 1 ]
578
+ }
579
+ ]
580
+ instance_group [
581
+ {
582
+ count: 1
583
+ kind : KIND_CPU
584
+ }
585
+ ]
586
+ parameters: {
587
+ key: "max_beam_width"
588
+ value: {
589
+ string_value: "${max_beam_width}"
590
+ }
591
+ }
592
+ parameters: {
593
+ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
594
+ value: {
595
+ string_value: "no"
596
+ }
597
+ }
598
+ parameters: {
599
+ key: "gpt_model_type"
600
+ value: {
601
+ string_value: "${batching_strategy}"
602
+ }
603
+ }
604
+ parameters: {
605
+ key: "gpt_model_path"
606
+ value: {
607
+ string_value: "${engine_dir}"
608
+ }
609
+ }
610
+ parameters: {
611
+ key: "encoder_model_path"
612
+ value: {
613
+ string_value: "${encoder_engine_dir}"
614
+ }
615
+ }
616
+ parameters: {
617
+ key: "max_tokens_in_paged_kv_cache"
618
+ value: {
619
+ string_value: "${max_tokens_in_paged_kv_cache}"
620
+ }
621
+ }
622
+ parameters: {
623
+ key: "max_attention_window_size"
624
+ value: {
625
+ string_value: "${max_attention_window_size}"
626
+ }
627
+ }
628
+ parameters: {
629
+ key: "sink_token_length"
630
+ value: {
631
+ string_value: "${sink_token_length}"
632
+ }
633
+ }
634
+ parameters: {
635
+ key: "batch_scheduler_policy"
636
+ value: {
637
+ string_value: "${batch_scheduler_policy}"
638
+ }
639
+ }
640
+ parameters: {
641
+ key: "kv_cache_free_gpu_mem_fraction"
642
+ value: {
643
+ string_value: "${kv_cache_free_gpu_mem_fraction}"
644
+ }
645
+ }
646
+ parameters: {
647
+ key: "cross_kv_cache_fraction"
648
+ value: {
649
+ string_value: "${cross_kv_cache_fraction}"
650
+ }
651
+ }
652
+ parameters: {
653
+ key: "kv_cache_host_memory_bytes"
654
+ value: {
655
+ string_value: "${kv_cache_host_memory_bytes}"
656
+ }
657
+ }
658
+ # kv_cache_onboard_blocks is for internal implementation.
659
+ parameters: {
660
+ key: "kv_cache_onboard_blocks"
661
+ value: {
662
+ string_value: "${kv_cache_onboard_blocks}"
663
+ }
664
+ }
665
+ # enable_trt_overlap is deprecated and doesn't have any effect on the runtime
666
+ # parameters: {
667
+ # key: "enable_trt_overlap"
668
+ # value: {
669
+ # string_value: "${enable_trt_overlap}"
670
+ # }
671
+ # }
672
+ parameters: {
673
+ key: "exclude_input_in_output"
674
+ value: {
675
+ string_value: "${exclude_input_in_output}"
676
+ }
677
+ }
678
+ parameters: {
679
+ key: "cancellation_check_period_ms"
680
+ value: {
681
+ string_value: "${cancellation_check_period_ms}"
682
+ }
683
+ }
684
+ parameters: {
685
+ key: "stats_check_period_ms"
686
+ value: {
687
+ string_value: "${stats_check_period_ms}"
688
+ }
689
+ }
690
+ parameters: {
691
+ key: "iter_stats_max_iterations"
692
+ value: {
693
+ string_value: "${iter_stats_max_iterations}"
694
+ }
695
+ }
696
+ parameters: {
697
+ key: "request_stats_max_iterations"
698
+ value: {
699
+ string_value: "${request_stats_max_iterations}"
700
+ }
701
+ }
702
+ parameters: {
703
+ key: "enable_kv_cache_reuse"
704
+ value: {
705
+ string_value: "${enable_kv_cache_reuse}"
706
+ }
707
+ }
708
+ parameters: {
709
+ key: "normalize_log_probs"
710
+ value: {
711
+ string_value: "${normalize_log_probs}"
712
+ }
713
+ }
714
+ parameters: {
715
+ key: "enable_chunked_context"
716
+ value: {
717
+ string_value: "${enable_chunked_context}"
718
+ }
719
+ }
720
+ parameters: {
721
+ key: "gpu_device_ids"
722
+ value: {
723
+ string_value: "${gpu_device_ids}"
724
+ }
725
+ }
726
+ parameters: {
727
+ key: "participant_ids"
728
+ value: {
729
+ string_value: "${participant_ids}"
730
+ }
731
+ }
732
+ parameters: {
733
+ key: "lora_cache_optimal_adapter_size"
734
+ value: {
735
+ string_value: "${lora_cache_optimal_adapter_size}"
736
+ }
737
+ }
738
+ parameters: {
739
+ key: "lora_cache_max_adapter_size"
740
+ value: {
741
+ string_value: "${lora_cache_max_adapter_size}"
742
+ }
743
+ }
744
+ parameters: {
745
+ key: "lora_cache_gpu_memory_fraction"
746
+ value: {
747
+ string_value: "${lora_cache_gpu_memory_fraction}"
748
+ }
749
+ }
750
+ parameters: {
751
+ key: "lora_cache_host_memory_bytes"
752
+ value: {
753
+ string_value: "${lora_cache_host_memory_bytes}"
754
+ }
755
+ }
756
+ parameters: {
757
+ key: "lora_prefetch_dir"
758
+ value: {
759
+ string_value: "${lora_prefetch_dir}"
760
+ }
761
+ }
762
+ parameters: {
763
+ key: "decoding_mode"
764
+ value: {
765
+ string_value: "${decoding_mode}"
766
+ }
767
+ }
768
+ parameters: {
769
+ key: "executor_worker_path"
770
+ value: {
771
+ string_value: "/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker"
772
+ }
773
+ }
774
+ parameters: {
775
+ key: "lookahead_window_size"
776
+ value: {
777
+ string_value: "${lookahead_window_size}"
778
+ }
779
+ }
780
+ parameters: {
781
+ key: "lookahead_ngram_size"
782
+ value: {
783
+ string_value: "${lookahead_ngram_size}"
784
+ }
785
+ }
786
+ parameters: {
787
+ key: "lookahead_verification_set_size"
788
+ value: {
789
+ string_value: "${lookahead_verification_set_size}"
790
+ }
791
+ }
792
+ parameters: {
793
+ key: "medusa_choices"
794
+ value: {
795
+ string_value: "${medusa_choices}"
796
+ }
797
+ }
798
+ parameters: {
799
+ key: "eagle_choices"
800
+ value: {
801
+ string_value: "${eagle_choices}"
802
+ }
803
+ }
804
+ parameters: {
805
+ key: "gpu_weights_percent"
806
+ value: {
807
+ string_value: "${gpu_weights_percent}"
808
+ }
809
+ }
810
+ parameters: {
811
+ key: "enable_context_fmha_fp32_acc"
812
+ value: {
813
+ string_value: "${enable_context_fmha_fp32_acc}"
814
+ }
815
+ }
816
+ parameters: {
817
+ key: "multi_block_mode"
818
+ value: {
819
+ string_value: "${multi_block_mode}"
820
+ }
821
+ }
822
+ parameters: {
823
+ key: "cuda_graph_mode"
824
+ value: {
825
+ string_value: "${cuda_graph_mode}"
826
+ }
827
+ }
828
+ parameters: {
829
+ key: "cuda_graph_cache_size"
830
+ value: {
831
+ string_value: "${cuda_graph_cache_size}"
832
+ }
833
+ }
834
+ parameters: {
835
+ key: "speculative_decoding_fast_logits"
836
+ value: {
837
+ string_value: "${speculative_decoding_fast_logits}"
838
+ }
839
+ }
840
+ parameters: {
841
+ key: "tokenizer_dir"
842
+ value: {
843
+ string_value: "${tokenizer_dir}"
844
+ }
845
+ }
846
+ parameters: {
847
+ key: "guided_decoding_backend"
848
+ value: {
849
+ string_value: "${guided_decoding_backend}"
850
+ }
851
+ }
852
+ parameters: {
853
+ key: "xgrammar_tokenizer_info_path"
854
+ value: {
855
+ string_value: "${xgrammar_tokenizer_info_path}"
856
+ }
857
+ }
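Most of the optional inputs declared above are generic TensorRT-LLM backend features (LoRA, speculative decoding, mllama cross-attention, KV-cache retention, and so on) that this TTS pipeline leaves unset. For orientation, a Python-backend (BLS) caller only needs to populate a handful of them; the sketch below is a simplified stand-in for `forward_llm` in `spark_tts/1/model.py`, and the exact tensors that model sets may differ.

```python
# Hedged sketch of a BLS-side request to the "tensorrt_llm" model above;
# see spark_tts/1/model.py forward_llm() for the authoritative version.
import numpy as np
import triton_python_backend_utils as pb_utils


def request_llm(input_ids, max_new_tokens, end_id, decoupled):
    """input_ids: int32 numpy array of shape [1, prompt_len]."""
    inputs = [
        pb_utils.Tensor("input_ids", input_ids.astype(np.int32)),
        pb_utils.Tensor("input_lengths",
                        np.array([[input_ids.shape[1]]], dtype=np.int32)),
        pb_utils.Tensor("request_output_len",
                        np.array([[max_new_tokens]], dtype=np.int32)),
        pb_utils.Tensor("end_id", np.array([[end_id]], dtype=np.int32)),
        pb_utils.Tensor("streaming", np.array([[decoupled]], dtype=np.bool_)),
    ]
    llm_request = pb_utils.InferenceRequest(
        model_name="tensorrt_llm",
        requested_output_names=["output_ids", "sequence_length"],
        inputs=inputs,
    )
    # decoupled=True yields an iterable stream of responses, False a single one.
    return llm_request.exec(decoupled=decoupled)
```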
runtime/triton_trtllm/model_repo/vocoder/1/model.py ADDED
@@ -0,0 +1,106 @@
1
+ # Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # Redistribution and use in source and binary forms, with or without
4
+ # modification, are permitted provided that the following conditions
5
+ # are met:
6
+ # * Redistributions of source code must retain the above copyright
7
+ # notice, this list of conditions and the following disclaimer.
8
+ # * Redistributions in binary form must reproduce the above copyright
9
+ # notice, this list of conditions and the following disclaimer in the
10
+ # documentation and/or other materials provided with the distribution.
11
+ # * Neither the name of NVIDIA CORPORATION nor the names of its
12
+ # contributors may be used to endorse or promote products derived
13
+ # from this software without specific prior written permission.
14
+ #
15
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23
+ # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
+
27
+ import json
28
+ import os
29
+ import logging
30
+ from typing import List, Dict
31
+
32
+ import torch
33
+ from torch.utils.dlpack import to_dlpack
34
+
35
+ import triton_python_backend_utils as pb_utils
36
+
37
+ from sparktts.models.bicodec import BiCodec
38
+
39
+ # Configure logging
40
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
41
+ logger = logging.getLogger(__name__)
42
+
43
+ class TritonPythonModel:
44
+ """Triton Python model for vocoder.
45
+
46
+ This model takes global and semantic tokens as input and generates audio waveforms
47
+ using the BiCodec vocoder.
48
+ """
49
+
50
+ def initialize(self, args):
51
+ """Initialize the model.
52
+
53
+ Args:
54
+ args: Dictionary containing model configuration
55
+ """
56
+ # Parse model parameters
57
+ parameters = json.loads(args['model_config'])['parameters']
58
+ model_params = {key: value["string_value"] for key, value in parameters.items()}
59
+ model_dir = model_params["model_dir"]
60
+
61
+ # Initialize device and vocoder
62
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
63
+ logger.info(f"Initializing vocoder from {model_dir} on {self.device}")
64
+
65
+ self.vocoder = BiCodec.load_from_checkpoint(f"{model_dir}/BiCodec")
66
+ del self.vocoder.encoder, self.vocoder.postnet
67
+ self.vocoder.eval().to(self.device) # Set model to evaluation mode
68
+
69
+ logger.info("Vocoder initialized successfully")
70
+
71
+
72
+ def execute(self, requests):
73
+ """Execute inference on the batched requests.
74
+
75
+ Args:
76
+ requests: List of inference requests
77
+
78
+ Returns:
79
+ List of inference responses containing generated waveforms
80
+ """
81
+ global_tokens_list, semantic_tokens_list = [], []
82
+
83
+ # Process each request in batch
84
+ for request in requests:
85
+ global_tokens_tensor = pb_utils.get_input_tensor_by_name(request, "global_tokens").as_numpy()
86
+ semantic_tokens_tensor = pb_utils.get_input_tensor_by_name(request, "semantic_tokens").as_numpy()
87
+ global_tokens_list.append(torch.from_numpy(global_tokens_tensor).to(self.device))
88
+ semantic_tokens_list.append(torch.from_numpy(semantic_tokens_tensor).to(self.device))
89
+
90
+ # Concatenate tokens for batch processing
91
+ global_tokens = torch.cat(global_tokens_list, dim=0)
92
+ semantic_tokens = torch.cat(semantic_tokens_list, dim=0)
93
+
94
+
95
+ # Generate waveforms
96
+ with torch.no_grad():
97
+ wavs = self.vocoder.detokenize(semantic_tokens, global_tokens.unsqueeze(1))
98
+
99
+ # Prepare responses
100
+ responses = []
101
+ for i in range(len(requests)):
102
+ wav_tensor = pb_utils.Tensor.from_dlpack("waveform", to_dlpack(wavs[i]))
103
+ inference_response = pb_utils.InferenceResponse(output_tensors=[wav_tensor])
104
+ responses.append(inference_response)
105
+
106
+ return responses
runtime/triton_trtllm/model_repo/vocoder/config.pbtxt ADDED
@@ -0,0 +1,53 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: "vocoder"
16
+ backend: "python"
17
+ max_batch_size: ${triton_max_batch_size}
18
+ dynamic_batching {
19
+ max_queue_delay_microseconds: ${max_queue_delay_microseconds}
20
+ }
21
+ parameters [
22
+ {
23
+ key: "model_dir",
24
+ value: {string_value:"${model_dir}"}
25
+ }
26
+ ]
27
+
28
+ input [
29
+ {
30
+ name: "global_tokens"
31
+ data_type: TYPE_INT32
32
+ dims: [-1]
33
+ },
34
+ {
35
+ name: "semantic_tokens"
36
+ data_type: TYPE_INT32
37
+ dims: [-1]
38
+ }
39
+ ]
40
+ output [
41
+ {
42
+ name: "waveform"
43
+ data_type: TYPE_FP32
44
+ dims: [ -1 ]
45
+ }
46
+ ]
47
+
48
+ instance_group [
49
+ {
50
+ count: 1
51
+ kind: KIND_CPU
52
+ }
53
+ ]
runtime/triton_trtllm/run.sh ADDED
@@ -0,0 +1,109 @@
1
+ export PYTHONPATH=../../../Spark-TTS/
2
+ export CUDA_VISIBLE_DEVICES=0
3
+ stage=$1
4
+ stop_stage=$2
5
+ service_type=$3
6
+ echo "Start stage: $stage, Stop stage: $stop_stage, Service type: $service_type"
7
+
8
+ huggingface_model_local_dir=../../pretrained_models/Spark-TTS-0.5B
9
+ trt_dtype=bfloat16
10
+ trt_weights_dir=./tllm_checkpoint_${trt_dtype}
11
+ trt_engines_dir=./trt_engines_${trt_dtype}
12
+
13
+ model_repo=./model_repo_test
14
+
15
+ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
16
+ echo "Downloading Spark-TTS-0.5B from HuggingFace"
17
+ huggingface-cli download SparkAudio/Spark-TTS-0.5B --local-dir $huggingface_model_local_dir || exit 1
18
+ fi
19
+
20
+
21
+ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
22
+ echo "Converting checkpoint to TensorRT weights"
23
+ python scripts/convert_checkpoint.py --model_dir $huggingface_model_local_dir/LLM \
24
+ --output_dir $trt_weights_dir \
25
+ --dtype $trt_dtype || exit 1
26
+
27
+ echo "Building TensorRT engines"
28
+ trtllm-build --checkpoint_dir $trt_weights_dir \
29
+ --output_dir $trt_engines_dir \
30
+ --max_batch_size 16 \
31
+ --max_num_tokens 32768 \
32
+ --gemm_plugin $trt_dtype || exit 1
33
+ fi
34
+
35
+ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
36
+ echo "Creating model repository"
37
+ rm -rf $model_repo
38
+ mkdir -p $model_repo
39
+ spark_tts_dir="spark_tts"
40
+
41
+ cp -r ./model_repo/${spark_tts_dir} $model_repo
42
+ cp -r ./model_repo/audio_tokenizer $model_repo
43
+ cp -r ./model_repo/tensorrt_llm $model_repo
44
+ cp -r ./model_repo/vocoder $model_repo
45
+
46
+ ENGINE_PATH=$trt_engines_dir
47
+ MAX_QUEUE_DELAY_MICROSECONDS=0
48
+ MODEL_DIR=$huggingface_model_local_dir
49
+ LLM_TOKENIZER_DIR=$huggingface_model_local_dir/LLM
50
+ BLS_INSTANCE_NUM=4
51
+ TRITON_MAX_BATCH_SIZE=16
52
+ # streaming TTS parameters
53
+ AUDIO_CHUNK_DURATION=1.0
54
+ MAX_AUDIO_CHUNK_DURATION=30.0
55
+ AUDIO_CHUNK_SIZE_SCALE_FACTOR=8.0
56
+ AUDIO_CHUNK_OVERLAP_DURATION=0.1
57
+ python3 scripts/fill_template.py -i ${model_repo}/vocoder/config.pbtxt model_dir:${MODEL_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}
58
+ python3 scripts/fill_template.py -i ${model_repo}/audio_tokenizer/config.pbtxt model_dir:${MODEL_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS}
59
+ if [ "$service_type" == "streaming" ]; then
60
+ DECOUPLED_MODE=True
61
+ else
62
+ DECOUPLED_MODE=False
63
+ fi
64
+ python3 scripts/fill_template.py -i ${model_repo}/${spark_tts_dir}/config.pbtxt bls_instance_num:${BLS_INSTANCE_NUM},llm_tokenizer_dir:${LLM_TOKENIZER_DIR},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},audio_chunk_duration:${AUDIO_CHUNK_DURATION},max_audio_chunk_duration:${MAX_AUDIO_CHUNK_DURATION},audio_chunk_size_scale_factor:${AUDIO_CHUNK_SIZE_SCALE_FACTOR},audio_chunk_overlap_duration:${AUDIO_CHUNK_OVERLAP_DURATION}
65
+ python3 scripts/fill_template.py -i ${model_repo}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},max_beam_width:1,engine_dir:${ENGINE_PATH},max_tokens_in_paged_kv_cache:2560,max_attention_window_size:2560,kv_cache_free_gpu_mem_fraction:0.5,exclude_input_in_output:True,enable_kv_cache_reuse:False,batching_strategy:inflight_fused_batching,max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32
66
+
67
+ fi
68
+
69
+ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
70
+ echo "Starting Triton server"
71
+ tritonserver --model-repository ${model_repo}
72
+ fi
73
+
74
+
75
+ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
76
+ echo "Running benchmark client"
77
+ num_task=2
78
+ if [ "$service_type" == "streaming" ]; then
79
+ mode="streaming"
80
+ else
81
+ mode="offline"
82
+ fi
83
+ python3 client_grpc.py \
84
+ --server-addr localhost \
85
+ --model-name spark_tts \
86
+ --num-tasks $num_task \
87
+ --mode $mode \
88
+ --log-dir ./log_concurrent_tasks_${num_task}_${mode}_new
89
+ fi
90
+
91
+ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
92
+ echo "Running single utterance client"
93
+ if [ "$service_type" == "streaming" ]; then
94
+ python client_grpc.py \
95
+ --server-addr localhost \
96
+ --reference-audio ../../example/prompt_audio.wav \
97
+ --reference-text "吃燕窝就选燕之屋,本节目由26年专注高品质燕窝的燕之屋冠名播出。豆奶牛奶换着喝,营养更均衡,本节目由豆本豆豆奶特约播出。" \
98
+ --target-text "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。" \
99
+ --model-name spark_tts \
100
+ --chunk-overlap-duration 0.1 \
101
+ --mode streaming
102
+ else
103
+ python client_http.py \
104
+ --reference-audio ../../example/prompt_audio.wav \
105
+ --reference-text "吃燕窝就选燕之屋,本节目由26年专注高品质燕窝的燕之屋冠名播出。豆奶牛奶换着喝,营养更均衡,本节目由豆本豆豆奶特约播出。" \
106
+ --target-text "身临其境,换新体验。塑造开源语音合成新范式,让智能语音更自然。" \
107
+ --model-name spark_tts
108
+ fi
109
+ fi
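The streaming parameters exported in stage 2 map onto the chunk schedule used by the spark_tts BLS: 50 semantic tokens per second, a chunk size that grows by AUDIO_CHUNK_SIZE_SCALE_FACTOR after each chunk, capped by MAX_AUDIO_CHUNK_DURATION, with a small token overlap between chunks. The sketch below just reproduces that arithmetic for the values above.

```python
# Chunk-size schedule implied by the stage-2 streaming parameters
# (mirrors the decoupled branch of spark_tts/1/model.py).
import math

frame_rate = 50                           # audio_tokenizer_frame_rate
chunk = math.ceil(1.0 * frame_rate)       # AUDIO_CHUNK_DURATION -> 50 tokens (~1 s)
max_chunk = math.ceil(30.0 * frame_rate)  # MAX_AUDIO_CHUNK_DURATION -> 1500 tokens
overlap = math.ceil(0.1 * frame_rate)     # AUDIO_CHUNK_OVERLAP_DURATION -> 5 tokens
scale = 8.0                               # AUDIO_CHUNK_SIZE_SCALE_FACTOR

schedule = [chunk]
for _ in range(3):
    chunk = min(max_chunk, int(chunk * scale))
    schedule.append(chunk)

print(schedule)  # [50, 400, 1500, 1500]: a short first chunk, then much larger ones
print(overlap)   # 5 tokens (~0.1 s) re-decoded at each chunk boundary
```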
runtime/triton_trtllm/scripts/convert_checkpoint.py ADDED
@@ -0,0 +1,335 @@
1
+ import argparse
2
+ import os
3
+ import time
4
+ import traceback
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+
7
+ from transformers import AutoConfig
8
+
9
+ import tensorrt_llm
10
+ from tensorrt_llm._utils import release_gc
11
+ from tensorrt_llm.logger import logger
12
+ from tensorrt_llm.mapping import Mapping
13
+ from tensorrt_llm.models import QWenForCausalLM
14
+ from tensorrt_llm.models.modeling_utils import QuantConfig
15
+ from tensorrt_llm.quantization import QuantAlgo
16
+
17
+
18
+ def parse_arguments():
19
+ parser = argparse.ArgumentParser()
20
+ parser.add_argument('--model_dir', type=str, default=None, required=True)
21
+ parser.add_argument('--tp_size',
22
+ type=int,
23
+ default=1,
24
+ help='N-way tensor parallelism size')
25
+ parser.add_argument('--pp_size',
26
+ type=int,
27
+ default=1,
28
+ help='N-way pipeline parallelism size')
29
+ parser.add_argument(
30
+ '--dtype',
31
+ type=str,
32
+ default='auto',
33
+ choices=['auto', 'float16', 'bfloat16', 'float32'],
34
+ help=
35
+ "The data type for the model weights and activations if not quantized. "
36
+ "If 'auto', the data type is automatically inferred from the source model; "
37
+ "however, if the source dtype is float32, it is converted to float16.")
38
+ parser.add_argument(
39
+ '--use_weight_only',
40
+ default=False,
41
+ action="store_true",
42
+ help='Quantize weights for the various GEMMs to INT4/INT8.'
43
+ ' See --weight_only_precision to set the precision')
44
+ parser.add_argument(
45
+ '--disable_weight_only_quant_plugin',
46
+ default=False,
47
+ action="store_true",
48
+ help=
49
+ 'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.'
50
+ ' You must also use --use_weight_only for that argument to have an impact.'
51
+ )
52
+ parser.add_argument(
53
+ '--weight_only_precision',
54
+ const='int8',
55
+ type=str,
56
+ nargs='?',
57
+ default='int8',
58
+ choices=['int8', 'int4', 'int4_gptq'],
59
+ help=
60
+ 'Define the precision for the weights when using weight-only quantization.'
61
+ ' You must also use --use_weight_only for that argument to have an impact.'
62
+ )
63
+ parser.add_argument(
64
+ '--calib_dataset',
65
+ type=str,
66
+ default='ccdv/cnn_dailymail',
67
+ help=
68
+ "The huggingface dataset name or the local directory of the dataset for calibration."
69
+ )
70
+ parser.add_argument(
71
+ "--smoothquant",
72
+ "-sq",
73
+ type=float,
74
+ default=None,
75
+ help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)"
76
+ " to Smoothquant the model, and output int8 weights."
77
+ " A good first try is 0.5. Must be in [0, 1]")
78
+ parser.add_argument(
79
+ '--per_channel',
80
+ action="store_true",
81
+ default=False,
82
+ help=
83
+ 'By default, we use a single static scaling factor for the GEMM\'s result. '
84
+ 'per_channel instead uses a different static scaling factor for each channel. '
85
+ 'The latter is usually more accurate, but a little slower.')
86
+ parser.add_argument(
87
+ '--per_token',
88
+ action="store_true",
89
+ default=False,
90
+ help=
91
+ 'By default, we use a single static scaling factor to scale activations in the int8 range. '
92
+ 'per_token chooses at run time, and for each token, a custom scaling factor. '
93
+ 'The latter is usually more accurate, but a little slower.')
94
+ parser.add_argument(
95
+ '--int8_kv_cache',
96
+ default=False,
97
+ action="store_true",
98
+ help=
99
+ 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV'
100
+ )
101
+ parser.add_argument(
102
+ '--per_group',
103
+ default=False,
104
+ action="store_true",
105
+ help=
106
+ 'By default, we use a single static scaling factor to scale weights in the int4 range. '
107
+ 'per_group chooses at run time, and for each group, a custom scaling factor. '
108
+ 'The flag is built for GPTQ/AWQ quantization.')
109
+
110
+ parser.add_argument('--group_size',
111
+ type=int,
112
+ default=128,
113
+ help='Group size used in GPTQ quantization.')
114
+
115
+ parser.add_argument("--load_model_on_cpu", action="store_true")
116
+ parser.add_argument(
117
+ '--use_parallel_embedding',
118
+ action="store_true",
119
+ default=False,
120
+ help=
121
+ 'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled'
122
+ )
123
+ parser.add_argument(
124
+ '--embedding_sharding_dim',
125
+ type=int,
126
+ default=0,
127
+ choices=[0, 1],
128
+ help=
129
+ 'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). '
130
+ 'To shard it along hidden dimension, set embedding_sharding_dim=1'
131
+ 'Note: embedding sharing is only enabled when embedding_sharding_dim = 0'
132
+ )
133
+ parser.add_argument('--output_dir',
134
+ type=str,
135
+ default='tllm_checkpoint',
136
+ help='The path to save the TensorRT-LLM checkpoint')
137
+ parser.add_argument(
138
+ '--workers',
139
+ type=int,
140
+ default=1,
141
+ help='The number of workers for converting checkpoint in parallel')
142
+ parser.add_argument(
143
+ '--moe_tp_size',
144
+ type=int,
145
+ default=-1,
146
+ help=
147
+ 'N-way tensor parallelism size for MOE, default is tp_size, which will do tp-only for MoE'
148
+ )
149
+ parser.add_argument(
150
+ '--moe_ep_size',
151
+ type=int,
152
+ default=-1,
153
+ help=
154
+ 'N-way expert parallelism size for MOE, default is 1, which will do tp-only for MoE'
155
+ )
156
+ args = parser.parse_args()
157
+ return args
158
+
159
+
160
+ def args_to_quant_config(args: argparse.Namespace) -> QuantConfig:
161
+ '''return config dict with quantization info based on the command line args
162
+ '''
163
+ quant_config = QuantConfig()
164
+ if args.use_weight_only:
165
+ if args.weight_only_precision == 'int8':
166
+ quant_config.quant_algo = QuantAlgo.W8A16
167
+ elif args.weight_only_precision == 'int4':
168
+ quant_config.quant_algo = QuantAlgo.W4A16
169
+ elif args.smoothquant:
170
+ quant_config.smoothquant_val = args.smoothquant
171
+ if args.per_channel:
172
+ if args.per_token:
173
+ quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN
174
+ else:
175
+ quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN
176
+ else:
177
+ if args.per_token:
178
+ quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN
179
+ else:
180
+ quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN
181
+
182
+ if args.int8_kv_cache:
183
+ quant_config.kv_cache_quant_algo = QuantAlgo.INT8
184
+
185
+ if args.weight_only_precision == 'int4_gptq':
186
+ quant_config.group_size = args.group_size
187
+ quant_config.has_zero_point = True
188
+ quant_config.pre_quant_scale = False
189
+ quant_config.quant_algo = QuantAlgo.W4A16_GPTQ
190
+
191
+ return quant_config
192
+
193
+
194
+ def update_quant_config_from_hf(quant_config, hf_config,
195
+ override_fields) -> tuple[QuantConfig, dict]:
196
+ hf_config_dict = hf_config.to_dict()
197
+ if hf_config_dict.get('quantization_config'):
198
+ # update the quant_algo, and clamp_val.
199
+ if hf_config_dict['quantization_config'].get('quant_method') == 'awq':
200
+ logger.info(
201
+ "Load quantization configs from huggingface model_config.")
202
+ quant_config.quant_algo = QuantAlgo.W4A16_GPTQ
203
+ quant_config.group_size = hf_config_dict['quantization_config'].get(
204
+ 'group_size', 128)
205
+ quant_config.has_zero_point = hf_config_dict[
206
+ 'quantization_config'].get('zero_point', False)
207
+ override_fields.update({"use_autoawq": True})
208
+ elif hf_config_dict['quantization_config'].get(
209
+ 'quant_method') == 'gptq':
210
+ logger.info(
211
+ "Load quantization configs from huggingface model_config.")
212
+ desc_act = hf_config_dict['quantization_config'].get(
213
+ 'desc_act', False)
214
+ if desc_act:
215
+ raise ValueError("GPTQ with desc_act=True is not implemented!")
216
+ quant_config.quant_algo = QuantAlgo.W4A16_GPTQ
217
+ quant_config.group_size = hf_config_dict['quantization_config'].get(
218
+ 'group_size', 128)
219
+ quant_config.has_zero_point = hf_config_dict[
220
+ 'quantization_config'].get('sym', False)
221
+ return quant_config, override_fields
222
+
223
+
224
+ def args_to_build_options(args):
225
+ return {
226
+ 'use_parallel_embedding': args.use_parallel_embedding,
227
+ 'embedding_sharding_dim': args.embedding_sharding_dim,
228
+ 'disable_weight_only_quant_plugin':
229
+ args.disable_weight_only_quant_plugin
230
+ }
231
+
232
+
233
+ def convert_and_save_hf(args):
234
+ model_dir = args.model_dir
235
+ world_size = args.tp_size * args.pp_size
236
+ # Need to convert the cli args to key-value pairs and override them in the generated config dict.
237
+ # Ideally these fields will be moved out of the config and passed into the build API; keep them here for compatibility for now,
238
+ # before the refactor is done.
239
+ override_fields = {}
240
+ override_fields.update(args_to_build_options(args))
241
+ quant_config = args_to_quant_config(args)
242
+
243
+ try:
244
+ hf_config = AutoConfig.from_pretrained(model_dir,
245
+ trust_remote_code=True)
246
+ quant_config, override_fields = update_quant_config_from_hf(
247
+ quant_config, hf_config, override_fields)
248
+ except Exception:
249
+ logger.warning("AutoConfig cannot load the huggingface config.")
250
+
251
+ if args.smoothquant is not None or args.int8_kv_cache:
252
+ mapping = Mapping(
253
+ world_size=world_size,
254
+ tp_size=args.tp_size,
255
+ pp_size=args.pp_size,
256
+ moe_tp_size=args.moe_tp_size,
257
+ moe_ep_size=args.moe_ep_size,
258
+ )
259
+ QWenForCausalLM.quantize(args.model_dir,
260
+ args.output_dir,
261
+ dtype=args.dtype,
262
+ mapping=mapping,
263
+ quant_config=quant_config,
264
+ calib_dataset=args.calib_dataset,
265
+ **override_fields)
266
+ else:
267
+
268
+ def convert_and_save_rank(args, rank):
269
+ mapping = Mapping(world_size=world_size,
270
+ rank=rank,
271
+ tp_size=args.tp_size,
272
+ pp_size=args.pp_size,
273
+ moe_tp_size=args.moe_tp_size,
274
+ moe_ep_size=args.moe_ep_size)
275
+ qwen = QWenForCausalLM.from_hugging_face(model_dir,
276
+ args.dtype,
277
+ mapping=mapping,
278
+ quant_config=quant_config,
279
+ **override_fields)
280
+ qwen.save_checkpoint(args.output_dir, save_config=(rank == 0))
281
+ del qwen
282
+
283
+ execute(args.workers, [convert_and_save_rank] * world_size, args)
284
+ release_gc()
285
+
286
+
287
+ def execute(workers, func, args):
288
+ if workers == 1:
289
+ for rank, f in enumerate(func):
290
+ f(args, rank)
291
+ else:
292
+ with ThreadPoolExecutor(max_workers=workers) as p:
293
+ futures = [p.submit(f, args, rank) for rank, f in enumerate(func)]
294
+ exceptions = []
295
+ for future in as_completed(futures):
296
+ try:
297
+ future.result()
298
+ except Exception as e:
299
+ traceback.print_exc()
300
+ exceptions.append(e)
301
+ assert len(
302
+ exceptions
303
+ ) == 0, "Checkpoint conversion failed, please check error log."
304
+
305
+
306
+ def main():
307
+ print(tensorrt_llm.__version__)
308
+ args = parse_arguments()
309
+
310
+ if (args.moe_tp_size == -1 and args.moe_ep_size == -1):
311
+ # moe default to tp-only
312
+ args.moe_tp_size = args.tp_size
313
+ args.moe_ep_size = 1
314
+ elif (args.moe_tp_size == -1):
315
+ args.moe_tp_size = args.tp_size // args.moe_ep_size
316
+ elif (args.moe_ep_size == -1):
317
+ args.moe_ep_size = args.tp_size // args.moe_tp_size
318
+ assert (args.moe_tp_size * args.moe_ep_size == args.tp_size
319
+ ), "moe_tp_size * moe_ep_size must equal tp_size"
320
+
321
+ tik = time.time()
322
+
323
+ if not os.path.exists(args.output_dir):
324
+ os.makedirs(args.output_dir)
325
+
326
+ assert args.model_dir is not None
327
+ convert_and_save_hf(args)
328
+
329
+ tok = time.time()
330
+ t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
331
+ print(f'Total time of converting checkpoints: {t}')
332
+
333
+
334
+ if __name__ == '__main__':
335
+ main()
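The quantization flags above are folded into a TensorRT-LLM QuantConfig by args_to_quant_config; for example, --use_weight_only with the default int8 precision selects W8A16. A hedged sketch of exercising that mapping directly is shown below; the import path and the Namespace fields mirror parse_arguments() but are assumptions, and tensorrt_llm must be installed for the import to succeed.

```python
# Hedged sketch: call args_to_quant_config() outside the CLI.
# Assumes it is run from runtime/triton_trtllm/ so that scripts/ is importable.
from argparse import Namespace

from scripts.convert_checkpoint import args_to_quant_config

args = Namespace(
    use_weight_only=True,
    weight_only_precision="int8",  # default precision -> QuantAlgo.W8A16
    smoothquant=None,
    per_channel=False,
    per_token=False,
    int8_kv_cache=False,
    per_group=False,
    group_size=128,
)
print(args_to_quant_config(args).quant_algo)  # expected: QuantAlgo.W8A16
```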
runtime/triton_trtllm/scripts/fill_template.py ADDED
@@ -0,0 +1,70 @@
1
+ #! /usr/bin/env python3
2
+ from argparse import ArgumentParser
3
+ from string import Template
4
+
5
+
6
+ def split(string, delimiter):
7
+ """Split a string using delimiter. Supports escaping.
8
+
9
+ Args:
10
+ string (str): The string to split.
11
+ delimiter (str): The delimiter to split the string with.
12
+
13
+ Returns:
14
+ list: A list of strings.
15
+ """
16
+ result = []
17
+ current = ""
18
+ escape = False
19
+ for char in string:
20
+ if escape:
21
+ current += char
22
+ escape = False
23
+ elif char == delimiter:
24
+ result.append(current)
25
+ current = ""
26
+ elif char == "\\":
27
+ escape = True
28
+ else:
29
+ current += char
30
+ result.append(current)
31
+ return result
32
+
33
+
34
+ def main(file_path, substitutions, in_place):
35
+ with open(file_path) as f:
36
+ pbtxt = Template(f.read())
37
+
38
+ sub_dict = {
39
+ "max_queue_size": 0,
40
+ 'max_queue_delay_microseconds': 0,
41
+ }
42
+ for sub in split(substitutions, ","):
43
+ key, value = split(sub, ":")
44
+ sub_dict[key] = value
45
+
46
+ assert key in pbtxt.template, f"key '{key}' does not exist in the file {file_path}."
47
+
48
+ pbtxt = pbtxt.safe_substitute(sub_dict)
49
+
50
+ if in_place:
51
+ with open(file_path, "w") as f:
52
+ f.write(pbtxt)
53
+ else:
54
+ print(pbtxt)
55
+
56
+
57
+ if __name__ == "__main__":
58
+ parser = ArgumentParser()
59
+ parser.add_argument("file_path", help="path of the .pbtxt to modify")
60
+ parser.add_argument(
61
+ "substitutions",
62
+ help=
63
+ "substitutions to perform, in the format variable_name_1:value_1,variable_name_2:value_2..."
64
+ )
65
+ parser.add_argument("--in_place",
66
+ "-i",
67
+ action="store_true",
68
+ help="do the operation in-place")
69
+ args = parser.parse_args()
70
+ main(**vars(args))
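run.sh drives this script with comma-separated key:value pairs, and split() above allows backslash-escaping when a value itself contains a comma or colon. The sketch below replays the same substitution flow on an in-memory template instead of an on-disk config.pbtxt; the import path is an assumption.

```python
# Minimal illustration of the substitution performed by this script;
# assumes scripts/ is importable so fill_template.split can be reused.
from string import Template

from scripts.fill_template import split

pbtxt = Template('max_batch_size: ${triton_max_batch_size}\n'
                 'parameters [ { key: "model_dir", value: {string_value:"${model_dir}"} } ]')

substitutions = "triton_max_batch_size:16,model_dir:/models/Spark-TTS-0.5B"
sub_dict = {"max_queue_size": 0, "max_queue_delay_microseconds": 0}
for sub in split(substitutions, ","):
    key, value = split(sub, ":")
    sub_dict[key] = value

print(pbtxt.safe_substitute(sub_dict))
```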
sparktts/models/audio_tokenizer.py ADDED
@@ -0,0 +1,163 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import torch
18
+ import numpy as np
19
+
20
+ from pathlib import Path
21
+ from typing import Any, Dict, Tuple
22
+ from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
23
+
24
+ from sparktts.utils.file import load_config
25
+ from sparktts.utils.audio import load_audio
26
+ from sparktts.models.bicodec import BiCodec
27
+
28
+
29
+ class BiCodecTokenizer:
30
+ """BiCodec tokenizer for handling audio input and tokenization."""
31
+
32
+ def __init__(self, model_dir: Path, device: torch.device = None, **kwargs):
33
+ super().__init__()
34
+ """
35
+ Args:
36
+ model_dir: Path to the model directory.
37
+ device: Device to run the model on (default is GPU if available).
38
+ """
39
+ self.device = device
40
+ self.model_dir = model_dir
41
+ self.config = load_config(f"{model_dir}/config.yaml")
42
+ self._initialize_model()
43
+
44
+ def _initialize_model(self):
45
+ """Load and initialize the BiCodec model and Wav2Vec2 feature extractor."""
46
+ self.model = BiCodec.load_from_checkpoint(f"{self.model_dir}/BiCodec").to(
47
+ self.device
48
+ )
49
+ self.processor = Wav2Vec2FeatureExtractor.from_pretrained(
50
+ f"{self.model_dir}/wav2vec2-large-xlsr-53"
51
+ )
52
+ self.feature_extractor = Wav2Vec2Model.from_pretrained(
53
+ f"{self.model_dir}/wav2vec2-large-xlsr-53"
54
+ ).to(self.device)
55
+ self.feature_extractor.config.output_hidden_states = True
56
+
57
+ def get_ref_clip(self, wav: np.ndarray) -> np.ndarray:
58
+ """Get reference audio clip for speaker embedding."""
59
+ ref_segment_length = (
60
+ int(self.config["sample_rate"] * self.config["ref_segment_duration"])
61
+ // self.config["latent_hop_length"]
62
+ * self.config["latent_hop_length"]
63
+ )
64
+ wav_length = len(wav)
65
+
66
+ if ref_segment_length > wav_length:
67
+ # Repeat and truncate to handle insufficient length
68
+ wav = np.tile(wav, ref_segment_length // wav_length + 1)
69
+
70
+ return wav[:ref_segment_length]
71
+
72
+ def process_audio(self, wav_path: Path) -> Tuple[np.ndarray, torch.Tensor]:
73
+ """load auido and get reference audio from wav path"""
74
+ wav = load_audio(
75
+ wav_path,
76
+ sampling_rate=self.config["sample_rate"],
77
+ volume_normalize=self.config["volume_normalize"],
78
+ )
79
+
80
+ wav_ref = self.get_ref_clip(wav)
81
+
82
+ wav_ref = torch.from_numpy(wav_ref).unsqueeze(0).float()
83
+ return wav, wav_ref
84
+
85
+ def extract_wav2vec2_features(self, wavs: torch.Tensor) -> torch.Tensor:
86
+ """extract wav2vec2 features"""
87
+ inputs = self.processor(
88
+ wavs,
89
+ sampling_rate=16000,
90
+ return_tensors="pt",
91
+ padding=True,
92
+ output_hidden_states=True,
93
+ ).input_values
94
+ feat = self.feature_extractor(inputs.to(self.feature_extractor.device))
95
+ feats_mix = (
96
+ feat.hidden_states[11] + feat.hidden_states[14] + feat.hidden_states[16]
97
+ ) / 3
98
+
99
+ return feats_mix
100
+
101
+ def tokenize_batch(self, batch: Dict[str, Any]) -> torch.Tensor:
102
+ """tokenize the batch of audio
103
+
104
+ Args:
105
+ batch:
106
+ wavs (List[np.ndarray]): batch of audio
107
+ ref_wavs (torch.Tensor): reference audio. shape: (batch_size, seq_len)
108
+
109
+ Returns:
110
+ global_tokens: global tokens. shape: (batch_size, seq_len, global_dim)
111
+ semantic_tokens: semantic tokens. shape: (batch_size, seq_len, latent_dim)
112
+ """
113
+ feats = self.extract_wav2vec2_features(batch["wav"])
114
+ batch["feat"] = feats
115
+ semantic_tokens, global_tokens = self.model.tokenize(batch)
116
+
117
+ return global_tokens, semantic_tokens
118
+
119
+ def tokenize(self, audio_path: str) -> Tuple[torch.Tensor, torch.Tensor]:
120
+ """tokenize the audio"""
121
+ wav, ref_wav = self.process_audio(audio_path)
122
+ feat = self.extract_wav2vec2_features(wav)
123
+ batch = {
124
+ "wav": torch.from_numpy(wav).unsqueeze(0).float().to(self.device),
125
+ "ref_wav": ref_wav.to(self.device),
126
+ "feat": feat.to(self.device),
127
+ }
128
+ semantic_tokens, global_tokens = self.model.tokenize(batch)
129
+
130
+ return global_tokens, semantic_tokens
131
+
132
+ def detokenize(
133
+ self, global_tokens: torch.Tensor, semantic_tokens: torch.Tensor
134
+ ) -> np.ndarray:
135
+ """detokenize the tokens to waveform
136
+
137
+ Args:
138
+ global_tokens: global tokens. shape: (batch_size, global_dim)
139
+ semantic_tokens: semantic tokens. shape: (batch_size, latent_dim)
140
+
141
+ Returns:
142
+ wav_rec: waveform. shape: (batch_size, seq_len) for batch or (seq_len,) for single
143
+ """
144
+ global_tokens = global_tokens.unsqueeze(1)
145
+ wav_rec = self.model.detokenize(semantic_tokens, global_tokens)
146
+ return wav_rec.detach().squeeze().cpu().numpy()
147
+
148
+
149
+ # test
150
+ if __name__ == "__main__":
151
+ import soundfile as sf
152
+
153
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
154
+ tokenizer = BiCodecTokenizer(
155
+ model_dir="pretrained_models/Spark-TTS-0.5B",
156
+ device=device,
157
+ )
158
+ wav_path = "example/prompt_audio.wav"
159
+
160
+ global_tokens, semantic_tokens = tokenizer.tokenize(wav_path)
161
+
162
+ wav_rec = tokenizer.detokenize(global_tokens.squeeze(0), semantic_tokens)
163
+ sf.write("example/prompt_recon.wav", wav_rec, 16000)
sparktts/models/bicodec.py ADDED
@@ -0,0 +1,247 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ from pathlib import Path
19
+ from typing import Dict, Any
20
+ from omegaconf import DictConfig
21
+ from safetensors.torch import load_file
22
+
23
+ from sparktts.utils.file import load_config
24
+ from sparktts.modules.speaker.speaker_encoder import SpeakerEncoder
25
+ from sparktts.modules.encoder_decoder.feat_encoder import Encoder
26
+ from sparktts.modules.encoder_decoder.feat_decoder import Decoder
27
+ from sparktts.modules.encoder_decoder.wave_generator import WaveGenerator
28
+ from sparktts.modules.vq.factorized_vector_quantize import FactorizedVectorQuantize
29
+
30
+
31
+ class BiCodec(nn.Module):
32
+ """
33
+ BiCodec model for speech synthesis, incorporating a speaker encoder, feature encoder/decoder,
34
+ quantizer, and wave generator.
35
+ """
36
+
37
+ def __init__(
38
+ self,
39
+ mel_params: Dict[str, Any],
40
+ encoder: nn.Module,
41
+ decoder: nn.Module,
42
+ quantizer: nn.Module,
43
+ speaker_encoder: nn.Module,
44
+ prenet: nn.Module,
45
+ postnet: nn.Module,
46
+ **kwargs
47
+ ) -> None:
48
+ """
49
+ Initializes the BiCodec model with the required components.
50
+
51
+ Args:
52
+ mel_params (dict): Parameters for the mel-spectrogram transformer.
53
+ encoder (nn.Module): Encoder module.
54
+ decoder (nn.Module): Decoder module.
55
+ quantizer (nn.Module): Quantizer module.
56
+ speaker_encoder (nn.Module): Speaker encoder module.
57
+ prenet (nn.Module): Prenet network.
58
+ postnet (nn.Module): Postnet network.
59
+ """
60
+ super().__init__()
61
+ self.encoder = encoder
62
+ self.decoder = decoder
63
+ self.quantizer = quantizer
64
+ self.speaker_encoder = speaker_encoder
65
+ self.prenet = prenet
66
+ self.postnet = postnet
67
+ self.init_mel_transformer(mel_params)
68
+
69
+ @classmethod
70
+ def load_from_checkpoint(cls, model_dir: Path, **kwargs) -> "BiCodec":
71
+ """
72
+ Loads the model from a checkpoint.
73
+
74
+ Args:
75
+ model_dir (Path): Path to the model directory containing checkpoint and config.
76
+
77
+ Returns:
78
+ BiCodec: The initialized BiCodec model.
79
+ """
80
+ ckpt_path = f'{model_dir}/model.safetensors'
81
+ config = load_config(f'{model_dir}/config.yaml')['audio_tokenizer']
82
+ mel_params = config["mel_params"]
83
+ encoder = Encoder(**config["encoder"])
84
+ quantizer = FactorizedVectorQuantize(**config["quantizer"])
85
+ prenet = Decoder(**config["prenet"])
86
+ postnet = Decoder(**config["postnet"])
87
+ decoder = WaveGenerator(**config["decoder"])
88
+ speaker_encoder = SpeakerEncoder(**config["speaker_encoder"])
89
+
90
+ model = cls(
91
+ mel_params=mel_params,
92
+ encoder=encoder,
93
+ decoder=decoder,
94
+ quantizer=quantizer,
95
+ speaker_encoder=speaker_encoder,
96
+ prenet=prenet,
97
+ postnet=postnet,
98
+ )
99
+
100
+ state_dict = load_file(ckpt_path)
101
+ missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
102
+
103
+ for key in missing_keys:
104
+ print(f"Missing tensor: {key}")
105
+ for key in unexpected_keys:
106
+ print(f"Unexpected tensor: {key}")
107
+
108
+ model.eval()
109
+ model.remove_weight_norm()
110
+
111
+ return model
112
+
113
+ def forward(self, batch: Dict[str, Any]) -> Dict[str, Any]:
114
+ """
115
+ Performs a forward pass through the model.
116
+
117
+ Args:
118
+ batch (dict): A dictionary containing features, reference waveform, and target waveform.
119
+
120
+ Returns:
121
+ dict: A dictionary containing the reconstruction, features, and other metrics.
122
+ """
123
+ feat = batch["feat"]
124
+ mel = self.mel_transformer(batch["ref_wav"]).squeeze(1)
125
+
126
+ z = self.encoder(feat.transpose(1, 2))
127
+ vq_outputs = self.quantizer(z)
128
+
129
+ x_vector, d_vector = self.speaker_encoder(mel.transpose(1, 2))
130
+
131
+ conditions = d_vector
132
+ with_speaker_loss = False
133
+
134
+ x = self.prenet(vq_outputs["z_q"], conditions)
135
+ pred_feat = self.postnet(x)
136
+ x = x + conditions.unsqueeze(-1)
137
+ wav_recon = self.decoder(x)
138
+
139
+ return {
140
+ "vq_loss": vq_outputs["vq_loss"],
141
+ "perplexity": vq_outputs["perplexity"],
142
+ "cluster_size": vq_outputs["active_num"],
143
+ "recons": wav_recon,
144
+ "pred_feat": pred_feat,
145
+ "x_vector": x_vector,
146
+ "d_vector": d_vector,
147
+ "audios": batch["wav"].unsqueeze(1),
148
+ "with_speaker_loss": with_speaker_loss,
149
+ }
150
+
151
+ @torch.no_grad()
152
+ def tokenize(self, batch: Dict[str, Any]):
153
+ """
154
+ Tokenizes the input audio into semantic and global tokens.
155
+
156
+ Args:
157
+ batch (dict): The input audio features and reference waveform.
158
+
159
+ Returns:
160
+ tuple: Semantic tokens and global tokens.
161
+ """
162
+ feat = batch["feat"]
163
+ mel = self.mel_transformer(batch["ref_wav"]).squeeze(1)
164
+
165
+ z = self.encoder(feat.transpose(1, 2))
166
+ semantic_tokens = self.quantizer.tokenize(z)
167
+ global_tokens = self.speaker_encoder.tokenize(mel.transpose(1, 2))
168
+
169
+ return semantic_tokens, global_tokens
170
+
171
+ @torch.no_grad()
172
+ def detokenize(self, semantic_tokens, global_tokens):
173
+ """
174
+ Detokenizes the semantic and global tokens into a waveform.
175
+
176
+ Args:
177
+ semantic_tokens (tensor): Semantic tokens.
178
+ global_tokens (tensor): Global tokens.
179
+
180
+ Returns:
181
+ tensor: Reconstructed waveform.
182
+ """
183
+ z_q = self.quantizer.detokenize(semantic_tokens)
184
+ d_vector = self.speaker_encoder.detokenize(global_tokens)
185
+ x = self.prenet(z_q, d_vector)
186
+ x = x + d_vector.unsqueeze(-1)
187
+ wav_recon = self.decoder(x)
188
+
189
+ return wav_recon
190
+
191
+ def init_mel_transformer(self, config: Dict[str, Any]):
192
+ """
193
+ Initializes the MelSpectrogram transformer based on the provided configuration.
194
+
195
+ Args:
196
+ config (dict): Configuration parameters for MelSpectrogram.
197
+ """
198
+ import torchaudio.transforms as TT
199
+
200
+ self.mel_transformer = TT.MelSpectrogram(
201
+ config["sample_rate"],
202
+ config["n_fft"],
203
+ config["win_length"],
204
+ config["hop_length"],
205
+ config["mel_fmin"],
206
+ config["mel_fmax"],
207
+ n_mels=config["num_mels"],
208
+ power=1,
209
+ norm="slaney",
210
+ mel_scale="slaney",
211
+ )
212
+
213
+ def remove_weight_norm(self):
214
+ """Removes weight normalization from all layers."""
215
+ def _remove_weight_norm(m):
216
+ try:
217
+ torch.nn.utils.remove_weight_norm(m)
218
+ except ValueError:
219
+ pass # The module didn't have weight norm
220
+
221
+ self.apply(_remove_weight_norm)
222
+
223
+
224
+ # Test the model
225
+ if __name__ == "__main__":
226
+
227
+ config = load_config("pretrained_models/SparkTTS-0.5B/BiCodec/config.yaml")
228
+ model = BiCodec.load_from_checkpoint(
229
+ model_dir="pretrained_models/SparkTTS-0.5B/BiCodec",
230
+ )
231
+
232
+ # Generate random inputs for testing
233
+ duration = 0.96
234
+ x = torch.randn(20, 1, int(duration * 16000))
235
+ feat = torch.randn(20, int(duration * 50), 1024)
236
+ inputs = {"feat": feat, "wav": x, "ref_wav": x}
237
+
238
+ # Forward pass
239
+ outputs = model(inputs)
240
+ semantic_tokens, global_tokens = model.tokenize(inputs)
241
+ wav_recon = model.detokenize(semantic_tokens, global_tokens)
242
+
243
+ # Verify if the reconstruction matches
244
+ if torch.allclose(outputs["recons"].detach(), wav_recon):
245
+ print("Test successful")
246
+ else:
247
+ print("Test failed")
sparktts/modules/blocks/layers.py ADDED
@@ -0,0 +1,73 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Adapted from https://github.com/descriptinc/descript-audio-codec under the Apache License 2.0
17
+
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ from torch.nn.utils import weight_norm
22
+
23
+
24
+ def WNConv1d(*args, **kwargs):
25
+ return weight_norm(nn.Conv1d(*args, **kwargs))
26
+
27
+
28
+ def WNConvTranspose1d(*args, **kwargs):
29
+ return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
30
+
31
+
32
+ # Scripting this brings model speed up 1.4x
33
+ @torch.jit.script
34
+ def snake(x, alpha):
35
+ shape = x.shape
36
+ x = x.reshape(shape[0], shape[1], -1)
37
+ x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
38
+ x = x.reshape(shape)
39
+ return x
40
+
41
+
42
+ class Snake1d(nn.Module):
43
+ def __init__(self, channels):
44
+ super().__init__()
45
+ self.alpha = nn.Parameter(torch.ones(1, channels, 1))
46
+
47
+ def forward(self, x):
48
+ return snake(x, self.alpha)
49
+
50
+
51
+ class ResidualUnit(nn.Module):
52
+ def __init__(self, dim: int = 16, dilation: int = 1):
53
+ super().__init__()
54
+ pad = ((7 - 1) * dilation) // 2
55
+ self.block = nn.Sequential(
56
+ Snake1d(dim),
57
+ WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad),
58
+ Snake1d(dim),
59
+ WNConv1d(dim, dim, kernel_size=1),
60
+ )
61
+
62
+ def forward(self, x):
63
+ y = self.block(x)
64
+ pad = (x.shape[-1] - y.shape[-1]) // 2
65
+ if pad > 0:
66
+ x = x[..., pad:-pad]
67
+ return x + y
68
+
69
+
70
+ def init_weights(m):
71
+ if isinstance(m, nn.Conv1d):
72
+ nn.init.trunc_normal_(m.weight, std=0.02)
73
+ nn.init.constant_(m.bias, 0)
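
For reference, the `snake` activation above computes x + (1/alpha) * sin^2(alpha * x) with a learnable per-channel alpha, and `ResidualUnit` keeps the time dimension unchanged. A minimal shape-check sketch using only the classes defined in this file:

    import torch
    from sparktts.modules.blocks.layers import Snake1d, ResidualUnit

    x = torch.randn(2, 16, 100)                       # (batch, channels, time)
    print(Snake1d(16)(x).shape)                       # torch.Size([2, 16, 100]), shape preserved
    print(ResidualUnit(dim=16, dilation=3)(x).shape)  # torch.Size([2, 16, 100])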
sparktts/modules/blocks/samper.py ADDED
@@ -0,0 +1,115 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+
21
+
22
+ class SamplingBlock(nn.Module):
23
+ """Sampling block for upsampling or downsampling"""
24
+
25
+ def __init__(
26
+ self,
27
+ dim: int,
28
+ groups: int = 1,
29
+ upsample_scale: int = 1,
30
+ downsample_scale: int = 1,
31
+ ) -> None:
32
+ """
33
+ Args:
34
+ dim: input dimension
35
+ groups: number of groups
36
+ upsample_scale: upsampling scale
37
+ downsample_scale: downsampling scale
38
+ """
39
+ super(SamplingBlock, self).__init__()
40
+
41
+ self.upsample_scale = upsample_scale
42
+ self.downsample_scale = downsample_scale
43
+
44
+ if self.upsample_scale > 1:
45
+ self.de_conv_upsampler = nn.Sequential(
46
+ nn.LeakyReLU(0.2),
47
+ nn.ConvTranspose1d(
48
+ dim,
49
+ dim,
50
+ kernel_size=upsample_scale * 2,
51
+ stride=upsample_scale,
52
+ padding=upsample_scale // 2 + upsample_scale % 2,
53
+ output_padding=upsample_scale % 2,
54
+ groups=groups,
55
+ ),
56
+ )
57
+
58
+ if self.downsample_scale > 1:
59
+ self.conv_downsampler = nn.Sequential(
60
+ nn.LeakyReLU(0.2),
61
+ nn.Conv1d(
62
+ dim,
63
+ dim,
64
+ kernel_size=2 * downsample_scale,
65
+ stride=downsample_scale,
66
+ padding=downsample_scale // 2 + downsample_scale % 2,
67
+ groups=groups,
68
+ ),
69
+ )
70
+
71
+ @staticmethod
72
+ def repeat_upsampler(x, upsample_scale):
73
+ return x.repeat_interleave(upsample_scale, dim=2)
74
+
75
+ @staticmethod
76
+ def skip_downsampler(x, downsample_scale):
77
+ return F.avg_pool1d(x, kernel_size=downsample_scale, stride=downsample_scale)
78
+
79
+ def forward(self, x):
80
+ x = x.transpose(1, 2)
81
+ if self.upsample_scale > 1:
82
+ repeat_res = self.repeat_upsampler(x, self.upsample_scale)
83
+ deconv_res = self.de_conv_upsampler(x)
84
+ upmerge_res = repeat_res + deconv_res
85
+ else:
86
+ upmerge_res = x
87
+ repeat_res = x
88
+
89
+ if self.downsample_scale > 1:
90
+ conv_res = self.conv_downsampler(upmerge_res)
91
+ skip2_res = self.skip_downsampler(upmerge_res, self.downsample_scale)
92
+ skip1_res = self.skip_downsampler(repeat_res, self.downsample_scale)
93
+ else:
94
+ conv_res = upmerge_res
95
+ skip2_res = upmerge_res
96
+ skip1_res = repeat_res
97
+
98
+ final_res = conv_res + skip1_res + skip2_res
99
+
100
+ return final_res
101
+
102
+
103
+ # test
104
+ if __name__ == "__main__":
105
+ test_input = torch.randn(8, 1024, 50) # Batch size = 8, 1024 channels, length = 50
106
+ model = SamplingBlock(1024, 1024, upsample_scale=2)
107
+ model_down = SamplingBlock(1024, 1024, downsample_scale=2)
108
+ output = model(test_input)
109
+ output_down = model_down(test_input)
110
+ print("shape after upsample * 2", output.shape) # torch.Size([8, 1024, 100])
111
+ print("shape after downsample * 2", output_down.shape) # torch.Size([8, 1024, 25])
112
+ if output.shape == torch.Size([8, 1024, 100]) and output_down.shape == torch.Size(
113
+ [8, 1024, 25]
114
+ ):
115
+ print("test successful")
sparktts/modules/blocks/vocos.py ADDED
@@ -0,0 +1,373 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+ from typing import Tuple
21
+ from torch.nn.utils import weight_norm, remove_weight_norm
22
+
23
+ from typing import Optional
24
+
25
+
26
+ class ConvNeXtBlock(nn.Module):
27
+ """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
28
+
29
+ Args:
30
+ dim (int): Number of input channels.
31
+ intermediate_dim (int): Dimensionality of the intermediate layer.
32
+ layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
33
+ Defaults to None.
34
+ condition_dim (int, optional): Dimension of the condition embedding for AdaLayerNorm.
35
+ None means non-conditional LayerNorm. Defaults to None.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ dim: int,
41
+ intermediate_dim: int,
42
+ layer_scale_init_value: float,
43
+ condition_dim: Optional[int] = None,
44
+ ):
45
+ super().__init__()
46
+ self.dwconv = nn.Conv1d(
47
+ dim, dim, kernel_size=7, padding=3, groups=dim
48
+ ) # depthwise conv
49
+ self.adanorm = condition_dim is not None
50
+ if condition_dim:
51
+ self.norm = AdaLayerNorm(condition_dim, dim, eps=1e-6)
52
+ else:
53
+ self.norm = nn.LayerNorm(dim, eps=1e-6)
54
+ self.pwconv1 = nn.Linear(
55
+ dim, intermediate_dim
56
+ ) # pointwise/1x1 convs, implemented with linear layers
57
+ self.act = nn.GELU()
58
+ self.pwconv2 = nn.Linear(intermediate_dim, dim)
59
+ self.gamma = (
60
+ nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
61
+ if layer_scale_init_value > 0
62
+ else None
63
+ )
64
+
65
+ def forward(
66
+ self, x: torch.Tensor, cond_embedding_id: Optional[torch.Tensor] = None
67
+ ) -> torch.Tensor:
68
+ residual = x
69
+ x = self.dwconv(x)
70
+ x = x.transpose(1, 2) # (B, C, T) -> (B, T, C)
71
+ if self.adanorm:
72
+ assert cond_embedding_id is not None
73
+ x = self.norm(x, cond_embedding_id)
74
+ else:
75
+ x = self.norm(x)
76
+ x = self.pwconv1(x)
77
+ x = self.act(x)
78
+ x = self.pwconv2(x)
79
+ if self.gamma is not None:
80
+ x = self.gamma * x
81
+ x = x.transpose(1, 2) # (B, T, C) -> (B, C, T)
82
+
83
+ x = residual + x
84
+ return x
85
+
86
+
87
+ class AdaLayerNorm(nn.Module):
88
+ """
89
+ Adaptive Layer Normalization module conditioned on an external embedding via learned scale and shift.
90
+
91
+ Args:
92
+ condition_dim (int): Dimension of the condition.
93
+ embedding_dim (int): Dimension of the embeddings.
94
+ """
95
+
96
+ def __init__(self, condition_dim: int, embedding_dim: int, eps: float = 1e-6):
97
+ super().__init__()
98
+ self.eps = eps
99
+ self.dim = embedding_dim
100
+ self.scale = nn.Linear(condition_dim, embedding_dim)
101
+ self.shift = nn.Linear(condition_dim, embedding_dim)
102
+ torch.nn.init.ones_(self.scale.weight)
103
+ torch.nn.init.zeros_(self.shift.weight)
104
+
105
+ def forward(self, x: torch.Tensor, cond_embedding: torch.Tensor) -> torch.Tensor:
106
+ scale = self.scale(cond_embedding)
107
+ shift = self.shift(cond_embedding)
108
+ x = nn.functional.layer_norm(x, (self.dim,), eps=self.eps)
109
+ x = x * scale.unsqueeze(1) + shift.unsqueeze(1)
110
+ return x
111
+
112
+
113
+ class ResBlock1(nn.Module):
114
+ """
115
+ ResBlock adapted from HiFi-GAN V1 (https://github.com/jik876/hifi-gan) with dilated 1D convolutions,
116
+ but without upsampling layers.
117
+
118
+ Args:
119
+ dim (int): Number of input channels.
120
+ kernel_size (int, optional): Size of the convolutional kernel. Defaults to 3.
121
+ dilation (tuple[int], optional): Dilation factors for the dilated convolutions.
122
+ Defaults to (1, 3, 5).
123
+ lrelu_slope (float, optional): Negative slope of the LeakyReLU activation function.
124
+ Defaults to 0.1.
125
+ layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
126
+ Defaults to None.
127
+ """
128
+
129
+ def __init__(
130
+ self,
131
+ dim: int,
132
+ kernel_size: int = 3,
133
+ dilation: Tuple[int, int, int] = (1, 3, 5),
134
+ lrelu_slope: float = 0.1,
135
+ layer_scale_init_value: Optional[float] = None,
136
+ ):
137
+ super().__init__()
138
+ self.lrelu_slope = lrelu_slope
139
+ self.convs1 = nn.ModuleList(
140
+ [
141
+ weight_norm(
142
+ nn.Conv1d(
143
+ dim,
144
+ dim,
145
+ kernel_size,
146
+ 1,
147
+ dilation=dilation[0],
148
+ padding=self.get_padding(kernel_size, dilation[0]),
149
+ )
150
+ ),
151
+ weight_norm(
152
+ nn.Conv1d(
153
+ dim,
154
+ dim,
155
+ kernel_size,
156
+ 1,
157
+ dilation=dilation[1],
158
+ padding=self.get_padding(kernel_size, dilation[1]),
159
+ )
160
+ ),
161
+ weight_norm(
162
+ nn.Conv1d(
163
+ dim,
164
+ dim,
165
+ kernel_size,
166
+ 1,
167
+ dilation=dilation[2],
168
+ padding=self.get_padding(kernel_size, dilation[2]),
169
+ )
170
+ ),
171
+ ]
172
+ )
173
+
174
+ self.convs2 = nn.ModuleList(
175
+ [
176
+ weight_norm(
177
+ nn.Conv1d(
178
+ dim,
179
+ dim,
180
+ kernel_size,
181
+ 1,
182
+ dilation=1,
183
+ padding=self.get_padding(kernel_size, 1),
184
+ )
185
+ ),
186
+ weight_norm(
187
+ nn.Conv1d(
188
+ dim,
189
+ dim,
190
+ kernel_size,
191
+ 1,
192
+ dilation=1,
193
+ padding=self.get_padding(kernel_size, 1),
194
+ )
195
+ ),
196
+ weight_norm(
197
+ nn.Conv1d(
198
+ dim,
199
+ dim,
200
+ kernel_size,
201
+ 1,
202
+ dilation=1,
203
+ padding=self.get_padding(kernel_size, 1),
204
+ )
205
+ ),
206
+ ]
207
+ )
208
+
209
+ self.gamma = nn.ParameterList(
210
+ [
211
+ (
212
+ nn.Parameter(
213
+ layer_scale_init_value * torch.ones(dim, 1), requires_grad=True
214
+ )
215
+ if layer_scale_init_value is not None
216
+ else None
217
+ ),
218
+ (
219
+ nn.Parameter(
220
+ layer_scale_init_value * torch.ones(dim, 1), requires_grad=True
221
+ )
222
+ if layer_scale_init_value is not None
223
+ else None
224
+ ),
225
+ (
226
+ nn.Parameter(
227
+ layer_scale_init_value * torch.ones(dim, 1), requires_grad=True
228
+ )
229
+ if layer_scale_init_value is not None
230
+ else None
231
+ ),
232
+ ]
233
+ )
234
+
235
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
236
+ for c1, c2, gamma in zip(self.convs1, self.convs2, self.gamma):
237
+ xt = torch.nn.functional.leaky_relu(x, negative_slope=self.lrelu_slope)
238
+ xt = c1(xt)
239
+ xt = torch.nn.functional.leaky_relu(xt, negative_slope=self.lrelu_slope)
240
+ xt = c2(xt)
241
+ if gamma is not None:
242
+ xt = gamma * xt
243
+ x = xt + x
244
+ return x
245
+
246
+ def remove_weight_norm(self):
247
+ for l in self.convs1:
248
+ remove_weight_norm(l)
249
+ for l in self.convs2:
250
+ remove_weight_norm(l)
251
+
252
+ @staticmethod
253
+ def get_padding(kernel_size: int, dilation: int = 1) -> int:
254
+ return int((kernel_size * dilation - dilation) / 2)
255
+
256
+
257
+ class Backbone(nn.Module):
258
+ """Base class for the generator's backbone. It preserves the same temporal resolution across all layers."""
259
+
260
+ def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
261
+ """
262
+ Args:
263
+ x (Tensor): Input tensor of shape (B, C, L), where B is the batch size,
264
+ C denotes output features, and L is the sequence length.
265
+
266
+ Returns:
267
+ Tensor: Output of shape (B, L, H), where B is the batch size, L is the sequence length,
268
+ and H denotes the model dimension.
269
+ """
270
+ raise NotImplementedError("Subclasses must implement the forward method.")
271
+
272
+
273
+ class VocosBackbone(Backbone):
274
+ """
275
+ Vocos backbone module built with ConvNeXt blocks. Supports additional conditioning with Adaptive Layer Normalization
276
+
277
+ Args:
278
+ input_channels (int): Number of input features channels.
279
+ dim (int): Hidden dimension of the model.
280
+ intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock.
281
+ num_layers (int): Number of ConvNeXtBlock layers.
282
+ layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`.
283
+ condition_dim (int, optional): Dimension of the condition embedding for AdaLayerNorm.
284
+ None means non-conditional model. Defaults to None.
285
+ """
286
+
287
+ def __init__(
288
+ self,
289
+ input_channels: int,
290
+ dim: int,
291
+ intermediate_dim: int,
292
+ num_layers: int,
293
+ layer_scale_init_value: Optional[float] = None,
294
+ condition_dim: Optional[int] = None,
295
+ ):
296
+ super().__init__()
297
+ self.input_channels = input_channels
298
+ self.embed = nn.Conv1d(input_channels, dim, kernel_size=7, padding=3)
299
+ self.adanorm = condition_dim is not None
300
+ if condition_dim:
301
+ self.norm = AdaLayerNorm(condition_dim, dim, eps=1e-6)
302
+ else:
303
+ self.norm = nn.LayerNorm(dim, eps=1e-6)
304
+ layer_scale_init_value = layer_scale_init_value or 1 / num_layers
305
+ self.convnext = nn.ModuleList(
306
+ [
307
+ ConvNeXtBlock(
308
+ dim=dim,
309
+ intermediate_dim=intermediate_dim,
310
+ layer_scale_init_value=layer_scale_init_value,
311
+ condition_dim=condition_dim,
312
+ )
313
+ for _ in range(num_layers)
314
+ ]
315
+ )
316
+ self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6)
317
+ self.apply(self._init_weights)
318
+
319
+ def _init_weights(self, m):
320
+ if isinstance(m, (nn.Conv1d, nn.Linear)):
321
+ nn.init.trunc_normal_(m.weight, std=0.02)
322
+ nn.init.constant_(m.bias, 0)
323
+
324
+ def forward(self, x: torch.Tensor, condition: torch.Tensor = None) -> torch.Tensor:
325
+ x = self.embed(x)
326
+ if self.adanorm:
327
+ assert condition is not None
328
+ x = self.norm(x.transpose(1, 2), condition)
329
+ else:
330
+ x = self.norm(x.transpose(1, 2))
331
+ x = x.transpose(1, 2)
332
+ for conv_block in self.convnext:
333
+ x = conv_block(x, condition)
334
+ x = self.final_layer_norm(x.transpose(1, 2))
335
+ return x
336
+
337
+
338
+ class VocosResNetBackbone(Backbone):
339
+ """
340
+ Vocos backbone module built with ResBlocks.
341
+
342
+ Args:
343
+ input_channels (int): Number of input features channels.
344
+ dim (int): Hidden dimension of the model.
345
+ num_blocks (int): Number of ResBlock1 blocks.
346
+ layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to None.
347
+ """
348
+
349
+ def __init__(
350
+ self,
351
+ input_channels,
352
+ dim,
353
+ num_blocks,
354
+ layer_scale_init_value=None,
355
+ ):
356
+ super().__init__()
357
+ self.input_channels = input_channels
358
+ self.embed = weight_norm(
359
+ nn.Conv1d(input_channels, dim, kernel_size=3, padding=1)
360
+ )
361
+ layer_scale_init_value = layer_scale_init_value or 1 / num_blocks / 3
362
+ self.resnet = nn.Sequential(
363
+ *[
364
+ ResBlock1(dim=dim, layer_scale_init_value=layer_scale_init_value)
365
+ for _ in range(num_blocks)
366
+ ]
367
+ )
368
+
369
+ def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
370
+ x = self.embed(x)
371
+ x = self.resnet(x)
372
+ x = x.transpose(1, 2)
373
+ return x
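
A minimal shape sketch for `VocosBackbone` (the dimensions below are illustrative, not necessarily the released Spark-TTS configuration): it maps a channel-first feature map to a channel-last hidden sequence.

    import torch
    from sparktts.modules.blocks.vocos import VocosBackbone

    backbone = VocosBackbone(
        input_channels=1024, dim=384, intermediate_dim=2048, num_layers=12
    )
    feats = torch.randn(2, 1024, 50)   # (batch, input_channels, frames)
    print(backbone(feats).shape)       # torch.Size([2, 50, 384]) -> (batch, frames, dim)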
sparktts/modules/encoder_decoder/feat_decoder.py ADDED
@@ -0,0 +1,115 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+ from typing import List
21
+
22
+ from sparktts.modules.blocks.vocos import VocosBackbone
23
+ from sparktts.modules.blocks.samper import SamplingBlock
24
+
25
+
26
+ class Decoder(nn.Module):
27
+ """Decoder module with convnext and upsampling blocks
28
+
29
+ Args:
30
+ sample_ratios (List[int]): sample ratios
31
+ example: [2, 2] means upsample by 2x and then by another 2x (4x overall)
32
+ """
33
+
34
+ def __init__(
35
+ self,
36
+ input_channels: int,
37
+ vocos_dim: int,
38
+ vocos_intermediate_dim: int,
39
+ vocos_num_layers: int,
40
+ out_channels: int,
41
+ condition_dim: int = None,
42
+ sample_ratios: List[int] = [1, 1],
43
+ use_tanh_at_final: bool = False,
44
+ ):
45
+ super().__init__()
46
+
47
+ self.linear_pre = nn.Linear(input_channels, vocos_dim)
48
+ modules = [
49
+ nn.Sequential(
50
+ SamplingBlock(
51
+ dim=vocos_dim,
52
+ groups=vocos_dim,
53
+ upsample_scale=ratio,
54
+ ),
55
+ VocosBackbone(
56
+ input_channels=vocos_dim,
57
+ dim=vocos_dim,
58
+ intermediate_dim=vocos_intermediate_dim,
59
+ num_layers=2,
60
+ condition_dim=None,
61
+ ),
62
+ )
63
+ for ratio in sample_ratios
64
+ ]
65
+
66
+ self.downsample = nn.Sequential(*modules)
67
+
68
+ self.vocos_backbone = VocosBackbone(
69
+ input_channels=vocos_dim,
70
+ dim=vocos_dim,
71
+ intermediate_dim=vocos_intermediate_dim,
72
+ num_layers=vocos_num_layers,
73
+ condition_dim=condition_dim,
74
+ )
75
+ self.linear = nn.Linear(vocos_dim, out_channels)
76
+ self.use_tanh_at_final = use_tanh_at_final
77
+
78
+ def forward(self, x: torch.Tensor, c: torch.Tensor = None):
79
+ """encoder forward.
80
+
81
+ Args:
82
+ x (torch.Tensor): (batch_size, input_channels, length)
83
+
84
+ Returns:
85
+ x (torch.Tensor): (batch_size, out_channels, length)
86
+ """
87
+ x = self.linear_pre(x.transpose(1, 2))
88
+ x = self.downsample(x).transpose(1, 2)
89
+ x = self.vocos_backbone(x, condition=c)
90
+ x = self.linear(x).transpose(1, 2)
91
+ if self.use_tanh_at_final:
92
+ x = torch.tanh(x)
93
+
94
+ return x
95
+
96
+
97
+ # test
98
+ if __name__ == "__main__":
99
+ test_input = torch.randn(8, 1024, 50) # Batch size = 8, 1024 channels, length = 50
100
+ condition = torch.randn(8, 256)
101
+ decoder = Decoder(
102
+ input_channels=1024,
103
+ vocos_dim=384,
104
+ vocos_intermediate_dim=2048,
105
+ vocos_num_layers=12,
106
+ out_channels=256,
107
+ condition_dim=256,
108
+ sample_ratios=[2, 2],
109
+ )
110
+ output = decoder(test_input, condition)
111
+ print(output.shape) # torch.Size([8, 256, 200])
112
+ if output.shape == torch.Size([8, 256, 200]):
113
+ print("Decoder test passed")
114
+ else:
115
+ print("Decoder test failed")
sparktts/modules/encoder_decoder/feat_encoder.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+ from typing import List
21
+
22
+ from sparktts.modules.blocks.vocos import VocosBackbone
23
+ from sparktts.modules.blocks.samper import SamplingBlock
24
+
25
+
26
+ class Encoder(nn.Module):
27
+ """Encoder module with convnext and downsampling blocks"""
28
+
29
+ def __init__(
30
+ self,
31
+ input_channels: int,
32
+ vocos_dim: int,
33
+ vocos_intermediate_dim: int,
34
+ vocos_num_layers: int,
35
+ out_channels: int,
36
+ sample_ratios: List[int] = [1, 1],
37
+ ):
38
+ super().__init__()
39
+ """
40
+ Encoder module with VocosBackbone and sampling blocks.
41
+
42
+ Args:
43
+ sample_ratios (List[int]): sample ratios
44
+ example: [2, 2] means downsample by 2x and then upsample by 2x
45
+ """
46
+ self.encoder = VocosBackbone(
47
+ input_channels=input_channels,
48
+ dim=vocos_dim,
49
+ intermediate_dim=vocos_intermediate_dim,
50
+ num_layers=vocos_num_layers,
51
+ condition_dim=None,
52
+ )
53
+
54
+ modules = [
55
+ nn.Sequential(
56
+ SamplingBlock(
57
+ dim=vocos_dim,
58
+ groups=vocos_dim,
59
+ downsample_scale=ratio,
60
+ ),
61
+ VocosBackbone(
62
+ input_channels=vocos_dim,
63
+ dim=vocos_dim,
64
+ intermediate_dim=vocos_intermediate_dim,
65
+ num_layers=2,
66
+ condition_dim=None,
67
+ ),
68
+ )
69
+ for ratio in sample_ratios
70
+ ]
71
+
72
+ self.downsample = nn.Sequential(*modules)
73
+
74
+ self.project = nn.Linear(vocos_dim, out_channels)
75
+
76
+ def forward(self, x: torch.Tensor, *args):
77
+ """
78
+ Args:
79
+ x (torch.Tensor): (batch_size, input_channels, length)
80
+
81
+ Returns:
82
+ x (torch.Tensor): (batch_size, out_channels, downsampled_length)
83
+ """
84
+ x = self.encoder(x)
85
+ x = self.downsample(x)
86
+ x = self.project(x)
87
+ return x.transpose(1, 2)
88
+
89
+
90
+ # test
91
+ if __name__ == "__main__":
92
+ test_input = torch.randn(8, 1024, 50) # Batch size = 8, 1024 channels, length = 50
93
+ encoder = Encoder(
94
+ input_channels=1024,
95
+ vocos_dim=384,
96
+ vocos_intermediate_dim=2048,
97
+ vocos_num_layers=12,
98
+ out_channels=256,
99
+ sample_ratios=[2, 2],
100
+ )
101
+
102
+ output = encoder(test_input)
103
+ print(output.shape) # torch.Size([8, 256, 12])
104
+ if output.shape == torch.Size([8, 256, 12]):
105
+ print("test successful")
sparktts/modules/encoder_decoder/wave_generator.py ADDED
@@ -0,0 +1,88 @@
1
+ # Copyright (c) 2024 Xinsheng Wang ([email protected])
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/descriptinc/descript-audio-codec under the Apache License 2.0
16
+
17
+
18
+ import torch.nn as nn
19
+
20
+ from sparktts.modules.blocks.layers import (
21
+ Snake1d,
22
+ WNConv1d,
23
+ ResidualUnit,
24
+ WNConvTranspose1d,
25
+ init_weights,
26
+ )
27
+
28
+
29
+ class DecoderBlock(nn.Module):
30
+ def __init__(
31
+ self,
32
+ input_dim: int = 16,
33
+ output_dim: int = 8,
34
+ kernel_size: int = 2,
35
+ stride: int = 1,
36
+ ):
37
+ super().__init__()
38
+ self.block = nn.Sequential(
39
+ Snake1d(input_dim),
40
+ WNConvTranspose1d(
41
+ input_dim,
42
+ output_dim,
43
+ kernel_size=kernel_size,
44
+ stride=stride,
45
+ padding=(kernel_size - stride) // 2,
46
+ ),
47
+ ResidualUnit(output_dim, dilation=1),
48
+ ResidualUnit(output_dim, dilation=3),
49
+ ResidualUnit(output_dim, dilation=9),
50
+ )
51
+
52
+ def forward(self, x):
53
+ return self.block(x)
54
+
55
+
56
+ class WaveGenerator(nn.Module):
57
+ def __init__(
58
+ self,
59
+ input_channel,
60
+ channels,
61
+ rates,
62
+ kernel_sizes,
63
+ d_out: int = 1,
64
+ ):
65
+ super().__init__()
66
+
67
+ # Add first conv layer
68
+ layers = [WNConv1d(input_channel, channels, kernel_size=7, padding=3)]
69
+
70
+ # Add upsampling + MRF blocks
71
+ for i, (kernel_size, stride) in enumerate(zip(kernel_sizes, rates)):
72
+ input_dim = channels // 2**i
73
+ output_dim = channels // 2 ** (i + 1)
74
+ layers += [DecoderBlock(input_dim, output_dim, kernel_size, stride)]
75
+
76
+ # Add final conv layer
77
+ layers += [
78
+ Snake1d(output_dim),
79
+ WNConv1d(output_dim, d_out, kernel_size=7, padding=3),
80
+ nn.Tanh(),
81
+ ]
82
+
83
+ self.model = nn.Sequential(*layers)
84
+
85
+ self.apply(init_weights)
86
+
87
+ def forward(self, x):
88
+ return self.model(x)
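
A minimal shape sketch for `WaveGenerator` (the hyperparameters are illustrative, not the released model config): each `DecoderBlock` upsamples by its stride, so the overall upsampling factor is the product of `rates`.

    import torch
    from sparktts.modules.encoder_decoder.wave_generator import WaveGenerator

    gen = WaveGenerator(input_channel=1024, channels=512, rates=[4, 4, 2], kernel_sizes=[8, 8, 4])
    z = torch.randn(2, 1024, 50)       # (batch, input_channel, frames)
    print(gen(z).shape)                # torch.Size([2, 1, 1600]) -> 4 * 4 * 2 = 32x upsampling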
sparktts/modules/fsq/finite_scalar_quantization.py ADDED
@@ -0,0 +1,251 @@
1
+ """
2
+ Finite Scalar Quantization: VQ-VAE Made Simple - https://arxiv.org/abs/2309.15505
3
+ Code adapted from Jax version in Appendix A.1
4
+ """
5
+
6
+ from __future__ import annotations
7
+ from functools import wraps, partial
8
+ from contextlib import nullcontext
9
+ from typing import List, Tuple
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ from torch.nn import Module
14
+ from torch import Tensor, int32
15
+ from torch.amp import autocast
16
+
17
+ from einops import rearrange, pack, unpack
18
+
19
+ # helper functions
20
+
21
+
22
+ def exists(v):
23
+ return v is not None
24
+
25
+
26
+ def default(*args):
27
+ for arg in args:
28
+ if exists(arg):
29
+ return arg
30
+ return None
31
+
32
+
33
+ def maybe(fn):
34
+ @wraps(fn)
35
+ def inner(x, *args, **kwargs):
36
+ if not exists(x):
37
+ return x
38
+ return fn(x, *args, **kwargs)
39
+
40
+ return inner
41
+
42
+
43
+ def pack_one(t, pattern):
44
+ return pack([t], pattern)
45
+
46
+
47
+ def unpack_one(t, ps, pattern):
48
+ return unpack(t, ps, pattern)[0]
49
+
50
+
51
+ # tensor helpers
52
+
53
+
54
+ def round_ste(z: Tensor) -> Tensor:
55
+ """Round with straight through gradients."""
56
+ zhat = z.round()
57
+ return z + (zhat - z).detach()
58
+
59
+
60
+ # main class
61
+
62
+
63
+ class FSQ(Module):
64
+ def __init__(
65
+ self,
66
+ levels: List[int],
67
+ dim: int | None = None,
68
+ num_codebooks=1,
69
+ keep_num_codebooks_dim: bool | None = None,
70
+ scale: float | None = None,
71
+ allowed_dtypes: Tuple[torch.dtype, ...] = (torch.float32, torch.float64),
72
+ channel_first: bool = False,
73
+ projection_has_bias: bool = True,
74
+ return_indices=True,
75
+ force_quantization_f32=True,
76
+ ):
77
+ super().__init__()
78
+ _levels = torch.tensor(levels, dtype=int32)
79
+ self.register_buffer("_levels", _levels, persistent=False)
80
+
81
+ _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=int32)
82
+ self.register_buffer("_basis", _basis, persistent=False)
83
+
84
+ self.scale = scale
85
+
86
+ codebook_dim = len(levels)
87
+ self.codebook_dim = codebook_dim
88
+
89
+ effective_codebook_dim = codebook_dim * num_codebooks
90
+ self.num_codebooks = num_codebooks
91
+ self.effective_codebook_dim = effective_codebook_dim
92
+
93
+ keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1)
94
+ assert not (num_codebooks > 1 and not keep_num_codebooks_dim)
95
+ self.keep_num_codebooks_dim = keep_num_codebooks_dim
96
+
97
+ self.dim = default(dim, len(_levels) * num_codebooks)
98
+
99
+ self.channel_first = channel_first
100
+
101
+ has_projections = self.dim != effective_codebook_dim
102
+ self.project_in = (
103
+ nn.Linear(self.dim, effective_codebook_dim, bias=projection_has_bias)
104
+ if has_projections
105
+ else nn.Identity()
106
+ )
107
+ self.project_out = (
108
+ nn.Linear(effective_codebook_dim, self.dim, bias=projection_has_bias)
109
+ if has_projections
110
+ else nn.Identity()
111
+ )
112
+
113
+ self.has_projections = has_projections
114
+
115
+ self.return_indices = return_indices
116
+ if return_indices:
117
+ self.codebook_size = self._levels.prod().item()
118
+ implicit_codebook = self._indices_to_codes(torch.arange(self.codebook_size))
119
+ self.register_buffer(
120
+ "implicit_codebook", implicit_codebook, persistent=False
121
+ )
122
+
123
+ self.allowed_dtypes = allowed_dtypes
124
+ self.force_quantization_f32 = force_quantization_f32
125
+
126
+ def bound(self, z, eps: float = 1e-3):
127
+ """Bound `z`, an array of shape (..., d)."""
128
+ half_l = (self._levels - 1) * (1 + eps) / 2
129
+ offset = torch.where(self._levels % 2 == 0, 0.5, 0.0)
130
+ shift = (offset / half_l).atanh()
131
+ return (z + shift).tanh() * half_l - offset
132
+
133
+ def quantize(self, z):
134
+ """Quantizes z, returns quantized zhat, same shape as z."""
135
+ quantized = round_ste(self.bound(z))
136
+ half_width = self._levels // 2 # Renormalize to [-1, 1].
137
+ return quantized / half_width
138
+
139
+ def _scale_and_shift(self, zhat_normalized):
140
+ half_width = self._levels // 2
141
+ return (zhat_normalized * half_width) + half_width
142
+
143
+ def _scale_and_shift_inverse(self, zhat):
144
+ half_width = self._levels // 2
145
+ return (zhat - half_width) / half_width
146
+
147
+ def _indices_to_codes(self, indices):
148
+ level_indices = self.indices_to_level_indices(indices)
149
+ codes = self._scale_and_shift_inverse(level_indices)
150
+ return codes
151
+
152
+ def codes_to_indices(self, zhat):
153
+ """Converts a `code` to an index in the codebook."""
154
+ assert zhat.shape[-1] == self.codebook_dim
155
+ zhat = self._scale_and_shift(zhat)
156
+ return (zhat * self._basis).sum(dim=-1).to(int32)
157
+
158
+ def indices_to_level_indices(self, indices):
159
+ """Converts indices to indices at each level, perhaps needed for a transformer with factorized embeddings"""
160
+ indices = rearrange(indices, "... -> ... 1")
161
+ codes_non_centered = (indices // self._basis) % self._levels
162
+ return codes_non_centered
163
+
164
+ def indices_to_codes(self, indices):
165
+ """Inverse of `codes_to_indices`."""
166
+ assert exists(indices)
167
+
168
+ is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim))
169
+
170
+ codes = self._indices_to_codes(indices)
171
+
172
+ if self.keep_num_codebooks_dim:
173
+ codes = rearrange(codes, "... c d -> ... (c d)")
174
+
175
+ codes = self.project_out(codes)
176
+
177
+ if is_img_or_video or self.channel_first:
178
+ codes = rearrange(codes, "b ... d -> b d ...")
179
+
180
+ return codes
181
+
182
+ def forward(self, z):
183
+ """
184
+ einstein notation
185
+ b - batch
186
+ n - sequence (or flattened spatial dimensions)
187
+ d - feature dimension
188
+ c - number of codebook dim
189
+ """
190
+
191
+ is_img_or_video = z.ndim >= 4
192
+ need_move_channel_last = is_img_or_video or self.channel_first
193
+
194
+ # standardize image or video into (batch, seq, dimension)
195
+
196
+ if need_move_channel_last:
197
+ z = rearrange(z, "b d ... -> b ... d")
198
+ z, ps = pack_one(z, "b * d")
199
+
200
+ assert (
201
+ z.shape[-1] == self.dim
202
+ ), f"expected dimension of {self.dim} but found dimension of {z.shape[-1]}"
203
+
204
+ z = self.project_in(z)
205
+
206
+ z = rearrange(z, "b n (c d) -> b n c d", c=self.num_codebooks)
207
+
208
+ # whether to force quantization step to be full precision or not
209
+
210
+ force_f32 = self.force_quantization_f32
211
+ quantization_context = (
212
+ partial(autocast, "cuda", enabled=False) if force_f32 else nullcontext
213
+ )
214
+
215
+ with quantization_context():
216
+ orig_dtype = z.dtype
217
+
218
+ if force_f32 and orig_dtype not in self.allowed_dtypes:
219
+ z = z.float()
220
+
221
+ codes = self.quantize(z)
222
+
223
+ # returning indices could be optional
224
+
225
+ indices = None
226
+
227
+ if self.return_indices:
228
+ indices = self.codes_to_indices(codes)
229
+
230
+ codes = rearrange(codes, "b n c d -> b n (c d)")
231
+
232
+ codes = codes.type(orig_dtype)
233
+
234
+ # project out
235
+
236
+ out = self.project_out(codes)
237
+
238
+ # reconstitute image or video dimensions
239
+
240
+ if need_move_channel_last:
241
+ out = unpack_one(out, ps, "b * d")
242
+ out = rearrange(out, "b ... d -> b d ...")
243
+
244
+ indices = maybe(unpack_one)(indices, ps, "b * c")
245
+
246
+ if not self.keep_num_codebooks_dim and self.return_indices:
247
+ indices = maybe(rearrange)(indices, "... 1 -> ...")
248
+
249
+ # return quantized output and indices
250
+
251
+ return out, indices
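
A minimal sketch of the FSQ module above (levels [8, 5, 5, 5] follow the example in the FSQ paper; any list of levels works): the quantizer returns the straight-through quantized codes plus integer indices into an implicit codebook of prod(levels) entries.

    import torch
    from sparktts.modules.fsq.finite_scalar_quantization import FSQ

    fsq = FSQ(levels=[8, 5, 5, 5])         # implicit codebook of 8*5*5*5 = 1000 codes
    z = torch.randn(2, 16, 4)              # (batch, seq, dim); dim defaults to len(levels)
    quantized, indices = fsq(z)
    print(quantized.shape, indices.shape)  # torch.Size([2, 16, 4]) torch.Size([2, 16])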
sparktts/modules/fsq/residual_fsq.py ADDED
@@ -0,0 +1,355 @@
1
+ import random
+ from math import ceil  # needed by round_up_multiple below
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import torch.distributed as dist
5
+
6
+ from typing import List
7
+ from torch import nn
8
+ from torch.nn import Module
9
+ from torch.amp import autocast
10
+ from einx import get_at
11
+ from einops import rearrange, reduce, pack, unpack
12
+
13
+ from sparktts.modules.fsq.finite_scalar_quantization import FSQ
14
+
15
+
16
+ def exists(val):
17
+ return val is not None
18
+
19
+
20
+ def first(l):
21
+ return l[0]
22
+
23
+
24
+ def default(val, d):
25
+ return val if exists(val) else d
26
+
27
+
28
+ def round_up_multiple(num, mult):
29
+ return ceil(num / mult) * mult
30
+
31
+
32
+ # distributed helpers
33
+
34
+
35
+ def is_distributed():
36
+ return dist.is_initialized() and dist.get_world_size() > 1
37
+
38
+
39
+ def get_maybe_sync_seed(device, max_size=10_000):
40
+ rand_int = torch.randint(0, max_size, (), device=device)
41
+
42
+ if is_distributed():
43
+ dist.all_reduce(rand_int)
44
+
45
+ return rand_int.item()
46
+
47
+
48
+ class ResidualFSQ(Module):
49
+ """Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf"""
50
+
51
+ def __init__(
52
+ self,
53
+ *,
54
+ levels: List[int],
55
+ num_quantizers,
56
+ dim=None,
57
+ is_channel_first=False,
58
+ quantize_dropout=False,
59
+ quantize_dropout_cutoff_index=0,
60
+ quantize_dropout_multiple_of=1,
61
+ **kwargs,
62
+ ):
63
+ super().__init__()
64
+ codebook_dim = len(levels)
65
+ dim = default(dim, codebook_dim)
66
+
67
+ requires_projection = codebook_dim != dim
68
+ self.project_in = (
69
+ nn.Linear(dim, codebook_dim) if requires_projection else nn.Identity()
70
+ )
71
+ self.project_out = (
72
+ nn.Linear(codebook_dim, dim) if requires_projection else nn.Identity()
73
+ )
74
+ self.has_projections = requires_projection
75
+
76
+ self.is_channel_first = is_channel_first
77
+ self.num_quantizers = num_quantizers
78
+
79
+ self.levels = levels
80
+ self.layers = nn.ModuleList([])
81
+
82
+ levels_tensor = torch.Tensor(levels)
83
+
84
+ scales = []
85
+
86
+ for ind in range(num_quantizers):
87
+ scales.append((levels_tensor - 1) ** -ind)
88
+
89
+ fsq = FSQ(levels=levels, dim=codebook_dim, **kwargs)
90
+
91
+ self.layers.append(fsq)
92
+
93
+ assert all([not fsq.has_projections for fsq in self.layers])
94
+
95
+ self.codebook_size = self.layers[0].codebook_size
96
+
97
+ self.register_buffer("scales", torch.stack(scales), persistent=False)
98
+
99
+ self.quantize_dropout = quantize_dropout and num_quantizers > 1
100
+
101
+ assert quantize_dropout_cutoff_index >= 0
102
+
103
+ self.quantize_dropout_cutoff_index = quantize_dropout_cutoff_index
104
+ self.quantize_dropout_multiple_of = quantize_dropout_multiple_of # encodec paper proposes structured dropout, believe this was set to 4
105
+
106
+ @property
107
+ def codebooks(self):
108
+ codebooks = [layer.implicit_codebook for layer in self.layers]
109
+ codebooks = torch.stack(codebooks, dim=0)
110
+ return codebooks
111
+
112
+ def get_codes_from_indices(self, indices):
113
+
114
+ batch, quantize_dim = indices.shape[0], indices.shape[-1]
115
+
116
+ # may also receive indices in the shape of 'b h w q' (accept_image_fmap)
117
+
118
+ indices, ps = pack([indices], "b * q")
119
+
120
+ # because of quantize dropout, one can pass in indices that are coarse
121
+ # and the network should be able to reconstruct
122
+
123
+ if quantize_dim < self.num_quantizers:
124
+ assert (
125
+ self.quantize_dropout > 0.0
126
+ ), "quantize dropout must be greater than 0 if you wish to reconstruct from a signal with less fine quantizations"
127
+ indices = F.pad(indices, (0, self.num_quantizers - quantize_dim), value=-1)
128
+
129
+ # take care of quantizer dropout
130
+
131
+ mask = indices == -1
132
+ indices = indices.masked_fill(
133
+ mask, 0
134
+ ) # have it fetch a dummy code to be masked out later
135
+
136
+ all_codes = get_at("q [c] d, b n q -> q b n d", self.codebooks, indices)
137
+
138
+ # mask out any codes that were dropout-ed
139
+
140
+ all_codes = all_codes.masked_fill(rearrange(mask, "b n q -> q b n 1"), 0.0)
141
+
142
+ # scale the codes
143
+
144
+ scales = rearrange(self.scales, "q d -> q 1 1 d")
145
+ all_codes = all_codes * scales
146
+
147
+ # if (accept_image_fmap = True) then return shape (quantize, batch, height, width, dimension)
148
+
149
+ (all_codes,) = unpack(all_codes, ps, "q b * d")
150
+
151
+ return all_codes
152
+
153
+ def get_output_from_indices(self, indices):
154
+ codes = self.get_codes_from_indices(indices)
155
+ codes_summed = reduce(codes, "q ... -> ...", "sum")
156
+ return self.project_out(codes_summed)
157
+
158
+ def forward(self, x, return_all_codes=False, rand_quantize_dropout_fixed_seed=None):
159
+ num_quant, quant_dropout_multiple_of, device = (
160
+ self.num_quantizers,
161
+ self.quantize_dropout_multiple_of,
162
+ x.device,
163
+ )
164
+
165
+ # handle channel first
166
+
167
+ if self.is_channel_first:
168
+ x = rearrange(x, "b d ... -> b ... d")
169
+ x, ps = pack([x], "b * d")
170
+
171
+ # maybe project in
172
+
173
+ x = self.project_in(x)
174
+
175
+ quantized_out = 0.0
176
+ residual = x
177
+
178
+ all_indices = []
179
+
180
+ should_quantize_dropout = self.training and self.quantize_dropout
181
+
182
+ # sample a layer index at which to dropout further residual quantization
183
+ # also prepare null indices
184
+
185
+ if should_quantize_dropout:
186
+
187
+ # check if seed is manually passed in
188
+
189
+ if not exists(rand_quantize_dropout_fixed_seed):
190
+ rand_quantize_dropout_fixed_seed = get_maybe_sync_seed(device)
191
+
192
+ rand = random.Random(rand_quantize_dropout_fixed_seed)
193
+
194
+ rand_quantize_dropout_index = rand.randrange(
195
+ self.quantize_dropout_cutoff_index, num_quant
196
+ )
197
+
198
+ if quant_dropout_multiple_of != 1:
199
+ rand_quantize_dropout_index = (
200
+ round_up_multiple(
201
+ rand_quantize_dropout_index + 1, quant_dropout_multiple_of
202
+ )
203
+ - 1
204
+ )
205
+
206
+ null_indices = torch.full(
207
+ x.shape[:2], -1, device=device, dtype=torch.long
208
+ )
209
+
210
+ # go through the layers
211
+
212
+ with autocast("cuda", enabled=False):
213
+ for quantizer_index, (layer, scale) in enumerate(
214
+ zip(self.layers, self.scales)
215
+ ):
216
+
217
+ if (
218
+ should_quantize_dropout
219
+ and quantizer_index > rand_quantize_dropout_index
220
+ ):
221
+ all_indices.append(null_indices)
222
+ continue
223
+
224
+ quantized, indices = layer(residual / scale)
225
+
226
+ quantized = quantized * scale
227
+
228
+ residual = residual - quantized.detach()
229
+ quantized_out = quantized_out + quantized
230
+
231
+ all_indices.append(indices)
232
+
233
+ # project out, if needed
234
+
235
+ quantized_out = self.project_out(quantized_out)
236
+
237
+ # stack all indices
238
+
239
+ all_indices = torch.stack(all_indices, dim=-1)
240
+
241
+ # channel first out
242
+
243
+ if self.is_channel_first:
244
+ (quantized_out,) = unpack(quantized_out, ps, "b * d")
245
+ (all_indices,) = unpack(all_indices, ps, "b * d")
246
+
247
+ quantized_out = rearrange(quantized_out, "b ... d -> b d ...")
248
+ all_indices = rearrange(all_indices, "b ... d -> b d ...")
249
+
250
+ # return
251
+
252
+ ret = (quantized_out, all_indices)
253
+
254
+ if not return_all_codes:
255
+ return ret
256
+
257
+ # whether to return all codes from all codebooks across layers
258
+
259
+ all_codes = self.get_codes_from_indices(all_indices)
260
+
261
+ # will return all codes in shape (quantizer, batch, sequence length, codebook dimension)
262
+
263
+ return (*ret, all_codes)
264
+
265
+
266
+ # grouped residual fsq
267
+
268
+
269
+ class GroupedResidualFSQ(Module):
270
+ def __init__(self, *, dim, groups=1, accept_image_fmap=False, **kwargs):
271
+ super().__init__()
272
+ self.dim = dim
273
+ self.groups = groups
274
+ assert (dim % groups) == 0
275
+ dim_per_group = dim // groups
276
+
277
+ self.accept_image_fmap = accept_image_fmap
278
+
279
+ self.rvqs = nn.ModuleList([])
280
+
281
+ for _ in range(groups):
282
+ self.rvqs.append(ResidualFSQ(dim=dim_per_group, **kwargs))
283
+
284
+ self.codebook_size = self.rvqs[0].codebook_size
285
+
286
+ @property
287
+ def codebooks(self):
288
+ return torch.stack(tuple(rvq.codebooks for rvq in self.rvqs))
289
+
290
+ @property
291
+ def split_dim(self):
292
+ return 1 if self.accept_image_fmap else -1
293
+
294
+ def get_codes_from_indices(self, indices):
295
+ codes = tuple(
296
+ rvq.get_codes_from_indices(chunk_indices)
297
+ for rvq, chunk_indices in zip(self.rvqs, indices)
298
+ )
299
+ return torch.stack(codes)
300
+
301
+ def get_output_from_indices(self, indices):
302
+ outputs = tuple(
303
+ rvq.get_output_from_indices(chunk_indices)
304
+ for rvq, chunk_indices in zip(self.rvqs, indices)
305
+ )
306
+ return torch.cat(outputs, dim=self.split_dim)
307
+
308
+ def forward(self, x, return_all_codes=False):
309
+ shape, split_dim, device = x.shape, self.split_dim, x.device
310
+ assert shape[split_dim] == self.dim
311
+
312
+ # split the feature dimension into groups
313
+
314
+ x = x.chunk(self.groups, dim=split_dim)
315
+
316
+ forward_kwargs = dict(
317
+ return_all_codes=return_all_codes,
318
+ rand_quantize_dropout_fixed_seed=(
319
+ get_maybe_sync_seed(device) if self.training else None
320
+ ),
321
+ )
322
+
323
+ # invoke residual vq on each group
324
+
325
+ out = tuple(rvq(chunk, **forward_kwargs) for rvq, chunk in zip(self.rvqs, x))
326
+ out = tuple(zip(*out))
327
+
328
+ # get all the zipped outputs and combine them
329
+
330
+ quantized, all_indices, *maybe_all_codes = out
331
+
332
+ quantized = torch.cat(quantized, dim=split_dim)
333
+ all_indices = torch.stack(all_indices)
334
+
335
+ ret = (quantized, all_indices, *maybe_all_codes)
336
+ return ret
337
+
338
+
339
+ if __name__ == "__main__":
340
+ model = ResidualFSQ(
341
+ levels=[4, 4, 4, 4, 4, 4],
342
+ num_quantizers=1,
343
+ dim=30,
344
+ is_channel_first=True,
345
+ quantize_dropout=False,
346
+ )
347
+ x = torch.randn(2, 30, 10)
348
+ quantize, embed_ind = model(x)
349
+
350
+ emb_from_ind = model.get_output_from_indices(embed_ind.transpose(1, 2))
351
+
352
+ print(quantize == emb_from_ind.transpose(1, 2))
353
+
354
+ print("quantize shape", quantize.shape)
355
+ print("embed_ind", embed_ind)
sparktts/modules/speaker/ecapa_tdnn.py ADDED
@@ -0,0 +1,267 @@
1
+ # Copyright (c) 2021 Zhengyang Chen ([email protected])
2
+ # 2022 Hongji Wang ([email protected])
3
+ # 2023 Bing Han ([email protected])
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """ This implementation is adapted from github repo:
18
+ https://github.com/lawlict/ECAPA-TDNN.
19
+ """
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+ import torch.nn.functional as F
24
+
25
+ import sparktts.modules.speaker.pooling_layers as pooling_layers
26
+
27
+
28
+ class Res2Conv1dReluBn(nn.Module):
29
+ """
30
+ in_channels == out_channels == channels
31
+ """
32
+
33
+ def __init__(
34
+ self,
35
+ channels,
36
+ kernel_size=1,
37
+ stride=1,
38
+ padding=0,
39
+ dilation=1,
40
+ bias=True,
41
+ scale=4,
42
+ ):
43
+ super().__init__()
44
+ assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
45
+ self.scale = scale
46
+ self.width = channels // scale
47
+ self.nums = scale if scale == 1 else scale - 1
48
+
49
+ self.convs = []
50
+ self.bns = []
51
+ for i in range(self.nums):
52
+ self.convs.append(
53
+ nn.Conv1d(
54
+ self.width,
55
+ self.width,
56
+ kernel_size,
57
+ stride,
58
+ padding,
59
+ dilation,
60
+ bias=bias,
61
+ )
62
+ )
63
+ self.bns.append(nn.BatchNorm1d(self.width))
64
+ self.convs = nn.ModuleList(self.convs)
65
+ self.bns = nn.ModuleList(self.bns)
66
+
67
+ def forward(self, x):
68
+ out = []
69
+ spx = torch.split(x, self.width, 1)
70
+ sp = spx[0]
71
+ for i, (conv, bn) in enumerate(zip(self.convs, self.bns)):
72
+ # Order: conv -> relu -> bn
73
+ if i >= 1:
74
+ sp = sp + spx[i]
75
+ sp = conv(sp)
76
+ sp = bn(F.relu(sp))
77
+ out.append(sp)
78
+ if self.scale != 1:
79
+ out.append(spx[self.nums])
80
+ out = torch.cat(out, dim=1)
81
+
82
+ return out
83
+
84
+
85
+ """ Conv1d + BatchNorm1d + ReLU
86
+ """
87
+
88
+
89
+ class Conv1dReluBn(nn.Module):
90
+
91
+ def __init__(
92
+ self,
93
+ in_channels,
94
+ out_channels,
95
+ kernel_size=1,
96
+ stride=1,
97
+ padding=0,
98
+ dilation=1,
99
+ bias=True,
100
+ ):
101
+ super().__init__()
102
+ self.conv = nn.Conv1d(
103
+ in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias
104
+ )
105
+ self.bn = nn.BatchNorm1d(out_channels)
106
+
107
+ def forward(self, x):
108
+ return self.bn(F.relu(self.conv(x)))
109
+
110
+
111
+ """ The SE connection of 1D case.
112
+ """
113
+
114
+
115
+ class SE_Connect(nn.Module):
116
+
117
+ def __init__(self, channels, se_bottleneck_dim=128):
118
+ super().__init__()
119
+ self.linear1 = nn.Linear(channels, se_bottleneck_dim)
120
+ self.linear2 = nn.Linear(se_bottleneck_dim, channels)
121
+
122
+ def forward(self, x):
123
+ out = x.mean(dim=2)
124
+ out = F.relu(self.linear1(out))
125
+ out = torch.sigmoid(self.linear2(out))
126
+ out = x * out.unsqueeze(2)
127
+
128
+ return out
129
+
130
+
131
+ """ SE-Res2Block of the ECAPA-TDNN architecture.
132
+ """
133
+
134
+
135
+ class SE_Res2Block(nn.Module):
136
+
137
+ def __init__(self, channels, kernel_size, stride, padding, dilation, scale):
138
+ super().__init__()
139
+ self.se_res2block = nn.Sequential(
140
+ Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
141
+ Res2Conv1dReluBn(
142
+ channels, kernel_size, stride, padding, dilation, scale=scale
143
+ ),
144
+ Conv1dReluBn(channels, channels, kernel_size=1, stride=1, padding=0),
145
+ SE_Connect(channels),
146
+ )
147
+
148
+ def forward(self, x):
149
+ return x + self.se_res2block(x)
150
+
151
+
152
+ class ECAPA_TDNN(nn.Module):
153
+
154
+ def __init__(
155
+ self,
156
+ channels=512,
157
+ feat_dim=80,
158
+ embed_dim=192,
159
+ pooling_func="ASTP",
160
+ global_context_att=False,
161
+ emb_bn=False,
162
+ ):
163
+ super().__init__()
164
+
165
+ self.layer1 = Conv1dReluBn(feat_dim, channels, kernel_size=5, padding=2)
166
+ self.layer2 = SE_Res2Block(
167
+ channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8
168
+ )
169
+ self.layer3 = SE_Res2Block(
170
+ channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8
171
+ )
172
+ self.layer4 = SE_Res2Block(
173
+ channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8
174
+ )
175
+
176
+ cat_channels = channels * 3
177
+ out_channels = 512 * 3
178
+ self.conv = nn.Conv1d(cat_channels, out_channels, kernel_size=1)
179
+ self.pool = getattr(pooling_layers, pooling_func)(
180
+ in_dim=out_channels, global_context_att=global_context_att
181
+ )
182
+ self.pool_out_dim = self.pool.get_out_dim()
183
+ self.bn = nn.BatchNorm1d(self.pool_out_dim)
184
+ self.linear = nn.Linear(self.pool_out_dim, embed_dim)
185
+ self.emb_bn = emb_bn
186
+ if emb_bn: # better in SSL for SV
187
+ self.bn2 = nn.BatchNorm1d(embed_dim)
188
+ else:
189
+ self.bn2 = nn.Identity()
190
+
191
+ def forward(self, x, return_latent=False):
192
+ x = x.permute(0, 2, 1) # (B,T,F) -> (B,F,T)
193
+
194
+ out1 = self.layer1(x)
195
+ out2 = self.layer2(out1)
196
+ out3 = self.layer3(out2)
197
+ out4 = self.layer4(out3)
198
+
199
+ out = torch.cat([out2, out3, out4], dim=1)
200
+ latent = F.relu(self.conv(out))
201
+ out = self.bn(self.pool(latent))
202
+ out = self.linear(out)
203
+ if self.emb_bn:
204
+ out = self.bn2(out)
205
+
206
+ if return_latent:
207
+ return out, latent
208
+ return out
209
+
210
+
211
+ def ECAPA_TDNN_c1024(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
212
+ return ECAPA_TDNN(
213
+ channels=1024,
214
+ feat_dim=feat_dim,
215
+ embed_dim=embed_dim,
216
+ pooling_func=pooling_func,
217
+ emb_bn=emb_bn,
218
+ )
219
+
220
+
221
+ def ECAPA_TDNN_GLOB_c1024(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
222
+ return ECAPA_TDNN(
223
+ channels=1024,
224
+ feat_dim=feat_dim,
225
+ embed_dim=embed_dim,
226
+ pooling_func=pooling_func,
227
+ global_context_att=True,
228
+ emb_bn=emb_bn,
229
+ )
230
+
231
+
232
+ def ECAPA_TDNN_c512(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
233
+ return ECAPA_TDNN(
234
+ channels=512,
235
+ feat_dim=feat_dim,
236
+ embed_dim=embed_dim,
237
+ pooling_func=pooling_func,
238
+ emb_bn=emb_bn,
239
+ )
240
+
241
+
242
+ def ECAPA_TDNN_GLOB_c512(feat_dim, embed_dim, pooling_func="ASTP", emb_bn=False):
243
+ return ECAPA_TDNN(
244
+ channels=512,
245
+ feat_dim=feat_dim,
246
+ embed_dim=embed_dim,
247
+ pooling_func=pooling_func,
248
+ global_context_att=True,
249
+ emb_bn=emb_bn,
250
+ )
251
+
252
+
253
+ if __name__ == "__main__":
254
+ x = torch.zeros(1, 200, 100)
255
+ model = ECAPA_TDNN_GLOB_c512(feat_dim=100, embed_dim=256, pooling_func="ASTP")
256
+ model.eval()
257
+ out, latent = model(x, True)
258
+ print(out.shape)
259
+ print(latent.shape)
260
+
261
+ num_params = sum(param.numel() for param in model.parameters())
262
+ print("{} M".format(num_params / 1e6))
263
+
264
+ # from thop import profile
265
+ # x_np = torch.randn(1, 200, 80)
266
+ # flops, params = profile(model, inputs=(x_np, ))
267
+ # print("FLOPs: {} G, Params: {} M".format(flops / 1e9, params / 1e6))
sparktts/modules/speaker/perceiver_encoder.py ADDED
@@ -0,0 +1,360 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Adapted from https://github.com/lucidrains/naturalspeech2-pytorch/blob/659bec7f7543e7747e809e950cc2f84242fbeec7/naturalspeech2_pytorch/naturalspeech2_pytorch.py#L532
17
+
18
+ from collections import namedtuple
19
+ from functools import wraps
20
+
21
+ import torch
22
+ import torch.nn.functional as F
23
+ from einops import rearrange, repeat
24
+ from einops.layers.torch import Rearrange
25
+ from packaging import version
26
+ from torch import einsum, nn
27
+
28
+
29
+ def exists(val):
30
+ return val is not None
31
+
32
+
33
+ def once(fn):
34
+ called = False
35
+
36
+ @wraps(fn)
37
+ def inner(x):
38
+ nonlocal called
39
+ if called:
40
+ return
41
+ called = True
42
+ return fn(x)
43
+
44
+ return inner
45
+
46
+
47
+ print_once = once(print)
48
+
49
+ # main class
50
+
51
+
52
+ class Attend(nn.Module):
53
+ def __init__(self, dropout=0.0, causal=False, use_flash=False):
54
+ super().__init__()
55
+ self.dropout = dropout
56
+ self.attn_dropout = nn.Dropout(dropout)
57
+
58
+ self.causal = causal
59
+ self.register_buffer("mask", None, persistent=False)
60
+
61
+ self.use_flash = use_flash
62
+ assert not (
63
+ use_flash and version.parse(torch.__version__) < version.parse("2.0.0")
64
+ ), "in order to use flash attention, you must be using pytorch 2.0 or above"
65
+
66
+ # determine efficient attention configs for cuda and cpu
67
+ self.config = namedtuple(
68
+ "EfficientAttentionConfig",
69
+ ["enable_flash", "enable_math", "enable_mem_efficient"],
70
+ )
71
+ self.cpu_config = self.config(True, True, True)
72
+ self.cuda_config = None
73
+
74
+ if not torch.cuda.is_available() or not use_flash:
75
+ return
76
+
77
+ device_properties = torch.cuda.get_device_properties(torch.device("cuda"))
78
+
79
+ if device_properties.major == 8 and device_properties.minor == 0:
80
+ print_once(
81
+ "A100 GPU detected, using flash attention if input tensor is on cuda"
82
+ )
83
+ self.cuda_config = self.config(True, False, False)
84
+ else:
85
+ print_once(
86
+ "Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda"
87
+ )
88
+ self.cuda_config = self.config(False, True, True)
89
+
90
+ def get_mask(self, n, device):
91
+ if exists(self.mask) and self.mask.shape[-1] >= n:
92
+ return self.mask[:n, :n]
93
+
94
+ mask = torch.ones((n, n), device=device, dtype=torch.bool).triu(1)
95
+ self.register_buffer("mask", mask, persistent=False)
96
+ return mask
97
+
98
+ def flash_attn(self, q, k, v, mask=None):
99
+ _, heads, q_len, _, k_len, is_cuda = *q.shape, k.shape[-2], q.is_cuda
100
+
101
+ # Recommended for multi-query single-key-value attention by Tri Dao
102
+ # kv shape torch.Size([1, 512, 64]) -> torch.Size([1, 8, 512, 64])
103
+
104
+ if k.ndim == 3:
105
+ k = rearrange(k, "b ... -> b 1 ...").expand_as(q)
106
+
107
+ if v.ndim == 3:
108
+ v = rearrange(v, "b ... -> b 1 ...").expand_as(q)
109
+
110
+ # Check if mask exists and expand to compatible shape
111
+ # The mask is B L, so it would have to be expanded to B H N L
112
+
113
+ if exists(mask):
114
+ mask = rearrange(mask, "b j -> b 1 1 j")
115
+ mask = mask.expand(-1, heads, q_len, -1)
116
+
117
+ # Check if there is a compatible device for flash attention
118
+
119
+ config = self.cuda_config if is_cuda else self.cpu_config
120
+
121
+ # pytorch 2.0 flash attn: q, k, v, mask, dropout, causal, softmax_scale
122
+
123
+ with torch.backends.cuda.sdp_kernel(**config._asdict()):
124
+ out = F.scaled_dot_product_attention(
125
+ q,
126
+ k,
127
+ v,
128
+ attn_mask=mask,
129
+ dropout_p=self.dropout if self.training else 0.0,
130
+ is_causal=self.causal,
131
+ )
132
+
133
+ return out
134
+
135
+ def forward(self, q, k, v, mask=None):
136
+ """
137
+ einstein notation
138
+ b - batch
139
+ h - heads
140
+ n, i, j - sequence length (base sequence length, source, target)
141
+ d - feature dimension
142
+ """
143
+
144
+ n, device = q.shape[-2], q.device
145
+
146
+ scale = q.shape[-1] ** -0.5
147
+
148
+ if self.use_flash:
149
+ return self.flash_attn(q, k, v, mask=mask)
150
+
151
+ kv_einsum_eq = "b j d" if k.ndim == 3 else "b h j d"
152
+
153
+ # similarity
154
+
155
+ sim = einsum(f"b h i d, {kv_einsum_eq} -> b h i j", q, k) * scale
156
+
157
+ # key padding mask
158
+
159
+ if exists(mask):
160
+ mask = rearrange(mask, "b j -> b 1 1 j")
161
+ sim = sim.masked_fill(~mask, -torch.finfo(sim.dtype).max)
162
+
163
+ # causal mask
164
+
165
+ if self.causal:
166
+ causal_mask = self.get_mask(n, device)
167
+ sim = sim.masked_fill(causal_mask, -torch.finfo(sim.dtype).max)
168
+
169
+ # attention
170
+
171
+ attn = sim.softmax(dim=-1)
172
+ attn = self.attn_dropout(attn)
173
+
174
+ # aggregate values
175
+
176
+ out = einsum(f"b h i j, {kv_einsum_eq} -> b h i d", attn, v)
177
+
178
+ return out
179
+
180
+
181
+ def Sequential(*mods):
182
+ return nn.Sequential(*filter(exists, mods))
183
+
184
+
185
+ def exists(x):
186
+ return x is not None
187
+
188
+
189
+ def default(val, d):
190
+ if exists(val):
191
+ return val
192
+ return d() if callable(d) else d
193
+
194
+
195
+ class RMSNorm(nn.Module):
196
+ def __init__(self, dim, scale=True, dim_cond=None):
197
+ super().__init__()
198
+ self.cond = exists(dim_cond)
199
+ self.to_gamma_beta = nn.Linear(dim_cond, dim * 2) if self.cond else None
200
+
201
+ self.scale = dim**0.5
202
+ self.gamma = nn.Parameter(torch.ones(dim)) if scale else None
203
+
204
+ def forward(self, x, cond=None):
205
+ gamma = default(self.gamma, 1)
206
+ out = F.normalize(x, dim=-1) * self.scale * gamma
207
+
208
+ if not self.cond:
209
+ return out
210
+
211
+ assert exists(cond)
212
+ gamma, beta = self.to_gamma_beta(cond).chunk(2, dim=-1)
213
+ gamma, beta = map(lambda t: rearrange(t, "b d -> b 1 d"), (gamma, beta))
214
+ return out * gamma + beta
215
+
216
+
217
+ class CausalConv1d(nn.Conv1d):
218
+ def __init__(self, *args, **kwargs):
219
+ super().__init__(*args, **kwargs)
220
+ (kernel_size,) = self.kernel_size
221
+ (dilation,) = self.dilation
222
+ (stride,) = self.stride
223
+
224
+ assert stride == 1
225
+ self.causal_padding = dilation * (kernel_size - 1)
226
+
227
+ def forward(self, x):
228
+ causal_padded_x = F.pad(x, (self.causal_padding, 0), value=0.0)
229
+ return super().forward(causal_padded_x)
230
+
231
+
232
+ class GEGLU(nn.Module):
233
+ def forward(self, x):
234
+ x, gate = x.chunk(2, dim=-1)
235
+ return F.gelu(gate) * x
236
+
237
+
238
+ def FeedForward(dim, mult=4, causal_conv=False):
239
+ dim_inner = int(dim * mult * 2 / 3)
240
+
241
+ conv = None
242
+ if causal_conv:
243
+ conv = nn.Sequential(
244
+ Rearrange("b n d -> b d n"),
245
+ CausalConv1d(dim_inner, dim_inner, 3),
246
+ Rearrange("b d n -> b n d"),
247
+ )
248
+
249
+ return Sequential(
250
+ nn.Linear(dim, dim_inner * 2), GEGLU(), conv, nn.Linear(dim_inner, dim)
251
+ )
252
+
253
+
254
+ class Attention(nn.Module):
255
+ def __init__(
256
+ self,
257
+ dim,
258
+ *,
259
+ dim_context=None,
260
+ causal=False,
261
+ dim_head=64,
262
+ heads=8,
263
+ dropout=0.0,
264
+ use_flash=False,
265
+ cross_attn_include_queries=False,
266
+ ):
267
+ super().__init__()
268
+ self.scale = dim_head**-0.5
269
+ self.heads = heads
270
+ self.cross_attn_include_queries = cross_attn_include_queries
271
+
272
+ dim_inner = dim_head * heads
273
+ dim_context = default(dim_context, dim)
274
+
275
+ self.attend = Attend(causal=causal, dropout=dropout, use_flash=use_flash)
276
+ self.to_q = nn.Linear(dim, dim_inner, bias=False)
277
+ self.to_kv = nn.Linear(dim_context, dim_inner * 2, bias=False)
278
+ self.to_out = nn.Linear(dim_inner, dim, bias=False)
279
+
280
+ def forward(self, x, context=None, mask=None):
281
+ h, has_context = self.heads, exists(context)
282
+
283
+ context = default(context, x)
284
+
285
+ if has_context and self.cross_attn_include_queries:
286
+ context = torch.cat((x, context), dim=-2)
287
+
288
+ q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, dim=-1))
289
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
290
+
291
+ out = self.attend(q, k, v, mask=mask)
292
+
293
+ out = rearrange(out, "b h n d -> b n (h d)")
294
+ return self.to_out(out)
295
+
296
+
297
+ class PerceiverResampler(nn.Module):
298
+ def __init__(
299
+ self,
300
+ *,
301
+ dim,
302
+ depth=2,
303
+ dim_context=None,
304
+ num_latents=32,
305
+ dim_head=64,
306
+ heads=8,
307
+ ff_mult=4,
308
+ use_flash_attn=False,
309
+ ):
310
+ super().__init__()
311
+ dim_context = default(dim_context, dim)
312
+
313
+ self.proj_context = (
314
+ nn.Linear(dim_context, dim) if dim_context != dim else nn.Identity()
315
+ )
316
+
317
+ self.latents = nn.Parameter(torch.randn(num_latents, dim))
318
+ nn.init.normal_(self.latents, std=0.02)
319
+
320
+ self.layers = nn.ModuleList([])
321
+ for _ in range(depth):
322
+ self.layers.append(
323
+ nn.ModuleList(
324
+ [
325
+ Attention(
326
+ dim=dim,
327
+ dim_head=dim_head,
328
+ heads=heads,
329
+ use_flash=use_flash_attn,
330
+ cross_attn_include_queries=True,
331
+ ),
332
+ FeedForward(dim=dim, mult=ff_mult),
333
+ ]
334
+ )
335
+ )
336
+
337
+ self.norm = RMSNorm(dim)
338
+
339
+ def forward(self, x, mask=None):
340
+ batch = x.shape[0]
341
+
342
+ x = self.proj_context(x)
343
+
344
+ latents = repeat(self.latents, "n d -> b n d", b=batch)
345
+
346
+ for attn, ff in self.layers:
347
+ latents = attn(latents, x, mask=mask) + latents
348
+ latents = ff(latents) + latents
349
+
350
+ return self.norm(latents)
351
+
352
+
353
+ if __name__ == "__main__":
354
+ model = PerceiverResampler(dim=256, dim_context=80)
355
+ x = torch.randn(8, 200, 80)
356
+ out = model(x)
357
+ print(out.shape) # [8, 32, 256]
358
+
359
+ num_params = sum(param.numel() for param in model.parameters())
360
+ print("{} M".format(num_params / 1e6))
sparktts/modules/speaker/pooling_layers.py ADDED
@@ -0,0 +1,298 @@
1
+ # Copyright (c) 2021 Shuai Wang ([email protected])
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Pooling functions to aggregate frame-level deep features
16
+ into segment-level speaker embeddings
17
+
18
+ High-order statistics are surprisingly effective: on VoxCeleb, TSDP performs similarly to TSTP
19
+ even though the mean statistic is removed.
20
+ """
21
+
22
+ import torch
23
+ import torch.nn as nn
24
+ import torch.nn.functional as F
25
+
26
+
27
+ class TAP(nn.Module):
28
+ """
29
+ Temporal average pooling, only first-order mean is considered
30
+ """
31
+
32
+ def __init__(self, in_dim=0, **kwargs):
33
+ super(TAP, self).__init__()
34
+ self.in_dim = in_dim
35
+
36
+ def forward(self, x):
37
+ pooling_mean = x.mean(dim=-1)
38
+ # To be compatible with 2D input
39
+ pooling_mean = pooling_mean.flatten(start_dim=1)
40
+ return pooling_mean
41
+
42
+ def get_out_dim(self):
43
+ self.out_dim = self.in_dim
44
+ return self.out_dim
45
+
46
+
47
+ class TSDP(nn.Module):
48
+ """
49
+ Temporal standard deviation pooling, only second-order std is considered
50
+ """
51
+
52
+ def __init__(self, in_dim=0, **kwargs):
53
+ super(TSDP, self).__init__()
54
+ self.in_dim = in_dim
55
+
56
+ def forward(self, x):
57
+ # The last dimension is the temporal axis
58
+ pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-7)
59
+ pooling_std = pooling_std.flatten(start_dim=1)
60
+ return pooling_std
61
+
62
+ def get_out_dim(self):
63
+ self.out_dim = self.in_dim
64
+ return self.out_dim
65
+
66
+
67
+ class TSTP(nn.Module):
68
+ """
69
+ Temporal statistics pooling, concatenate mean and std, which is used in
70
+ x-vector
71
+ Comment: simple concatenation can not make full use of both statistics
72
+ """
73
+
74
+ def __init__(self, in_dim=0, **kwargs):
75
+ super(TSTP, self).__init__()
76
+ self.in_dim = in_dim
77
+
78
+ def forward(self, x):
79
+ # The last dimension is the temporal axis
80
+ pooling_mean = x.mean(dim=-1)
81
+ pooling_std = torch.sqrt(torch.var(x, dim=-1) + 1e-7)
82
+ pooling_mean = pooling_mean.flatten(start_dim=1)
83
+ pooling_std = pooling_std.flatten(start_dim=1)
84
+ stats = torch.cat((pooling_mean, pooling_std), 1)
85
+ return stats
86
+
87
+ def get_out_dim(self):
88
+ self.out_dim = self.in_dim * 2
89
+ return self.out_dim
90
+
91
+
92
+ class ASTP(nn.Module):
93
+ """ Attentive statistics pooling: Channel- and context-dependent
94
+ statistics pooling, first used in ECAPA_TDNN.
95
+ """
96
+
97
+ def __init__(self,
98
+ in_dim,
99
+ bottleneck_dim=128,
100
+ global_context_att=False,
101
+ **kwargs):
102
+ super(ASTP, self).__init__()
103
+ self.in_dim = in_dim
104
+ self.global_context_att = global_context_att
105
+
106
+ # Use Conv1d with stride == 1 rather than Linear, then we don't
107
+ # need to transpose inputs.
108
+ if global_context_att:
109
+ self.linear1 = nn.Conv1d(
110
+ in_dim * 3, bottleneck_dim,
111
+ kernel_size=1) # equals W and b in the paper
112
+ else:
113
+ self.linear1 = nn.Conv1d(
114
+ in_dim, bottleneck_dim,
115
+ kernel_size=1) # equals W and b in the paper
116
+ self.linear2 = nn.Conv1d(bottleneck_dim, in_dim,
117
+ kernel_size=1) # equals V and k in the paper
118
+
119
+ def forward(self, x):
120
+ """
121
+ x: a 3-dimensional tensor in tdnn-based architecture (B,F,T)
122
+ or a 4-dimensional tensor in resnet architecture (B,C,F,T)
123
+ 0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
124
+ """
125
+ if len(x.shape) == 4:
126
+ x = x.reshape(x.shape[0], x.shape[1] * x.shape[2], x.shape[3])
127
+ assert len(x.shape) == 3
128
+
129
+ if self.global_context_att:
130
+ context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
131
+ context_std = torch.sqrt(
132
+ torch.var(x, dim=-1, keepdim=True) + 1e-7).expand_as(x)
133
+ x_in = torch.cat((x, context_mean, context_std), dim=1)
134
+ else:
135
+ x_in = x
136
+
137
+ # DON'T use ReLU here! Training may fail to converge with ReLU.
138
+ alpha = torch.tanh(
139
+ self.linear1(x_in)) # alpha = F.relu(self.linear1(x_in))
140
+ alpha = torch.softmax(self.linear2(alpha), dim=2)
141
+ mean = torch.sum(alpha * x, dim=2)
142
+ var = torch.sum(alpha * (x**2), dim=2) - mean**2
143
+ std = torch.sqrt(var.clamp(min=1e-7))
144
+ return torch.cat([mean, std], dim=1)
145
+
146
+ def get_out_dim(self):
147
+ self.out_dim = 2 * self.in_dim
148
+ return self.out_dim
149
+
150
+
151
+ class MHASTP(torch.nn.Module):
152
+ """ Multi head attentive statistics pooling
153
+ Reference:
154
+ Self Multi-Head Attention for Speaker Recognition
155
+ https://arxiv.org/pdf/1906.09890.pdf
156
+ """
157
+
158
+ def __init__(self,
159
+ in_dim,
160
+ layer_num=2,
161
+ head_num=2,
162
+ d_s=1,
163
+ bottleneck_dim=64,
164
+ **kwargs):
165
+ super(MHASTP, self).__init__()
166
+ assert (in_dim % head_num
167
+ ) == 0 # make sure in_dim is divisible by head_num
168
+ self.in_dim = in_dim
169
+ self.head_num = head_num
170
+ d_model = int(in_dim / head_num)
171
+ channel_dims = [bottleneck_dim for i in range(layer_num + 1)]
172
+ if d_s > 1:
173
+ d_s = d_model
174
+ else:
175
+ d_s = 1
176
+ self.d_s = d_s
177
+ channel_dims[0], channel_dims[-1] = d_model, d_s
178
+ heads_att_trans = []
179
+ for i in range(self.head_num):
180
+ att_trans = nn.Sequential()
181
+ for i in range(layer_num - 1):
182
+ att_trans.add_module(
183
+ 'att_' + str(i),
184
+ nn.Conv1d(channel_dims[i], channel_dims[i + 1], 1, 1))
185
+ att_trans.add_module('tanh' + str(i), nn.Tanh())
186
+ att_trans.add_module(
187
+ 'att_' + str(layer_num - 1),
188
+ nn.Conv1d(channel_dims[layer_num - 1], channel_dims[layer_num],
189
+ 1, 1))
190
+ heads_att_trans.append(att_trans)
191
+ self.heads_att_trans = nn.ModuleList(heads_att_trans)
192
+
193
+ def forward(self, input):
194
+ """
195
+ input: a 3-dimensional tensor in xvector architecture
196
+ or a 4-dimensional tensor in resnet architecture
197
+ 0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
198
+ """
199
+ if len(input.shape) == 4: # B x F x T
200
+ input = input.reshape(input.shape[0],
201
+ input.shape[1] * input.shape[2],
202
+ input.shape[3])
203
+ assert len(input.shape) == 3
204
+ bs, f_dim, t_dim = input.shape
205
+ chunks = torch.chunk(input, self.head_num, 1)
206
+ # split
207
+ chunks_out = []
208
+ # for i in range(self.head_num):
209
+ # att_score = self.heads_att_trans[i](chunks[i])
210
+ for i, layer in enumerate(self.heads_att_trans):
211
+ att_score = layer(chunks[i])
212
+ alpha = F.softmax(att_score, dim=-1)
213
+ mean = torch.sum(alpha * chunks[i], dim=2)
214
+ var = torch.sum(alpha * chunks[i]**2, dim=2) - mean**2
215
+ std = torch.sqrt(var.clamp(min=1e-7))
216
+ chunks_out.append(torch.cat((mean, std), dim=1))
217
+ out = torch.cat(chunks_out, dim=1)
218
+ return out
219
+
220
+ def get_out_dim(self):
221
+ self.out_dim = 2 * self.in_dim
222
+ return self.out_dim
223
+
224
+
225
+ class MQMHASTP(torch.nn.Module):
226
+ """ An attentive pooling
227
+ Reference:
228
+ multi query multi head attentive statistics pooling
229
+ https://arxiv.org/pdf/2110.05042.pdf
230
+ Args:
231
+ in_dim: the feature dimension of input
232
+ layer_num: the number of layer in the pooling layer
233
+ query_num: the number of querys
234
+ head_num: the number of heads
235
+ bottleneck_dim: the bottleneck dimension
236
+
237
+ SA (H = 1, Q = 1, n = 2, d_s = 1) ref:
238
+ https://www.danielpovey.com/files/2018_interspeech_xvector_attention.pdf
239
+ MHA (H > 1, Q = 1, n = 1, d_s = 1) ref:
240
+ https://arxiv.org/pdf/1906.09890.pdf
241
+ AS (H = 1, Q > 1, n = 2, d_s = 1) ref:
242
+ https://arxiv.org/pdf/1803.10963.pdf
243
+ VSA (H = 1, Q > 1, n = 2, d_s = d_h) ref:
244
+ http://www.interspeech2020.org/uploadfile/pdf/Mon-2-10-5.pdf
245
+ """
246
+
247
+ def __init__(self,
248
+ in_dim,
249
+ layer_num=2,
250
+ query_num=2,
251
+ head_num=8,
252
+ d_s=2,
253
+ bottleneck_dim=64,
254
+ **kwargs):
255
+ super(MQMHASTP, self).__init__()
256
+ self.n_query = nn.ModuleList([
257
+ MHASTP(in_dim,
258
+ layer_num=layer_num,
259
+ head_num=head_num,
260
+ d_s=d_s,
261
+ bottleneck_dim=bottleneck_dim) for i in range(query_num)
262
+ ])
263
+ self.query_num = query_num
264
+ self.in_dim = in_dim
265
+
266
+ def forward(self, input):
267
+ """
268
+ input: a 3-dimensional tensor in xvector architecture
269
+ or a 4-dimensional tensor in resnet architecture
270
+ 0-dim: batch-dimension, last-dim: time-dimension (frame-dimension)
271
+ """
272
+ if len(input.shape) == 4: # B x F x T
273
+ input = input.reshape(input.shape[0],
274
+ input.shape[1] * input.shape[2],
275
+ input.shape[3])
276
+ assert len(input.shape) == 3
277
+ res = []
278
+ for i, layer in enumerate(self.n_query):
279
+ res.append(layer(input))
280
+ out = torch.cat(res, dim=-1)
281
+ return out
282
+
283
+ def get_out_dim(self):
284
+ self.out_dim = self.in_dim * 2 * self.query_num
285
+ return self.out_dim
286
+
287
+
288
+ if __name__ == '__main__':
289
+ data = torch.randn(16, 512, 10, 35)
290
+ # model = StatisticsPooling()
291
+ model = MQMHASTP(512 * 10)
292
+ model = MHASTP(512 * 10)
293
+ model = MQMHASTP(512 * 10, context=False)
294
+ print(model)
295
+
296
+ out = model(data)
297
+ print(out.shape)
298
+ print(model.get_out_dim())
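The __main__ check above only covers the multi-head variants; ASTP with global context, which is the pooling layer ECAPA_TDNN (in ecapa_tdnn.py above) actually instantiates via getattr, can be sketched as follows. The input shape (4, 1536, 200) is an illustrative assumption matching the 512 * 3 channel latent of that model.
import torch
import sparktts.modules.speaker.pooling_layers as pooling_layers

feats = torch.randn(4, 1536, 200)   # (B, F, T) frame-level features, e.g. the ECAPA latent
pool = getattr(pooling_layers, "ASTP")(in_dim=1536, global_context_att=True)
stats = pool(feats)                 # attentive mean and std, concatenated along the feature dim
print(stats.shape)                  # (4, 3072)
print(pool.get_out_dim())           # 2 * in_dim = 3072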
sparktts/modules/speaker/speaker_encoder.py ADDED
@@ -0,0 +1,136 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+
19
+ from typing import List, Tuple
20
+ from sparktts.modules.fsq.residual_fsq import ResidualFSQ
21
+ from sparktts.modules.speaker.ecapa_tdnn import ECAPA_TDNN_GLOB_c512
22
+ from sparktts.modules.speaker.perceiver_encoder import PerceiverResampler
23
+
24
+ """
25
+ x-vector + d-vector
26
+ """
27
+
28
+
29
+ class SpeakerEncoder(nn.Module):
30
+ """
31
+
32
+ Args:
33
+ input_dim (int): acoustic feature dimension
34
+ out_dim (int): output dimension of x-vector and d-vector
35
+ latent_dim (int): latent dimension before quantization
36
+ token_num (int): sequence length of speaker tokens
37
+ fsq_levels (List[int]): number of levels for each quantizer
38
+ fsq_num_quantizers (int): number of quantizers
39
+
40
+ Return:
41
+ speaker_embs: (B, T2, out_dim)
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ input_dim: int = 100,
47
+ out_dim: int = 512,
48
+ latent_dim: int = 128,
49
+ token_num: int = 32,
50
+ fsq_levels: List[int] = [4, 4, 4, 4, 4, 4],
51
+ fsq_num_quantizers: int = 1,
52
+ ):
53
+ super(SpeakerEncoder, self).__init__()
54
+
55
+ self.speaker_encoder = ECAPA_TDNN_GLOB_c512(
56
+ feat_dim=input_dim, embed_dim=out_dim
57
+ )
58
+ self.perceiver_sampler = PerceiverResampler(
59
+ dim=latent_dim, dim_context=512 * 3, num_latents=token_num
60
+ )
61
+ self.quantizer = ResidualFSQ(
62
+ levels=fsq_levels,
63
+ num_quantizers=fsq_num_quantizers,
64
+ dim=latent_dim,
65
+ is_channel_first=True,
66
+ quantize_dropout=False,
67
+ )
68
+
69
+ self.project = nn.Linear(latent_dim * token_num, out_dim)
70
+
71
+ def get_codes_from_indices(self, indices: torch.Tensor) -> torch.Tensor:
72
+ zq = self.quantizer.get_codes_from_indices(indices.transpose(1, 2))
73
+ return zq.transpose(1, 2)
74
+
75
+ def get_indices(self, mels: torch.Tensor) -> torch.Tensor:
76
+ mels = mels.transpose(1, 2)
77
+ x = self.perceiver_sampler(mels).transpose(1, 2)
78
+ zq, indices = self.quantizer(x)
79
+ return indices
80
+
81
+ def forward(self, mels: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
82
+ """
83
+ Args:
84
+ mels: (B, D_mel, T1)
85
+
86
+ Return:
87
+ x_vector: (B, out_dim)
88
+ d_vector: (B, out_dim)
89
+ """
90
+ # mels = mels.transpose(1,2)
91
+
92
+ x_vector, features = self.speaker_encoder(mels, True)
93
+ x = self.perceiver_sampler(features.transpose(1, 2)).transpose(1, 2)
94
+ zq, indices = self.quantizer(x) # zq: (B, latent_dim, T2)
95
+ x = zq.reshape(zq.shape[0], -1)
96
+ d_vector = self.project(x)
97
+
98
+ return x_vector, d_vector
99
+
100
+ def tokenize(self, mels: torch.Tensor) -> torch.Tensor:
101
+ """tokenize the input mel spectrogram"""
102
+ _, features = self.speaker_encoder(mels, True)
103
+ x = self.perceiver_sampler(features.transpose(1, 2)).transpose(1, 2)
104
+ zq, indices = self.quantizer(x)
105
+ return indices
106
+
107
+ def detokenize(self, indices: torch.Tensor) -> torch.Tensor:
108
+ """detokenize the input indices to d-vector"""
109
+ zq = self.quantizer.get_output_from_indices(indices.transpose(1, 2)).transpose(1, 2)
110
+ x = zq.reshape(zq.shape[0], -1)
111
+ d_vector = self.project(x)
112
+ return d_vector
113
+
114
+ if __name__ == "__main__":
115
+ model = SpeakerEncoder(
116
+ input_dim=100,
117
+ latent_dim=128,
118
+ token_num=32,
119
+ fsq_levels=[4, 4, 4, 4, 4, 4],
120
+ fsq_num_quantizers=1,
121
+ )
122
+ mel = torch.randn(8, 200, 100)
123
+ x_vector, d_vector = model(mel)
124
+ print("x-vector shape", x_vector.shape)
125
+ print("d-vector shape", d_vector.shape)
126
+
127
+ indices = model.tokenize(mel)
128
+ print("indices shape", indices.shape)
129
+ d_vector_post = model.detokenize(indices)
130
+ print("d-vector shape", d_vector_post.shape)
131
+ if torch.allclose(d_vector_post, d_vector):
132
+ print("d-vector post and d-vector are the same")
133
+ else:
134
+ print("d-vector post and d-vector are different")
135
+ num_params = sum(param.numel() for param in model.parameters())
136
+ print("{} M".format(num_params / 1e6))
sparktts/modules/vq/factorized_vector_quantize.py ADDED
@@ -0,0 +1,187 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Heavily based on https://github.com/lucidrains/vector-quantize-pytorch
17
+
18
+
19
+ from typing import Any, Dict
20
+
21
+ import torch
22
+ import torch.nn as nn
23
+ import torch.nn.functional as F
24
+ from einops import rearrange
25
+ from torch.nn.utils import weight_norm
26
+
27
+
28
+ def WNConv1d(*args, **kwargs):
29
+ return weight_norm(nn.Conv1d(*args, **kwargs))
30
+
31
+
32
+ def ema_inplace(moving_avg, new, decay):
33
+ moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
34
+
35
+
36
+ class FactorizedVectorQuantize(nn.Module):
37
+ def __init__(
38
+ self,
39
+ input_dim: int,
40
+ codebook_size: int,
41
+ codebook_dim: int,
42
+ commitment: float,
43
+ codebook_loss_weight: float = 1.0,
44
+ decay: float = 0.99,
45
+ threshold_ema_dead_code: float = 2,
46
+ momentum: float = 0.99,
47
+ **kwargs,
48
+ ):
49
+ super().__init__()
50
+ self.input_dim = input_dim
51
+ self.codebook_size = codebook_size
52
+ self.codebook_dim = codebook_dim
53
+ self.commitment = commitment
54
+ self.codebook_loss_weight = codebook_loss_weight
55
+ self.decay = decay
56
+ self.threshold_ema_dead_code = threshold_ema_dead_code
57
+ self.momentum = momentum
58
+
59
+ if input_dim != self.codebook_dim:
60
+ self.in_project = WNConv1d(input_dim, self.codebook_dim, kernel_size=1)
61
+ self.out_project = WNConv1d(self.codebook_dim, input_dim, kernel_size=1)
62
+
63
+ else:
64
+ self.in_project = nn.Identity()
65
+ self.out_project = nn.Identity()
66
+
67
+ self.codebook = nn.Embedding(self.codebook_size, self.codebook_dim)
68
+ self.register_buffer("cluster_size", torch.zeros(self.codebook_size))
69
+
70
+ def forward(self, z: torch.Tensor) -> Dict[str, Any]:
71
+ """Quantized the input tensor using a fixed codebook and returns
72
+ the corresponding codebook vectors
73
+
74
+ Parameters
75
+ ----------
76
+ z : Tensor[B x D x T]
77
+
78
+ Returns
79
+ -------
80
+ Tensor[B x D x T]
81
+ Quantized continuous representation of input
82
+ Tensor[1]
83
+ Commitment loss to train encoder to predict vectors closer to codebook
84
+ entries
85
+ Tensor[1]
86
+ Codebook loss to update the codebook
87
+ Tensor[B x T]
88
+ Codebook indices (quantized discrete representation of input)
89
+ Tensor[B x D x T]
90
+ Projected latents (continuous representation of input before quantization)
91
+ """
92
+ # transpose since we use linear
93
+
94
+ # Factorized codes project input into low-dimensional space if self.input_dim != self.codebook_dim
95
+ z_e = self.in_project(z)
96
+ z_q, indices, dists = self.decode_latents(z_e)
97
+
98
+ # track codebook usage statistics
99
+ embed_onehot = F.one_hot(indices, self.codebook_size).type(z_e.dtype)
100
+ avg_probs = torch.mean(embed_onehot.reshape(-1, self.codebook_size), dim=0)
101
+ perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))
102
+
103
+ active_num = (embed_onehot.sum(0).sum(0) > 0).sum()
104
+ if self.training:
105
+ # We do the expiry of code at that point as buffers are in sync
106
+ # and all the workers will take the same decision.
107
+ ema_inplace(self.cluster_size, embed_onehot.sum(0).sum(0), self.decay)
108
+ active_num = sum(self.cluster_size > self.threshold_ema_dead_code)
109
+
110
+ if self.training:
111
+ commit_loss = (
112
+ F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
113
+ * self.commitment
114
+ )
115
+
116
+ codebook_loss = (
117
+ F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])
118
+ * self.codebook_loss_weight
119
+ )
120
+
121
+ else:
122
+ commit_loss = torch.zeros(0, device=z.device)
123
+ codebook_loss = torch.zeros(0, device=z.device)
124
+
125
+ z_q = (
126
+ z_e + (z_q - z_e).detach()
127
+ ) # noop in forward pass, straight-through gradient estimator in backward pass
128
+
129
+ z_q = self.out_project(z_q)
130
+
131
+ vq_loss = (commit_loss + codebook_loss).mean()
132
+
133
+ return {
134
+ "z_q": z_q,
135
+ "indices": indices,
136
+ "dists": dists,
137
+ "vq_loss": vq_loss,
138
+ "perplexity": perplexity,
139
+ "active_num": active_num.float(),
140
+ }
141
+
142
+ def vq2emb(self, vq, out_proj=True):
143
+ emb = self.embed_code(vq)
144
+ if out_proj:
145
+ emb = self.out_project(emb)
146
+ return emb
147
+
148
+ def tokenize(self, z: torch.Tensor) -> torch.Tensor:
149
+ """tokenize the input tensor"""
150
+ z_e = self.in_project(z)
151
+ _, indices, _ = self.decode_latents(z_e)
152
+ return indices
153
+
154
+ def detokenize(self, indices):
155
+ """detokenize the input indices"""
156
+ z_q = self.decode_code(indices)
157
+ z_q = self.out_project(z_q)
158
+ return z_q
159
+
160
+ def get_emb(self):
161
+ return self.codebook.weight
162
+
163
+ def embed_code(self, embed_id):
164
+ return F.embedding(embed_id, self.codebook.weight)
165
+
166
+ def decode_code(self, embed_id):
167
+ return self.embed_code(embed_id).transpose(1, 2)
168
+
169
+ def decode_latents(self, latents):
170
+ encodings = rearrange(latents, "b d t -> (b t) d")
171
+ codebook = self.codebook.weight
172
+
173
+ # L2 normalize encodings and codebook
174
+ encodings = F.normalize(encodings)
175
+ codebook = F.normalize(codebook)
176
+
177
+ # Compute euclidean distance between encodings and codebook,
178
+ # with L2 normalization, the distance is equal to cosine distance
179
+ dist = (
180
+ encodings.pow(2).sum(1, keepdim=True)
181
+ - 2 * encodings @ codebook.t()
182
+ + codebook.pow(2).sum(1, keepdim=True).t()
183
+ )
184
+ indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
185
+ z_q = self.decode_code(indices)
186
+
187
+ return z_q, indices, dist
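Unlike the other quantizer modules in this commit, FactorizedVectorQuantize ships without a __main__ check; a minimal sketch of the intended call pattern, under assumed hyperparameters (input_dim=256, codebook_size=1024, codebook_dim=8, commitment=0.25), would be:
import torch
from sparktts.modules.vq.factorized_vector_quantize import FactorizedVectorQuantize

vq = FactorizedVectorQuantize(
    input_dim=256, codebook_size=1024, codebook_dim=8, commitment=0.25
)
vq.eval()                                  # commitment/codebook losses are only computed in training
z = torch.randn(2, 256, 50)                # (B, D, T) latents
out = vq(z)
print(out["z_q"].shape, out["indices"].shape)   # (2, 256, 50) (2, 50)

tokens = vq.tokenize(z)                    # same indices as out["indices"]
recon = vq.detokenize(tokens)              # decode purely from the codebook
print(torch.allclose(recon, out["z_q"]))   # expected True in eval mode (straight-through is a no-op)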
sparktts/utils/__init__.py ADDED
File without changes
sparktts/utils/audio.py ADDED
@@ -0,0 +1,271 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Description:
17
+ This script contains a collection of functions designed to handle various
18
+ audio processing.
19
+ """
20
+
21
+ import random
22
+ import soxr
23
+ import soundfile
24
+ import torch
25
+ import torchaudio
26
+ import numpy as np
27
+
28
+ from pathlib import Path
29
+ from typing import Tuple
30
+ from numpy.lib.stride_tricks import sliding_window_view
31
+
32
+
33
+ def audio_volume_normalize(audio: np.ndarray, coeff: float = 0.2) -> np.ndarray:
34
+ """
35
+ Normalize the volume of an audio signal.
36
+
37
+ Parameters:
38
+ audio (numpy array): Input audio signal array.
39
+ coeff (float): Target coefficient for normalization, default is 0.2.
40
+
41
+ Returns:
42
+ numpy array: The volume-normalized audio signal.
43
+ """
44
+ # Sort the absolute values of the audio signal
45
+ temp = np.sort(np.abs(audio))
46
+
47
+ # If the maximum value is less than 0.1, scale the array to have a maximum of 0.1
48
+ if temp[-1] < 0.1:
49
+ scaling_factor = max(
50
+ temp[-1], 1e-3
51
+ ) # Prevent division by zero with a small constant
52
+ audio = audio / scaling_factor * 0.1
53
+
54
+ # Filter out values less than 0.01 from temp
55
+ temp = temp[temp > 0.01]
56
+ L = temp.shape[0] # Length of the filtered array
57
+
58
+ # If there are fewer than or equal to 10 significant values, return the audio without further processing
59
+ if L <= 10:
60
+ return audio
61
+
62
+ # Compute the average of the top 10% to 1% of values in temp
63
+ volume = np.mean(temp[int(0.9 * L) : int(0.99 * L)])
64
+
65
+ # Normalize the audio to the target coefficient level, clamping the scale factor between 0.1 and 10
66
+ audio = audio * np.clip(coeff / volume, a_min=0.1, a_max=10)
67
+
68
+ # Ensure the maximum absolute value in the audio does not exceed 1
69
+ max_value = np.max(np.abs(audio))
70
+ if max_value > 1:
71
+ audio = audio / max_value
72
+
73
+ return audio
74
+
75
+
76
+ def load_audio(
77
+ adfile: Path,
78
+ sampling_rate: int = None,
79
+ length: int = None,
80
+ volume_normalize: bool = False,
81
+ segment_duration: int = None,
82
+ ) -> np.ndarray:
83
+ r"""Load audio file with target sampling rate and lsength
84
+
85
+ Args:
86
+ adfile (Path): path to audio file.
87
+ sampling_rate (int, optional): target sampling rate. Defaults to None.
88
+ length (int, optional): target audio length. Defaults to None.
89
+ volume_normalize (bool, optional): whether perform volume normalization. Defaults to False.
90
+ segment_duration (int): randomly select a segment of {segment_duration} seconds.
92
+ Defaults to None, which means the whole audio will be used.
92
+
93
+ Returns:
94
+ audio (np.ndarray): audio
95
+ """
96
+
97
+ audio, sr = soundfile.read(adfile)
98
+ if len(audio.shape) > 1:
99
+ audio = audio[:, 0]
100
+
101
+ if sampling_rate is not None and sr != sampling_rate:
102
+ audio = soxr.resample(audio, sr, sampling_rate, quality="VHQ")
103
+ sr = sampling_rate
104
+
105
+ if segment_duration is not None:
106
+ seg_length = int(sr * segment_duration)
107
+ audio = random_select_audio_segment(audio, seg_length)
108
+
109
+ # Audio volume normalize
110
+ if volume_normalize:
111
+ audio = audio_volume_normalize(audio)
112
+ # check the audio length
113
+ if length is not None:
114
+ assert abs(audio.shape[0] - length) < 1000
115
+ if audio.shape[0] > length:
116
+ audio = audio[:length]
117
+ else:
118
+ audio = np.pad(audio, (0, int(length - audio.shape[0])))
119
+ return audio
120
+
121
+
122
+ def random_select_audio_segment(audio: np.ndarray, length: int) -> np.ndarray:
123
+ """get an audio segment given the length
124
+
125
+ Args:
126
+ audio (np.ndarray):
127
+ length (int): audio length = sampling_rate * duration
128
+ """
129
+ if audio.shape[0] < length:
130
+ audio = np.pad(audio, (0, int(length - audio.shape[0])))
131
+ start_index = random.randint(0, audio.shape[0] - length)
132
+ end_index = int(start_index + length)
133
+
134
+ return audio[start_index:end_index]
135
+
136
+
137
+ def audio_highpass_filter(audio, sample_rate, highpass_cutoff_freq):
138
+ """apply highpass fileter to audio
139
+
140
+ Args:
141
+ audio (np.ndarray):
142
+ sample_rate (int):
143
+ highpass_cutoff_freq (int):
144
+ """
145
+
146
+ audio = torchaudio.functional.highpass_biquad(
147
+ torch.from_numpy(audio), sample_rate, cutoff_freq=highpass_cutoff_freq
148
+ )
149
+ return audio.numpy()
150
+
151
+
152
+ def stft(
153
+ x: torch.Tensor,
154
+ fft_size: int,
155
+ hop_size: int,
156
+ win_length: int,
157
+ window: torch.Tensor,
158
+ use_complex: bool = False,
159
+ ) -> torch.Tensor:
160
+ """Perform STFT and convert to magnitude spectrogram.
161
+ Args:
162
+ x (Tensor): Input signal tensor (B, T).
163
+ fft_size (int): FFT size.
164
+ hop_size (int): Hop size.
165
+ win_length (int): Window length.
166
+ window (Tensor): Window tensor of length win_length.
167
+ Returns:
168
+ Tensor: Magnitude spectrogram (B, #frames, fft_size // 2 + 1).
169
+ """
170
+
171
+ x_stft = torch.stft(
172
+ x, fft_size, hop_size, win_length, window.to(x.device), return_complex=True
173
+ )
174
+
175
+ # clamp is needed to avoid nan or inf
176
+ if not use_complex:
177
+ return torch.sqrt(
178
+ torch.clamp(x_stft.real**2 + x_stft.imag**2, min=1e-7, max=1e3)
179
+ ).transpose(2, 1)
180
+ else:
181
+ res = torch.cat([x_stft.real.unsqueeze(1), x_stft.imag.unsqueeze(1)], dim=1)
182
+ res = res.transpose(2, 3) # [B, 2, T, F]
183
+ return res
184
+
185
+
186
+ def detect_speech_boundaries(
187
+ wav: np.ndarray,
188
+ sample_rate: int,
189
+ window_duration: float = 0.1,
190
+ energy_threshold: float = 0.01,
191
+ margin_factor: int = 2
192
+ ) -> Tuple[int, int]:
193
+ """Detect the start and end points of speech in an audio signal using RMS energy.
194
+
195
+ Args:
196
+ wav: Input audio signal array with values in [-1, 1]
197
+ sample_rate: Audio sample rate in Hz
198
+ window_duration: Duration of detection window in seconds
199
+ energy_threshold: RMS energy threshold for speech detection
200
+ margin_factor: Factor to determine extra margin around detected boundaries
201
+
202
+ Returns:
203
+ tuple: (start_index, end_index) of speech segment
204
+
205
+ Raises:
206
+ ValueError: If the audio contains only silence
207
+ """
208
+ window_size = int(window_duration * sample_rate)
209
+ margin = margin_factor * window_size
210
+ step_size = window_size // 10
211
+
212
+ # Create sliding windows using stride tricks to avoid loops
213
+ windows = sliding_window_view(wav, window_size)[::step_size]
214
+
215
+ # Calculate RMS energy for each window
216
+ energy = np.sqrt(np.mean(windows ** 2, axis=1))
217
+ speech_mask = energy >= energy_threshold
218
+
219
+ if not np.any(speech_mask):
220
+ raise ValueError("No speech detected in audio (only silence)")
221
+
222
+ start = max(0, np.argmax(speech_mask) * step_size - margin)
223
+ end = min(len(wav), (len(speech_mask) - 1 - np.argmax(speech_mask[::-1])) * step_size + margin)
224
+
225
+ return start, end
226
+
227
+
228
+ def remove_silence_on_both_ends(
229
+ wav: np.ndarray,
230
+ sample_rate: int,
231
+ window_duration: float = 0.1,
232
+ volume_threshold: float = 0.01
233
+ ) -> np.ndarray:
234
+ """Remove silence from both ends of an audio signal.
235
+
236
+ Args:
237
+ wav: Input audio signal array
238
+ sample_rate: Audio sample rate in Hz
239
+ window_duration: Duration of detection window in seconds
240
+ volume_threshold: Amplitude threshold for silence detection
241
+
242
+ Returns:
243
+ np.ndarray: Audio signal with silence removed from both ends
244
+
245
+ Raises:
246
+ ValueError: If the audio contains only silence
247
+ """
248
+ start, end = detect_speech_boundaries(
249
+ wav,
250
+ sample_rate,
251
+ window_duration,
252
+ volume_threshold
253
+ )
254
+ return wav[start:end]
255
+
256
+
257
+
258
+ def hertz_to_mel(pitch: float) -> float:
259
+ """
260
+ Converts a frequency from the Hertz scale to the Mel scale.
261
+
262
+ Parameters:
263
+ - pitch: float or ndarray
264
+ Frequency in Hertz.
265
+
266
+ Returns:
267
+ - mel: float or ndarray
268
+ Frequency in Mel scale.
269
+ """
270
+ mel = 2595 * np.log10(1 + pitch / 700)
271
+ return mel
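A short sketch of how these helpers compose for prompt preprocessing; "prompt.wav" is a placeholder path and 16 kHz an assumed target rate (the actual values come from the model config, not this file):
import numpy as np
from sparktts.utils.audio import load_audio, remove_silence_on_both_ends

wav = load_audio("prompt.wav", sampling_rate=16000, volume_normalize=True)   # mono float array
wav = remove_silence_on_both_ends(wav, sample_rate=16000)                    # raises ValueError on pure silence
print(wav.shape, float(np.abs(wav).max()))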
sparktts/utils/file.py ADDED
@@ -0,0 +1,221 @@
1
+ # Copyright (c) 2025 SparkAudio
2
+ # 2025 Xinsheng Wang ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Description:
17
+ This script contains a collection of functions designed to handle various
18
+ file reading and writing operations. It provides utilities to read from files,
19
+ write data to files, and perform file manipulation tasks.
20
+ """
21
+
22
+
23
+ import os
24
+ import json
25
+ import json
26
+ import csv
27
+
28
+ from tqdm import tqdm
29
+ from typing import List, Dict, Any, Set, Union
30
+ from pathlib import Path
31
+ from omegaconf import OmegaConf, DictConfig
32
+
33
+
34
+ def resolve_symbolic_link(symbolic_link_path: Path) -> Path:
35
+ """
36
+ Resolves the absolute path of a symbolic link.
37
+
38
+ Args:
39
+ symbolic_link_path (Path): The path to the symbolic link.
40
+
41
+ Returns:
42
+ Path: The absolute path that the symbolic link points to.
43
+ """
44
+
45
+ link_directory = os.path.dirname(symbolic_link_path)
46
+ target_path_relative = os.readlink(symbolic_link_path)
47
+ return os.path.join(link_directory, target_path_relative)
48
+
49
+
50
+ def write_jsonl(metadata: List[dict], file_path: Path) -> None:
51
+ """Writes a list of dictionaries to a JSONL file.
52
+
53
+ Args:
54
+ metadata : List[dict]
55
+ A list of dictionaries, each representing a metadata record.
56
+ file_path : Path
57
+ The file path to save the JSONL file
58
+
59
+ This function writes each dictionary in the list to a new line in the specified file.
60
+ """
61
+ with open(file_path, "w", encoding="utf-8") as f:
62
+ for meta in tqdm(metadata, desc="writing jsonl"):
63
+ # Convert dictionary to JSON string and write it to the file with a newline
64
+ json_str = json.dumps(meta, ensure_ascii=False) + "\n"
65
+ f.write(json_str)
66
+ print(f"jsonl saved to {file_path}")
67
+
68
+
69
+ def read_jsonl(file_path: Path) -> List[dict]:
70
+ """
71
+ Reads a JSONL file and returns a list of dictionaries.
72
+
73
+ Args:
74
+ file_path : Path
75
+ The path to the JSONL file to be read.
76
+
77
+ Returns:
78
+ List[dict]
79
+ A list of dictionaries parsed from each line of the JSONL file.
80
+ """
81
+ metadata = []
82
+ # Open the file for reading
83
+ with open(file_path, "r", encoding="utf-8") as f:
84
+ # Split the file into lines
85
+ lines = f.read().splitlines()
86
+ # Process each line
87
+ for line in lines:
88
+ # Convert JSON string back to dictionary and append to list
89
+ meta = json.loads(line)
90
+ metadata.append(meta)
91
+ # Return the list of metadata
92
+ return metadata
93
+
94
+ def read_json_as_jsonl(file_path: Path) -> List[dict]:
+ """Read a JSON file whose top level is a dict of dicts and return a list of dicts, one per key in sorted order, with each key stored under 'index'."""
95
+ metadata = []
96
+ with open(file_path, 'r', encoding='utf-8') as infile:
97
+ data = json.load(infile)
98
+ for k in sorted(data.keys()):
99
+ meta = {'index': k}
100
+ meta.update(data[k])
101
+ metadata.append(meta)
102
+ return metadata
103
+
104
+
105
+
106
+ def decode_unicode_strings(meta: Dict[str, Any]) -> Dict[str, Any]:
107
+ processed_meta = {}
108
+ for k, v in meta.items():
109
+ if isinstance(v, str):
110
+ processed_meta[k] = v.encode("utf-8").decode("unicode_escape")
111
+ else:
112
+ processed_meta[k] = v
113
+ return processed_meta
114
+
115
+
116
+ def load_config(config_path: Path) -> DictConfig:
117
+ """Loads a configuration file and optionally merges it with a base configuration.
118
+
119
+ Args:
120
+ config_path (Path): Path to the configuration file.
121
+ """
122
+ # Load the initial configuration from the given path
123
+ config = OmegaConf.load(config_path)
124
+
125
+ # Check if there is a base configuration specified and merge if necessary
126
+ if config.get("base_config", None) is not None:
127
+ base_config = OmegaConf.load(config["base_config"])
128
+ config = OmegaConf.merge(base_config, config)
129
+
130
+ return config
131
+
132
+
133
+
134
+ def jsonl_to_csv(jsonl_file_path: str, csv_file_path: str) -> None:
135
+ """
136
+ Converts a JSONL file to a CSV file.
137
+
138
+ This function reads a JSONL file, determines all unique keys present in the file,
139
+ and writes the data to a CSV file with columns for all these keys.
140
+ """
141
+
142
+ all_keys = set()
143
+ data_rows = []
144
+
145
+ # Read the JSONL file once to extract keys and collect data
146
+ with open(jsonl_file_path, 'r') as file:
147
+ for line in file:
148
+ data = json.loads(line.strip())
149
+ data_rows.append(data)
150
+ all_keys.update(data.keys())
151
+
152
+ # Convert the set of keys to a sorted list for consistent column order
153
+ sorted_keys = sorted(all_keys)
154
+
155
+ # Write the data to a CSV file
156
+ with open(csv_file_path, 'w', newline='') as csvfile:
157
+ writer = csv.DictWriter(csvfile, fieldnames=sorted_keys)
158
+
159
+ # Write the header row
160
+ writer.writeheader()
161
+
162
+ # Write each row of data
163
+ for data in data_rows:
164
+ writer.writerow(data)
165
+
166
+ print(f"CSV file has been created at {csv_file_path}")
167
+
168
+
169
+ def save_metadata(data, filename, headers=None):
170
+ """
171
+ Save metadata to a file.
172
+
173
+ Args:
174
+ data (list of dict): Metadata to be saved.
175
+ filename (str): Name of the file to save the metadata.
176
+ headers (list of str): The order of column names to be saved; defaults to the keys from the first dictionary in data if not provided.
177
+ """
178
+ # Set headers to keys from the first dictionary in data if not explicitly provided
179
+ if headers is None:
180
+ headers = list(data[0].keys())
181
+
182
+ with open(filename, "w", encoding="utf-8") as file:
183
+ # Write the headers to the file
184
+ file.write("|".join(headers) + "\n")
185
+ for entry in data:
186
+ # Retrieve values in the order of headers, replacing any '|' characters with a space to prevent formatting errors
187
+ formatted_values = [str(entry.get(key, "")).replace("|", " ") for key in headers]
188
+ # Write the formatted values to the file
189
+ file.write("|".join(formatted_values) + "\n")
190
+
191
+
192
+ def read_metadata(filename, headers=None):
193
+ """
194
+ Read metadata from a file.
195
+
196
+ Args:
197
+ filename (str): The file from which to read the metadata.
198
+
199
+ Returns:
200
+ list of dict: The metadata read from the file.
201
+ list of str: The headers used in the file.
202
+ """
203
+ with open(filename, "r", encoding="utf-8") as file:
204
+ lines = file.readlines()
205
+
206
+ data = []
207
+ # Set headers from the first line of the file if not provided
208
+ if headers is None:
209
+ headers = lines[0].strip().split("|")
210
+ lines = lines[1:]
211
+
212
+ for line in lines:
213
+ line = line.strip()
214
+ # Skip empty lines
215
+ if not line:
216
+ continue
217
+ # Split the line by '|' and pair with headers to form a dictionary
218
+ entry_data = dict(zip(headers, line.split("|")))
219
+ data.append(entry_data)
220
+
221
+ return data, headers
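
A minimal usage sketch for the JSONL and pipe-delimited metadata helpers above (file names are illustrative; the import path `sparktts.utils.file` is assumed from this repo layout):

from pathlib import Path
from sparktts.utils.file import write_jsonl, read_jsonl, save_metadata, read_metadata

records = [{"index": "utt_001", "text": "hello"}, {"index": "utt_002", "text": "world"}]

# JSONL round trip.
write_jsonl(records, Path("meta.jsonl"))
assert read_jsonl(Path("meta.jsonl")) == records

# Pipe-delimited round trip; note that every value comes back as a string.
save_metadata(records, "meta.txt")
rows, headers = read_metadata("meta.txt")
print(headers)   # ['index', 'text']
print(rows[0])   # {'index': 'utt_001', 'text': 'hello'}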
sparktts/utils/parse_options.sh ADDED
@@ -0,0 +1,97 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
4
+ # Arnab Ghoshal, Karel Vesely
5
+
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
13
+ # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
14
+ # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
15
+ # MERCHANTABILITY OR NON-INFRINGEMENT.
16
+ # See the Apache 2 License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+
20
+ # Parse command-line options.
21
+ # To be sourced by another script (as in ". parse_options.sh").
22
+ # Option format is: --option-name arg
23
+ # and shell variable "option_name" gets set to value "arg."
24
+ # The exception is --help, which takes no arguments, but prints the
25
+ # $help_message variable (if defined).
26
+
27
+
28
+ ###
29
+ ### The --config file options have lower priority to command line
30
+ ### options, so we need to import them first...
31
+ ###
32
+
33
+ # Now import all the configs specified by command-line, in left-to-right order
34
+ # for ((argpos=1; argpos<$#; argpos++)); do
35
+ # if [ "${!argpos}" == "--config" ]; then
36
+ # argpos_plus1=$((argpos+1))
37
+ # config=${!argpos_plus1}
38
+ # [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
39
+ # . $config # source the config file.
40
+ # fi
41
+ # done
42
+
43
+
44
+ ###
45
+ ### Now we process the command-line options
46
+ ###
47
+ while true; do
48
+ [ -z "${1:-}" ] && break; # break if there are no arguments
49
+ case "$1" in
50
+ # If the enclosing script is called with --help option, print the help
51
+ # message and exit. Scripts should put help messages in $help_message
52
+ --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
53
+ else printf "$help_message\n" 1>&2 ; fi;
54
+ exit 0 ;;
55
+ --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
56
+ exit 1 ;;
57
+ # If the first command-line argument begins with "--" (e.g. --foo-bar),
58
+ # then work out the variable name as $name, which will equal "foo_bar".
59
+ --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
60
+ # Next we test whether the variable in question is undefined; if so it's
61
+ # an invalid option and we die. Note: $0 evaluates to the name of the
62
+ # enclosing script.
63
+ # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
64
+ # is undefined. We then have to wrap this test inside "eval" because
65
+ # foo_bar is itself inside a variable ($name).
66
+ eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
67
+
68
+ oldval="`eval echo \\$$name`";
69
+ # Work out whether we seem to be expecting a Boolean argument.
70
+ if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
71
+ was_bool=true;
72
+ else
73
+ was_bool=false;
74
+ fi
75
+
76
+ # Set the variable to the right value-- the escaped quotes make it work if
77
+ # the option had spaces, like --cmd "queue.pl -sync y"
78
+ eval $name=\"$2\";
79
+
80
+ # Check that Boolean-valued arguments are really Boolean.
81
+ if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
82
+ echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
83
+ exit 1;
84
+ fi
85
+ shift 2;
86
+ ;;
87
+ *) break;
88
+ esac
89
+ done
90
+
91
+
92
+ # Check for an empty argument to the --cmd option, which can easily occur as a
93
+ # result of scripting errors.
94
+ [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
95
+
96
+
97
+ true; # so this script returns exit code 0.
sparktts/utils/token_parser.py ADDED
@@ -0,0 +1,187 @@
1
+ TASK_TOKEN_MAP = {
2
+ "vc": "<|task_vc|>",
3
+ "tts": "<|task_tts|>",
4
+ "asr": "<|task_asr|>",
5
+ "s2s": "<|task_s2s|>",
6
+ "t2s": "<|task_t2s|>",
7
+ "understand": "<|task_understand|>",
8
+ "caption": "<|task_cap|>",
9
+ "controllable_tts": "<|task_controllable_tts|>",
10
+ "prompt_tts": "<|task_prompt_tts|>",
11
+ "speech_edit": "<|task_edit|>",
12
+ }
13
+
14
+ LEVELS_MAP = {
15
+ "very_low": 0,
16
+ "low": 1,
17
+ "moderate": 2,
18
+ "high": 3,
19
+ "very_high": 4,
20
+ }
21
+
22
+ LEVELS_MAP_UI = {
23
+ 1: 'very_low',
24
+ 2: 'low',
25
+ 3: 'moderate',
26
+ 4: 'high',
27
+ 5: 'very_high'
28
+ }
29
+
30
+ GENDER_MAP = {
31
+ "female": 0,
32
+ "male": 1,
33
+ }
34
+
35
+ AGE_MAP = {"Child": 0, "Teenager": 1, "Youth-Adult": 2, "Middle-aged": 3, "Elderly": 4}
36
+
37
+ EMO_MAP = {
38
+ "UNKNOWN": 0,
39
+ "NEUTRAL": 1,
40
+ "ANGRY": 2,
41
+ "HAPPY": 3,
42
+ "SAD": 4,
43
+ "FEARFUL": 5,
44
+ "DISGUSTED": 6,
45
+ "SURPRISED": 7,
46
+ "SARCASTIC": 8,
47
+ "EXCITED": 9,
48
+ "SLEEPY": 10,
49
+ "CONFUSED": 11,
50
+ "EMPHASIS": 12,
51
+ "LAUGHING": 13,
52
+ "SINGING": 14,
53
+ "WORRIED": 15,
54
+ "WHISPER": 16,
55
+ "ANXIOUS": 17,
56
+ "NO-AGREEMENT": 18,
57
+ "APOLOGETIC": 19,
58
+ "CONCERNED": 20,
59
+ "ENUNCIATED": 21,
60
+ "ASSERTIVE": 22,
61
+ "ENCOURAGING": 23,
62
+ "CONTEMPT": 24,
63
+ }
64
+
65
+
66
+ class TokenParser:
67
+ """Convert attribute labels (task, age, gender, pitch, loudness, speed, emotion) to special tokens."""
68
+
69
+ def __init__(self):
70
+ pass
76
+
77
+ @staticmethod
78
+ def age(age: str) -> str:
79
+ """Turn age token."""
80
+ age_id = AGE_MAP[age]
81
+ return f"<|age_{age_id}|>"
82
+
83
+ @staticmethod
84
+ def gender(gender: str) -> str:
85
+ """Turn gender token."""
86
+ gender_id = GENDER_MAP[gender]
87
+ return f"<|gender_{gender_id}|>"
88
+
89
+ @staticmethod
90
+ def mel_value(mel: int):
91
+ """Turn special token of mel scale pitch."""
92
+ mel = max(0, int(mel))
93
+ mel = min(1000, int(mel))
94
+ return f"<|pitch_value_{mel}|>"
95
+
96
+ @staticmethod
97
+ def mel_level(level: str):
98
+ """Turn special token of mel level."""
99
+ level_tag = LEVELS_MAP[level]
100
+ return f"<|pitch_label_{level_tag}|>"
101
+
102
+ @staticmethod
103
+ def pitch_var_value(pitch_std: int):
104
+ """Turn special token of pitch_std value."""
105
+ assert isinstance(pitch_std, int)
106
+ pitch_std = max(0, int(pitch_std))
107
+ pitch_std = min(10, int(pitch_std))
108
+ return f"<|pitch_var_value_{pitch_std}|>"
109
+
110
+ @staticmethod
111
+ def pitch_var_level(level: str):
112
+ """Turn special token of pitch std level."""
113
+ level_tag = LEVELS_MAP[level]
114
+ return f"<|pitch_var_label_{level_tag}|>"
115
+
116
+ @staticmethod
117
+ def loudness_value(loudness: int):
118
+ """Turn special token of loudness value [0, 30]"""
119
+ assert loudness >= 0
120
+ loudness = max(0, int(loudness))
121
+ loudness = min(30, int(loudness))
122
+ return f"<|loudness_value_{loudness}|>"
123
+
124
+ @staticmethod
125
+ def loudness_level(level: str):
126
+ """Turn special token of loudness level."""
127
+ level_tag = LEVELS_MAP[level]
128
+ return f"<|loudness_label_{level_tag}|>"
129
+
130
+ @staticmethod
131
+ def speed_value(speed: int):
132
+ """Turn special token of speed value."""
133
+ speed = max(0, int(speed))
134
+ speed = min(10, int(speed))
135
+ return f"<|speed_value_{speed}|>"
136
+
137
+ @staticmethod
138
+ def speed_level(level: str):
139
+ """Turn special token of speed level."""
140
+ level_tag = LEVELS_MAP[level]
141
+ return f"<|speed_label_{level_tag}|>"
142
+
143
+ @staticmethod
144
+ def task(task: str) -> str:
145
+ """Turn special token of task."""
146
+ assert task in TASK_TOKEN_MAP.keys()
147
+
148
+ return TASK_TOKEN_MAP[task]
149
+
150
+ @staticmethod
151
+ def emotion(emotion: str):
152
+ emo_id = EMO_MAP[emotion]
153
+
154
+ return f"<|emotion_{emo_id}|>"
155
+
156
+
157
+ # test
158
+ if __name__ == "__main__":
159
+ from transformers import AutoTokenizer
160
+
161
+ tokenizer = AutoTokenizer.from_pretrained(
162
+ "/aifs4su/xinshengwang/code/StyleCraft/tokenizer/stylecraft-bicodec-pitch-loudness-speed-emotion-tokenizer"
163
+ )
164
+
165
+ tasks = ["tts", "tts", "understand", "controllable_tts", "prompt_tts"]
166
+ ages = ["Child", "Teenager", "Youth-Adult", "Middle-aged", "Elderly"]
167
+ genders = ["female", "female", "female", "male", "male"]
168
+ mels = [100, 200, 300, 400, 500]
169
+ mel_levels = ["very_low", "low", "moderate", "high", "very_high"]
170
+ loudnesses = [1, 10, 23, 19, 30]
171
+ loudness_levels = ["very_low", "low", "moderate", "high", "very_high"]
172
+ emotions = ["UNKNOWN", "NEUTRAL", "ANGRY", "HAPPY", "SAD"]
173
+
174
+ for i in range(5):
175
+ task = TokenParser.task(tasks[i])
176
+ age = TokenParser.age(ages[i])
177
+ gender = TokenParser.gender(genders[i])
178
+ mel = TokenParser.mel_value(mels[i])
179
+ mel_level = TokenParser.mel_level(mel_levels[i])
180
+ loudness = TokenParser.loudness_value(loudnesses[i])
181
+ loudness_level = TokenParser.loudness_level(loudness_levels[i])
182
+ emotion = TokenParser.emotion(emotions[i])
183
+ inputs = [task, age, gender, mel, mel_level, loudness, loudness_level, emotion]
184
+ inputs = "".join(inputs)
185
+ ids = tokenizer.encode(inputs, add_special_tokens=False)
186
+ print(ids)
187
+ print("decode", tokenizer.decode(ids))
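
The built-in test above depends on an internal tokenizer path, but the token construction itself needs no model. A minimal sketch showing how a UI slider position (1-5 via LEVELS_MAP_UI) becomes a level token (ids 0-4 via LEVELS_MAP); whether the model expects exactly this token ordering is not established here:

from sparktts.utils.token_parser import TokenParser, LEVELS_MAP_UI

# Slider position 3 -> label 'moderate' -> level id 2.
pitch_token = TokenParser.mel_level(LEVELS_MAP_UI[3])
speed_token = TokenParser.speed_level(LEVELS_MAP_UI[5])
prompt = TokenParser.task("controllable_tts") + TokenParser.gender("female") + pitch_token + speed_token
print(prompt)
# <|task_controllable_tts|><|gender_0|><|pitch_label_2|><|speed_label_4|>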
src/demos/trump/trump_en.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64fe26aa5fb432be85a183b6e48f7e1045c5c9fd4b8eb4faeeb5d4df5934f80f
3
+ size 476204
src/demos/zhongli/zhongli_en.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b94df88681b9f8afe07d2081bcc354f8e97c83886a944f37b4979b83547d39f
3
+ size 389804
src/demos/余承东/yuchengdong_zh.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90bcbe608c5c483d34213e18b7778e11ab55998d52bbe1b5f8bdd80f0473e7c2
3
+ size 496044
src/demos/刘德华/dehua_zh.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe97bd7c679be4dfbbd30496bf9b192c43dfae4a497ae7d99e85841ea06e77f2
3
+ size 772524
src/demos/哪吒/nezha_zh.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e09b01a61e6965f42b2bbfd47a33730991276ad7e6892531ba844646f8c9601e
3
+ size 596524