File size: 5,135 Bytes
7aa1435 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
# Copyright (c) 2025 SparkAudio
# 2025 Xinsheng Wang ([email protected])
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from pathlib import Path
from typing import Optional

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from models.audio_tokenizer import BiCodecTokenizer
from utils.file import load_config
from utils.token_parser import TASK_TOKEN_MAP
class SparkTTS:
    """
    Spark-TTS for text-to-speech generation.

    Wraps a causal language model (loaded from ``<model_dir>/LLM``) together
    with a ``BiCodecTokenizer``. The language model is prompted with text plus
    the audio tokens of a reference clip and generates semantic audio tokens,
    which the audio tokenizer turns back into a waveform.
    """

    # Matches only semantic audio tokens in decoded model output, e.g.
    # "<|bicodec_semantic_123|>" -> "123". A bare r"\d+" would also capture
    # digits from any other token or number that appears in the output.
    _SEMANTIC_TOKEN_PATTERN = re.compile(r"bicodec_semantic_(\d+)")

    def __init__(self, model_dir: Path, device: torch.device = torch.device("cuda:0")):
        """
        Initializes the SparkTTS model with the provided configurations and device.

        Args:
            model_dir (Path): Directory containing the model and config files.
                ``config.yaml`` and the ``LLM`` sub-directory are read from it.
            device (torch.device): The device (CPU/GPU) to run the model on.
        """
        self.device = device
        self.model_dir = model_dir
        self.configs = load_config(f"{model_dir}/config.yaml")
        self.sample_rate = self.configs["sample_rate"]
        self._initialize_inference()

    def _initialize_inference(self):
        """Initializes the tokenizer, model, and audio tokenizer for inference."""
        llm_dir = f"{self.model_dir}/LLM"
        self.tokenizer = AutoTokenizer.from_pretrained(llm_dir)
        self.model = AutoModelForCausalLM.from_pretrained(llm_dir)
        self.audio_tokenizer = BiCodecTokenizer(self.model_dir, device=self.device)
        self.model.to(self.device)

    @staticmethod
    def _tokens_to_text(token_ids: torch.Tensor, kind: str) -> str:
        """
        Renders audio token ids as a string of ``<|bicodec_{kind}_{id}|>`` tokens.

        Args:
            token_ids (torch.Tensor): Token ids of any shape; they are flattened
                in row-major order.
            kind (str): Token family, e.g. ``"global"`` or ``"semantic"``.

        Returns:
            str: The concatenated token string (empty for an empty tensor).
        """
        # flatten() (instead of squeeze()) keeps a single-element tensor
        # iterable, and tolist() moves the ids to Python in one transfer
        # rather than one device sync per element.
        return "".join(f"<|bicodec_{kind}_{i}|>" for i in token_ids.flatten().tolist())

    @classmethod
    def _extract_semantic_ids(cls, decoded: str) -> torch.Tensor:
        """
        Parses semantic token ids out of decoded model output.

        Args:
            decoded (str): Text decoded from the generated token ids.

        Returns:
            torch.Tensor: ``int64`` tensor of shape ``(1, num_tokens)``;
            ``(1, 0)`` when no semantic token is present.
        """
        ids = [int(match) for match in cls._SEMANTIC_TOKEN_PATTERN.findall(decoded)]
        return torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    @torch.no_grad()
    def inference(
        self,
        text: str,
        prompt_speech_path: Path,
        prompt_text: Optional[str] = None,
        temperature: float = 0.8,
        top_k: int = 50,
        top_p: float = 0.95,
        max_new_tokens: int = 3000,
    ) -> torch.Tensor:
        """
        Performs inference to generate speech from text, incorporating prompt audio and/or text.

        Args:
            text (str): The text input to be converted to speech.
            prompt_speech_path (Path): Path to the audio file used as a prompt.
            prompt_text (str, optional): Transcript of the prompt audio. When given,
                the prompt's semantic tokens are appended to the model input so the
                model continues from the prompt audio.
            temperature (float, optional): Sampling temperature for controlling randomness. Default is 0.8.
            top_k (int, optional): Top-k sampling parameter. Default is 50.
            top_p (float, optional): Top-p (nucleus) sampling parameter. Default is 0.95.
            max_new_tokens (int, optional): Upper bound on the number of generated tokens.
                Default is 3000.

        Returns:
            torch.Tensor: Generated waveform as a tensor.
        """
        global_token_ids, semantic_token_ids = self.audio_tokenizer.tokenize(prompt_speech_path)
        global_tokens = self._tokens_to_text(global_token_ids, "global")

        # Prepare the input tokens for the model
        if prompt_text is not None:
            semantic_tokens = self._tokens_to_text(semantic_token_ids, "semantic")
            inputs = [
                TASK_TOKEN_MAP["tts"],
                "<|start_content|>",
                prompt_text,
                text,
                "<|end_content|>",
                "<|start_global_token|>",
                global_tokens,
                "<|end_global_token|>",
                "<|start_semantic_token|>",
                semantic_tokens,
            ]
        else:
            inputs = [
                TASK_TOKEN_MAP["tts"],
                "<|start_content|>",
                text,
                "<|end_content|>",
                "<|start_global_token|>",
                global_tokens,
                "<|end_global_token|>",
            ]
        prompt = "".join(inputs)
        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)

        # Generate speech tokens using the model
        generated_ids = self.model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
        )

        # generate() returns prompt + continuation; keep only the continuation.
        prompt_len = model_inputs.input_ids.shape[1]
        new_token_ids = generated_ids[:, prompt_len:]

        # Decode the generated tokens into text
        predicts = self.tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]

        # Extract semantic token IDs from the generated text
        pred_semantic_ids = self._extract_semantic_ids(predicts)

        # Convert semantic tokens back to waveform
        wav = self.audio_tokenizer.detokenize(
            global_token_ids.to(self.device).squeeze(0),
            pred_semantic_ids.to(self.device),
        )
        return wav
|