Spaces:
Running
Running
import time | |
import yaml | |
from pathlib import Path | |
import torch | |
from fish_audio.sdk import TextToSpeech, Vocoder | |
from pydub import AudioSegment | |
# Load config
# The YAML file is resolved relative to this module: <parent of this file's
# directory>/config/tts_config.yaml, so the lookup does not depend on the
# current working directory.
config_path = Path(__file__).parent.parent / "config" / "tts_config.yaml"
# Read explicitly as UTF-8 instead of the platform default encoding so that
# non-ASCII content in the config is decoded identically on every host.
with open(config_path, encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Initialize models
# Both models are built once at import time and shared by every call to
# generate_speech(); all settings come from the "tts" section of the config.
tts_model = TextToSpeech(
    model_name=config["tts"]["model"],
    device=config["tts"]["device"],
    precision=config["tts"]["precision"],
)
# The vocoder reuses the device reported by the TTS model rather than
# re-reading it from the config.
vocoder = Vocoder(
    model_name=config["tts"]["vocoder"],
    device=tts_model.device,
)
def _waveform_to_pcm16_bytes(waveform) -> bytes:
    """Serialize a waveform tensor as raw 16-bit PCM sample bytes."""
    # detach() + cpu() make the later .numpy() call valid even when the
    # tensor is attached to an autograd graph or lives on an accelerator.
    samples = waveform.detach().cpu()
    if samples.is_floating_point():
        # NOTE(review): assumes floating-point samples are normalized to
        # [-1.0, 1.0] -- confirm against the vocoder's output contract.
        # Clamping prevents integer wrap-around on out-of-range samples.
        samples = (samples.float().clamp(-1.0, 1.0) * 32767.0).round()
    # NOTE(review): integer outputs are presumed to already be in 16-bit
    # range; verify if the vocoder can emit wider integer types.
    return samples.to(torch.int16).numpy().tobytes()


def generate_speech(text: str, language: str = "zh") -> str:
    """Generate speech from text using Fish Audio SDK.

    Args:
        text: Text to synthesize. It is passed to the selected language
            template as the ``text`` format field.
        language: Key into ``config["generation"]["language_mapping"]``
            selecting the template used to tag the text.

    Returns:
        Path of the exported WAV file, of the form
        ``temp/outputs/output_<timestamp>.wav``.

    Raises:
        KeyError: If ``language`` has no entry in the language mapping.
    """
    generation_cfg = config["generation"]

    # Format text with language tags
    try:
        lang_template = generation_cfg["language_mapping"][language]
    except KeyError:
        # Same exception type as before, but with a message that names the
        # offending argument instead of a bare key.
        raise KeyError(
            f"Unsupported language {language!r}: no entry in "
            "config['generation']['language_mapping']"
        ) from None
    processed_text = lang_template.format(text=text)

    # Inference only: skip autograd bookkeeping for both model calls.
    with torch.no_grad():
        # Generate mel spectrogram
        mel = tts_model.generate(
            text=processed_text,
            temperature=generation_cfg["temperature"],
            top_k=generation_cfg["top_k"],
            max_length=generation_cfg["max_length"],
        )
        # Convert mel to waveform
        waveform = vocoder(mel)

    # Create audio segment
    # sample_width=2 declares 16-bit samples, so the raw bytes handed over
    # must really be int16 PCM rather than the tensor's native dtype.
    audio = AudioSegment(
        _waveform_to_pcm16_bytes(waveform),
        frame_rate=vocoder.sample_rate,
        sample_width=2,
        channels=1,
    )

    # Save output
    # Nanosecond resolution keeps calls made within the same second from
    # overwriting each other's file.
    output_path = f"temp/outputs/output_{time.time_ns()}.wav"
    # The export does not create missing directories; make sure it exists.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    audio.export(output_path, format="wav")
    return output_path