import time
import yaml
from pathlib import Path
import torch
from fish_audio.sdk import TextToSpeech, Vocoder
from pydub import AudioSegment
# Load config
config_path = Path(__file__).parent.parent / "config" / "tts_config.yaml"
with open(config_path) as f:
    config = yaml.safe_load(f)
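# Expected config layout (illustrative; keys inferred from the lookups in this file):
#   tts:        model, device, precision, vocoder
#   generation: temperature, top_k, max_length, language_mapping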
# Initialize models
tts_model = TextToSpeech(
    model_name=config["tts"]["model"],
    device=config["tts"]["device"],
    precision=config["tts"]["precision"],
)
vocoder = Vocoder(
    model_name=config["tts"]["vocoder"],
    device=tts_model.device,
)
def generate_speech(text: str, language: str = "zh") -> str:
"""Generate speech from text using Fish Audio SDK"""
# Format text with language tags
lang_template = config["generation"]["language_mapping"][language]
processed_text = lang_template.format(text=text)
# Generate mel spectrogram
mel = tts_model.generate(
text=processed_text,
temperature=config["generation"]["temperature"],
top_k=config["generation"]["top_k"],
max_length=config["generation"]["max_length"],
)
# Convert mel to waveform
waveform = vocoder(mel)
# Create audio segment
audio = AudioSegment(
waveform.numpy().tobytes(),
frame_rate=vocoder.sample_rate,
sample_width=2,
channels=1,
)
# Save output
output_path = f"temp/outputs/output_{int(time.time())}.wav"
audio.export(output_path, format="wav")
return output_path |
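

# Example usage (illustrative sketch; assumes the config's language_mapping
# defines a "zh" entry and the models above loaded successfully):
if __name__ == "__main__":
    wav_path = generate_speech("你好，欢迎使用语音合成。", language="zh")
    print(f"Saved synthesized audio to {wav_path}")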