"""Text-to-speech helper built on the Fish Audio SDK.

Importing this module reads ``config/tts_config.yaml`` (located one directory
above this file's directory) and instantiates the TTS model and vocoder, so
the import itself performs file I/O and model loading.
"""

import time
from pathlib import Path

import torch
import yaml
from fish_audio.sdk import TextToSpeech, Vocoder
from pydub import AudioSegment

# Load config
config_path = Path(__file__).parent.parent / "config" / "tts_config.yaml"
# Explicit UTF-8: the platform default encoding may not be able to decode
# non-ASCII text in the config (e.g. language templates).
with open(config_path, encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Initialize models
tts_model = TextToSpeech(
    model_name=config["tts"]["model"],
    device=config["tts"]["device"],
    precision=config["tts"]["precision"],
)
vocoder = Vocoder(
    model_name=config["tts"]["vocoder"],
    # Keep the vocoder on the same device as the TTS model.
    device=tts_model.device,
)

# Largest magnitude representable in signed 16-bit PCM.
_PCM16_MAX = 32767


def _to_pcm16_bytes(waveform) -> bytes:
    """Return the raw sample bytes of *waveform* for a 16-bit AudioSegment.

    Torch tensors are detached and moved to the CPU first, because
    ``Tensor.numpy()`` raises for tensors that require grad or live on
    another device. Floating-point tensors are converted to int16 so the
    bytes match the ``sample_width=2`` declared by the caller; integer
    tensors and non-torch objects are serialized unchanged.
    """
    if not isinstance(waveform, torch.Tensor):
        # Unknown array type: keep the original behaviour.
        return waveform.numpy().tobytes()

    samples = waveform.detach().cpu()
    if samples.is_floating_point():
        # NOTE(review): assumes float output is normalized to [-1, 1] --
        # confirm against the vocoder's documentation.
        # Upcast to float32 first: 32767 is not exactly representable in
        # half precision and would overflow int16 after rounding.
        scaled = samples.to(torch.float32).clamp(-1.0, 1.0) * _PCM16_MAX
        samples = scaled.round().to(torch.int16)
    return samples.numpy().tobytes()


def generate_speech(text: str, language: str = "zh") -> str:
    """Generate speech from text using Fish Audio SDK.

    Args:
        text: The text to synthesize.
        language: Key into ``config["generation"]["language_mapping"]``
            selecting the template the text is wrapped in.

    Returns:
        Path of the written WAV file, relative to the current working
        directory (``temp/outputs/output_<unix seconds>.wav``).

    Raises:
        KeyError: If *language* has no entry in the language mapping.
    """
    generation_cfg = config["generation"]

    # Format text with language tags
    lang_template = generation_cfg["language_mapping"][language]
    processed_text = lang_template.format(text=text)

    # Generate mel spectrogram
    mel = tts_model.generate(
        text=processed_text,
        temperature=generation_cfg["temperature"],
        top_k=generation_cfg["top_k"],
        max_length=generation_cfg["max_length"],
    )

    # Convert mel to waveform
    waveform = vocoder(mel)

    # Create audio segment (mono, 16-bit PCM)
    audio = AudioSegment(
        _to_pcm16_bytes(waveform),
        frame_rate=vocoder.sample_rate,
        sample_width=2,
        channels=1,
    )

    # Save output
    # NOTE(review): the name has one-second resolution, so two calls within
    # the same second write to the same file.
    output_path = f"temp/outputs/output_{int(time.time())}.wav"
    # The export fails if the directory is missing, so create it on demand.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    audio.export(output_path, format="wav")
    return output_path