cosyvoice / test_tts_cosyvoice.py
Marcos Remar
Initial CosyVoice code without binary files
0ea7b2a
#!/usr/bin/env python3
import sys
import os
import time
print("=== CosyVoice Text-to-Speech Test ===\n")

# Put the third_party/Matcha-TTS directory on the import path.
sys.path.append('third_party/Matcha-TTS')

# Files that must exist on disk before the test proceeds; the loop below
# polls for them until they all appear or the time budget runs out.
required_files = [
    'pretrained_models/CosyVoice-300M/flow.pt',
    'pretrained_models/CosyVoice-300M/speech_tokenizer_v1.onnx',
    'pretrained_models/CosyVoice-300M/campplus.onnx'
]

print("Checking for required model files...")
max_wait = 300  # 5 minutes
start_time = time.time()

# Re-check every 10 seconds. The script exits with status 1 once more than
# max_wait seconds have elapsed while at least one file is still absent.
missing_files = [path for path in required_files if not os.path.exists(path)]
while missing_files:
    elapsed = time.time() - start_time
    if elapsed > max_wait:
        print(f"Timeout waiting for files. Missing: {missing_files}")
        sys.exit(1)
    print(f"Waiting for downloads to complete... ({int(elapsed)}s elapsed)")
    time.sleep(10)
    missing_files = [path for path in required_files if not os.path.exists(path)]

print("All required files found!")
# Run the synthesis checks. The CosyVoice/torchaudio imports live inside the
# try so that an import failure is reported by the handler at the bottom,
# together with any error raised while loading models or generating audio.
try:
    from cosyvoice.cli.cosyvoice import CosyVoice
    from cosyvoice.utils.file_utils import load_wav
    import torchaudio

    print("\n1. Testing Zero-Shot Voice Cloning")
    print("-" * 40)

    # Initialize model
    cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M', load_jit=False, load_trt=False, fp16=False)

    # Every zero-shot case uses the same prompt recording, so its presence is
    # checked once and the audio is loaded once instead of per language.
    prompt_path = 'asset/zero_shot_prompt.wav'

    # (language label, text to synthesize, prompt text passed to the model)
    zero_shot_cases = [
        (
            "Portuguese",
            "Olá, este é um teste de síntese de voz em português brasileiro.",
            "Testando a conversão de texto para fala.",
        ),
        (
            "Chinese",
            "你好,我是通义生成式语音大模型,很高兴为您服务。",
            "希望这个测试能够成功。",
        ),
        (
            "English",
            "Hello, this is a test of the CosyVoice text-to-speech system. It supports multiple languages.",
            "Testing voice synthesis.",
        ),
    ]

    if os.path.exists(prompt_path):
        prompt_speech_16k = load_wav(prompt_path, 16000)
        for case_index, (language, text, prompt_text) in enumerate(zero_shot_cases):
            # Blank line before every banner except the first one.
            lead = "\n" if case_index else ""
            print(f"{lead}Generating {language} speech: '{text}'")
            results = cosyvoice.inference_zero_shot(text, prompt_text, prompt_speech_16k, stream=False)
            for i, result in enumerate(results):
                output_file = f'output_{language.lower()}_{i}.wav'
                torchaudio.save(output_file, result['tts_speech'], cosyvoice.sample_rate)
                print(f"✓ Saved: {output_file}")
    else:
        # Say so explicitly rather than printing "Generating ..." banners
        # for audio that is never produced.
        print(f"Skipping zero-shot tests: prompt audio not found at '{prompt_path}'")

    print("\n2. Testing SFT Mode (if available)")
    print("-" * 40)

    # Try SFT model if available
    if os.path.exists('pretrained_models/CosyVoice-300M-SFT/cosyvoice.yaml'):
        try:
            cosyvoice_sft = CosyVoice('pretrained_models/CosyVoice-300M-SFT', load_jit=False, load_trt=False, fp16=False)

            # List available speakers
            speakers = cosyvoice_sft.list_available_spks()
            print(f"Available speakers: {speakers[:5]}...")  # Show first 5

            # Generate with first available speaker
            if speakers:
                text = "Este é um teste usando o modelo SFT com falantes pré-definidos."
                speaker = speakers[0]
                print(f"\nGenerating with speaker '{speaker}': '{text}'")
                for i, result in enumerate(cosyvoice_sft.inference_sft(text, speaker, stream=False)):
                    output_file = f'output_sft_{i}.wav'
                    torchaudio.save(output_file, result['tts_speech'], cosyvoice_sft.sample_rate)
                    print(f"✓ Saved: {output_file}")
        except Exception as e:
            # Best effort: the SFT check is optional, so a failure here is
            # reported but does not fail the whole run.
            print(f"SFT model not ready yet: {e}")
    else:
        print("SFT model not found, skipping.")

    print("\n=== Test completed successfully! ===")
    print("\nGenerated audio files:")
    # Sorted so the listing order does not depend on directory order.
    for name in sorted(os.listdir('.')):
        if name.startswith('output_') and name.endswith('.wav'):
            size = os.path.getsize(name) / 1024
            print(f" - {name} ({size:.1f} KB)")
except Exception as e:
    print(f"\nError during test: {e}")
    import traceback
    traceback.print_exc()
    # Exit non-zero so shells and CI see the failure, as the model-file
    # timeout earlier in this script already does.
    sys.exit(1)