cosyvoice / final_tts_test.py
Marcos Remar
Initial CosyVoice code without binary files
0ea7b2a
#!/usr/bin/env python3
"""Smoke test for CosyVoice zero-shot text-to-speech.

Loads the CosyVoice model, synthesizes one utterance per test language with
the zero-shot API, exercises streaming mode, and prints a summary of the WAV
files written by this run.  The process exits with status 0 when every step
succeeds and 1 when any step raises.

Paths are relative, so the script is expected to be run from the repository
root.
"""
import os
import sys
import traceback

# Directory appended to sys.path before the cosyvoice imports.
MATCHA_TTS_PATH = 'third_party/Matcha-TTS'
# Use the complete model
MODEL_PATH = 'pretrained_models/CosyVoice-300M-direct'
PROMPT_WAV = 'asset/zero_shot_prompt.wav'
PROMPT_SAMPLE_RATE = 16000
STREAMING_OUTPUT = 'output_streaming_chunk1.wav'
SEPARATOR = '=' * 50

# One zero-shot synthesis case per language.
TESTS = [
    {
        'lang': 'Portuguese',
        'text': 'Olá! Bem-vindo ao teste de síntese de voz do CosyVoice. Este sistema converte texto em fala de forma natural.',
        'prompt': 'Teste de voz em português.',
        'output': 'output_portuguese.wav',
    },
    {
        'lang': 'English',
        'text': 'Hello! This is a test of the CosyVoice text-to-speech system. It can generate natural sounding speech.',
        'prompt': 'Testing English speech synthesis.',
        'output': 'output_english.wav',
    },
    {
        'lang': 'Chinese',
        'text': '你好!这是CosyVoice语音合成系统的测试。它可以生成自然的语音。',
        'prompt': '测试中文语音合成。',
        'output': 'output_chinese.wav',
    },
]

STREAMING_TEXT = "Este é um teste do modo streaming, que permite gerar áudio em tempo real com baixa latência."
STREAMING_PROMPT = "Modo streaming."


def _run_zero_shot_test(cosyvoice, prompt_speech_16k, test):
    """Synthesize one test case, save it to ``test['output']``, return the path.

    Every segment yielded by the model is concatenated along the time axis
    before saving, so the output file holds the complete utterance instead of
    only the last segment.

    Raises:
        RuntimeError: if the model yields no audio for this case.
    """
    # Imported lazily so that importing this module stays cheap.
    import torch
    import torchaudio

    print(f"\n{SEPARATOR}")
    print(f"Testing {test['lang']}:")
    print(f"Text: {test['text']}")
    print("Generating speech...")

    segments = [
        result['tts_speech']
        for result in cosyvoice.inference_zero_shot(
            test['text'],
            test['prompt'],
            prompt_speech_16k,
            stream=False,
        )
    ]
    if not segments:
        raise RuntimeError(f"No audio was generated for {test['lang']}")

    # Samples live on dim 1 (the duration below is computed from shape[1]),
    # so segments are joined on that axis.
    speech = torch.cat(segments, dim=1)
    torchaudio.save(test['output'], speech, cosyvoice.sample_rate)

    duration = speech.shape[1] / cosyvoice.sample_rate
    size_kb = os.path.getsize(test['output']) / 1024
    print(f"✓ Generated: {test['output']}")
    print(f" Duration: {duration:.2f} seconds")
    print(f" Size: {size_kb:.1f} KB")
    print(f" Sample rate: {cosyvoice.sample_rate} Hz")
    return test['output']


def _run_streaming_test(cosyvoice, prompt_speech_16k):
    """Exercise streaming synthesis, save the first chunk, return its path.

    Raises:
        RuntimeError: if the model yields no chunks.
    """
    import torchaudio

    print(f"\n{SEPARATOR}")
    print("Testing streaming mode:")
    print(f"Text: {STREAMING_TEXT}")

    chunk_count = 0
    total_duration = 0.0
    chunks = cosyvoice.inference_zero_shot(
        STREAMING_TEXT, STREAMING_PROMPT, prompt_speech_16k, stream=True
    )
    for chunk_count, result in enumerate(chunks, start=1):
        speech = result['tts_speech']
        chunk_duration = speech.shape[1] / cosyvoice.sample_rate
        total_duration += chunk_duration
        print(f" Chunk {chunk_count}: {chunk_duration:.3f}s")
        # Save first chunk as example
        if chunk_count == 1:
            torchaudio.save(STREAMING_OUTPUT, speech, cosyvoice.sample_rate)

    if chunk_count == 0:
        raise RuntimeError("Streaming mode produced no audio chunks")

    print(f"✓ Generated {chunk_count} chunks, total duration: {total_duration:.2f}s")
    return STREAMING_OUTPUT


def main():
    """Run all TTS checks.

    Returns:
        0 when every check succeeds, 1 when any step raises.  The error and
        its traceback are printed before returning 1.
    """
    if MATCHA_TTS_PATH not in sys.path:
        sys.path.append(MATCHA_TTS_PATH)

    print("=== CosyVoice TTS Final Test ===\n")
    try:
        from cosyvoice.cli.cosyvoice import CosyVoice
        from cosyvoice.utils.file_utils import load_wav

        print(f"Loading model from: {MODEL_PATH}")
        cosyvoice = CosyVoice(MODEL_PATH, load_jit=False, load_trt=False, fp16=False)
        print("✓ Model loaded successfully!")

        # Load prompt audio
        prompt_speech_16k = load_wav(PROMPT_WAV, PROMPT_SAMPLE_RATE)
        print("✓ Prompt audio loaded")

        # Test different languages
        generated = [
            _run_zero_shot_test(cosyvoice, prompt_speech_16k, test)
            for test in TESTS
        ]
        # Test streaming mode
        generated.append(_run_streaming_test(cosyvoice, prompt_speech_16k))

        print(f"\n{SEPARATOR}")
        print("✅ All tests completed successfully!")

        # Summary: only files written by this run, so stale output_*.wav
        # files from earlier runs are not reported as freshly generated.
        print("\n📁 Generated files:")
        for path in sorted(generated):
            size = os.path.getsize(path) / 1024
            print(f" - {path} ({size:.1f} KB)")

        # NOTE(review): this hint is specific to one deployment (port, key,
        # host) and the host part looks garbled — confirm before relying on it.
        print("\n💡 To download files to your local machine:")
        print("scp -P 40053 -i ~/.ssh/id_ed25519 [email protected]:~/CosyVoice/output_*.wav .")
    except Exception as e:
        # Top-level boundary: report the failure and signal it to the caller
        # through the exit status instead of silently exiting with 0.
        print(f"\n❌ Error: {e}")
        traceback.print_exc()
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())