# Spaces:
# Configuration error
# Configuration error
#!/usr/bin/env python3
"""Smoke test for CosyVoice zero-shot text-to-speech.

Loads a CosyVoice model, synthesizes one utterance each in Portuguese,
English and Chinese, exercises streaming mode, and finally lists the
``output_*.wav`` files found in the current directory.

The process exits with status 0 when every step succeeds and 1 when any
step raises, so the script can be used from shell scripts or CI.
"""
import os
import sys
import traceback

# Added to sys.path before the cosyvoice imports in main(); presumably
# cosyvoice imports modules from this checkout -- confirm against the repo.
sys.path.append('third_party/Matcha-TTS')

MODEL_PATH = 'pretrained_models/CosyVoice-300M-direct'
PROMPT_WAV = 'asset/zero_shot_prompt.wav'
PROMPT_SAMPLE_RATE = 16000
SEPARATOR = '=' * 50

# One entry per language: text to synthesize, the prompt text passed with
# the prompt audio, and the WAV file the result is written to.
TESTS = [
    {
        'lang': 'Portuguese',
        'text': 'Olá! Bem-vindo ao teste de síntese de voz do CosyVoice. Este sistema converte texto em fala de forma natural.',
        'prompt': 'Teste de voz em português.',
        'output': 'output_portuguese.wav',
    },
    {
        'lang': 'English',
        'text': 'Hello! This is a test of the CosyVoice text-to-speech system. It can generate natural sounding speech.',
        'prompt': 'Testing English speech synthesis.',
        'output': 'output_english.wav',
    },
    {
        'lang': 'Chinese',
        'text': '你好!这是CosyVoice语音合成系统的测试。它可以生成自然的语音。',
        'prompt': '测试中文语音合成。',
        'output': 'output_chinese.wav',
    },
]

STREAMING_TEXT = "Este é um teste do modo streaming, que permite gerar áudio em tempo real com baixa latência."
STREAMING_PROMPT = "Modo streaming."
STREAMING_OUTPUT = 'output_streaming_chunk1.wav'


def _run_language_test(cosyvoice, prompt_speech_16k, test):
    """Synthesize ``test['text']`` without streaming and save it as one WAV.

    Every result yielded by ``inference_zero_shot`` is collected and joined
    before saving.  Writing inside the loop would overwrite the same file
    on each iteration and keep only the last segment.

    Raises:
        RuntimeError: if the model yields no audio at all.
    """
    import torch
    import torchaudio

    print(f"\n{SEPARATOR}")
    print(f"Testing {test['lang']}:")
    print(f"Text: {test['text']}")
    print("Generating speech...")

    segments = [
        result['tts_speech']
        for result in cosyvoice.inference_zero_shot(
            test['text'],
            test['prompt'],
            prompt_speech_16k,
            stream=False,
        )
    ]
    if not segments:
        raise RuntimeError(f"No audio was generated for {test['output']}")

    # Axis 1 is treated as the sample axis, matching the shape[1] duration
    # computation below -- assumes tts_speech is (channels, samples).
    speech = torch.cat(segments, dim=1)
    torchaudio.save(test['output'], speech, cosyvoice.sample_rate)

    duration = speech.shape[1] / cosyvoice.sample_rate
    size_kb = os.path.getsize(test['output']) / 1024
    print(f"✓ Generated: {test['output']}")
    print(f"  Duration: {duration:.2f} seconds")
    print(f"  Size: {size_kb:.1f} KB")
    print(f"  Sample rate: {cosyvoice.sample_rate} Hz")


def _run_streaming_test(cosyvoice, prompt_speech_16k):
    """Synthesize in streaming mode, report each chunk, save the first one.

    Raises:
        RuntimeError: if the model yields no chunks.
    """
    import torchaudio

    print(f"\n{SEPARATOR}")
    print("Testing streaming mode:")
    print(f"Text: {STREAMING_TEXT}")

    chunk_count = 0
    total_duration = 0
    stream = cosyvoice.inference_zero_shot(
        STREAMING_TEXT, STREAMING_PROMPT, prompt_speech_16k, stream=True
    )
    for chunk_count, chunk in enumerate(stream, start=1):
        chunk_duration = chunk['tts_speech'].shape[1] / cosyvoice.sample_rate
        total_duration += chunk_duration
        print(f"  Chunk {chunk_count}: {chunk_duration:.3f}s")
        # Only the first chunk is written, as an example of a streamed piece.
        if chunk_count == 1:
            torchaudio.save(STREAMING_OUTPUT, chunk['tts_speech'], cosyvoice.sample_rate)

    if chunk_count == 0:
        raise RuntimeError("Streaming mode produced no audio chunks")
    print(f"✓ Generated {chunk_count} chunks, total duration: {total_duration:.2f}s")


def _print_summary():
    """List every ``output_*.wav`` in the current directory with its size."""
    print("\n📁 Generated files:")
    for name in sorted(os.listdir('.')):
        if name.startswith('output_') and name.endswith('.wav'):
            size = os.path.getsize(name) / 1024
            print(f"  - {name} ({size:.1f} KB)")
    print("\n💡 To download files to your local machine:")
    # NOTE(review): the user@host part of this command looks redacted in this
    # copy of the script; restore the real address before relying on the hint.
    print("scp -P 40053 -i ~/.ssh/id_ed25519 [email protected]:~/CosyVoice/output_*.wav .")


def main():
    """Run every synthesis check.

    Returns:
        0 when all steps succeed, 1 when any step raises.  The exception is
        printed with its traceback instead of propagating.
    """
    print("=== CosyVoice TTS Final Test ===\n")
    try:
        # Imported here so that a missing or broken installation is reported
        # through the same error path as a failed synthesis.
        from cosyvoice.cli.cosyvoice import CosyVoice
        from cosyvoice.utils.file_utils import load_wav

        print(f"Loading model from: {MODEL_PATH}")
        cosyvoice = CosyVoice(MODEL_PATH, load_jit=False, load_trt=False, fp16=False)
        print("✓ Model loaded successfully!")

        prompt_speech_16k = load_wav(PROMPT_WAV, PROMPT_SAMPLE_RATE)
        print("✓ Prompt audio loaded")

        for test in TESTS:
            _run_language_test(cosyvoice, prompt_speech_16k, test)
        _run_streaming_test(cosyvoice, prompt_speech_16k)

        print(f"\n{SEPARATOR}")
        print("✅ All tests completed successfully!")
        _print_summary()
    except Exception as e:
        # Top-level boundary of a test script: report everything, then signal
        # failure through the return value.
        print(f"\n❌ Error: {e}")
        traceback.print_exc()
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())