#!/usr/bin/env python3
"""Smoke test for CosyVoice zero-shot text-to-speech.

Loads the model at ``MODEL_PATH``, synthesizes one utterance per entry in
``TESTS`` using a shared prompt recording, exercises streaming inference,
and lists the ``output_*.wav`` files found in the working directory.

The process exits with status 0 when every step succeeds and 1 when any
step raises, so the script can be used from shell pipelines and CI.
"""
import sys
import traceback
from pathlib import Path

# Make the bundled Matcha-TTS checkout importable before cosyvoice is loaded
# (presumably a dependency of cosyvoice -- confirm against the repo layout).
sys.path.append('third_party/Matcha-TTS')

MODEL_PATH = 'pretrained_models/CosyVoice-300M-direct'
PROMPT_WAV = 'asset/zero_shot_prompt.wav'
PROMPT_SAMPLE_RATE = 16000
SEPARATOR = '=' * 50

# One zero-shot synthesis case per language: text to speak, the prompt
# transcript handed to the model, and the file the audio is written to.
TESTS = [
    {
        'lang': 'Portuguese',
        'text': 'Olá! Bem-vindo ao teste de síntese de voz do CosyVoice. Este sistema converte texto em fala de forma natural.',
        'prompt': 'Teste de voz em português.',
        'output': 'output_portuguese.wav',
    },
    {
        'lang': 'English',
        'text': 'Hello! This is a test of the CosyVoice text-to-speech system. It can generate natural sounding speech.',
        'prompt': 'Testing English speech synthesis.',
        'output': 'output_english.wav',
    },
    {
        'lang': 'Chinese',
        'text': '你好!这是CosyVoice语音合成系统的测试。它可以生成自然的语音。',
        'prompt': '测试中文语音合成。',
        'output': 'output_chinese.wav',
    },
]

STREAMING_TEXT = "Este é um teste do modo streaming, que permite gerar áudio em tempo real com baixa latência."
STREAMING_PROMPT = "Modo streaming."
STREAMING_OUTPUT = 'output_streaming_chunk1.wav'


def _synthesize_to_file(cosyvoice, test, prompt_speech_16k):
    """Run non-streaming zero-shot synthesis for one test case and save it.

    Args:
        cosyvoice: Loaded model exposing ``inference_zero_shot`` and
            ``sample_rate``.
        test: One entry of ``TESTS``.
        prompt_speech_16k: Prompt audio as returned by ``load_wav``.

    Raises:
        RuntimeError: If the model yields no audio for the text.
    """
    # Imported lazily so a missing dependency is reported by main().
    import torch
    import torchaudio

    segments = [
        result['tts_speech']
        for result in cosyvoice.inference_zero_shot(
            test['text'], test['prompt'], prompt_speech_16k, stream=False
        )
    ]
    if not segments:
        raise RuntimeError(f"No audio was generated for {test['lang']}")

    # inference_zero_shot is a generator; if it yields several segments,
    # saving each one to the same path would keep only the last. Join them
    # along the sample axis (dim 1) so the file holds the whole utterance.
    speech = torch.cat(segments, dim=1)
    torchaudio.save(test['output'], speech, cosyvoice.sample_rate)

    duration = speech.shape[1] / cosyvoice.sample_rate
    size_kb = Path(test['output']).stat().st_size / 1024
    print(f"✓ Generated: {test['output']}")
    print(f" Duration: {duration:.2f} seconds")
    print(f" Size: {size_kb:.1f} KB")
    print(f" Sample rate: {cosyvoice.sample_rate} Hz")


def _run_streaming_test(cosyvoice, prompt_speech_16k):
    """Exercise streaming inference, reporting each chunk's duration.

    Only the first chunk is written to disk, as an example of the output.
    """
    import torchaudio

    print(f"\n{SEPARATOR}")
    print("Testing streaming mode:")
    print(f"Text: {STREAMING_TEXT}")

    chunk_count = 0
    total_duration = 0
    stream = cosyvoice.inference_zero_shot(
        STREAMING_TEXT, STREAMING_PROMPT, prompt_speech_16k, stream=True
    )
    for chunk_count, result in enumerate(stream, start=1):
        chunk = result['tts_speech']
        chunk_duration = chunk.shape[1] / cosyvoice.sample_rate
        total_duration += chunk_duration
        print(f" Chunk {chunk_count}: {chunk_duration:.3f}s")
        if chunk_count == 1:
            torchaudio.save(STREAMING_OUTPUT, chunk, cosyvoice.sample_rate)

    print(f"✓ Generated {chunk_count} chunks, total duration: {total_duration:.2f}s")


def _print_summary():
    """List every ``output_*.wav`` in the working directory with its size."""
    print("\n📁 Generated files:")
    for wav in sorted(Path('.').glob('output_*.wav')):
        size = wav.stat().st_size / 1024
        print(f" - {wav.name} ({size:.1f} KB)")

    print("\n💡 To download files to your local machine:")
    print("scp -P 40053 -i ~/.ssh/id_ed25519 root@213.192.2.74:~/CosyVoice/output_*.wav .")


def _run_all():
    """Load the model and prompt, then run every synthesis test in order."""
    from cosyvoice.cli.cosyvoice import CosyVoice
    from cosyvoice.utils.file_utils import load_wav

    print(f"Loading model from: {MODEL_PATH}")
    cosyvoice = CosyVoice(MODEL_PATH, load_jit=False, load_trt=False, fp16=False)
    print("✓ Model loaded successfully!")

    prompt_speech_16k = load_wav(PROMPT_WAV, PROMPT_SAMPLE_RATE)
    print("✓ Prompt audio loaded")

    for test in TESTS:
        print(f"\n{SEPARATOR}")
        print(f"Testing {test['lang']}:")
        print(f"Text: {test['text']}")
        print("Generating speech...")
        _synthesize_to_file(cosyvoice, test, prompt_speech_16k)

    _run_streaming_test(cosyvoice, prompt_speech_16k)

    print(f"\n{SEPARATOR}")
    print("✅ All tests completed successfully!")
    _print_summary()


def main():
    """Run the full test and return a process exit status.

    Returns:
        0 when every step succeeds; 1 when any step raises (the error and
        its traceback are printed first).
    """
    print("=== CosyVoice TTS Final Test ===\n")
    try:
        _run_all()
    except Exception as e:  # top-level boundary: report, then signal failure
        print(f"\n❌ Error: {e}")
        traceback.print_exc()
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())