#!/usr/bin/env python3
"""Smoke test for CosyVoice2 zero-shot text-to-speech.

When run as a script this:

1. downloads the ``iic/CosyVoice2-0.5B`` model into
   ``pretrained_models/CosyVoice2-0.5B`` unless that directory already holds
   at least five entries,
2. synthesizes one Portuguese, one English and one Chinese utterance with
   ``inference_zero_shot(..., stream=False)`` and writes each result to
   ``cosyvoice2_<lang>_<index>.wav`` in the current directory,
3. runs one more synthesis with ``stream=True`` and reports each chunk's
   duration,
4. lists the ``cosyvoice2_*.wav`` files found in the current directory.

The process exits with status 0 on success and 1 if any step raised, so
shells and CI jobs can detect a failed run.
"""
import os
import sys
import traceback
from typing import Optional

MODEL_ID = 'iic/CosyVoice2-0.5B'
MODEL_DIR = 'pretrained_models/CosyVoice2-0.5B'
PROMPT_WAV = 'asset/zero_shot_prompt.wav'

# Prompt text reused by the Portuguese test and by the streaming test.
PROMPT_PT = "Testando síntese de voz."

# (file tag, language label shown in the log, text to synthesize, prompt text)
ZERO_SHOT_CASES = (
    (
        'pt',
        'Portuguese',
        "Olá! Este é um teste do CosyVoice dois. Ele suporta múltiplos idiomas incluindo português.",
        PROMPT_PT,
    ),
    (
        'en',
        'English',
        "Hello! This is CosyVoice two point zero. It has improved accuracy and lower latency.",
        "Testing voice synthesis.",
    ),
    (
        'zh',
        'Chinese',
        "你好!这是通义语音大模型第二代,支持超低延迟的流式合成。",
        "测试语音合成。",
    ),
)

STREAM_TEXT = (
    "Este é um teste do modo de streaming, que permite síntese em tempo real "
    "com latência muito baixa."
)


def model_needs_download(model_dir: str, min_files: int = 5) -> bool:
    """Return True when *model_dir* is missing or has fewer than *min_files* entries.

    A directory with too few entries is downloaded again.
    NOTE(review): presumably this guards against an interrupted earlier
    download; the threshold of five comes from the original script.
    """
    return not os.path.exists(model_dir) or len(os.listdir(model_dir)) < min_files


def failure_hint(exc: BaseException) -> Optional[str]:
    """Return an extra hint to print for *exc*, or None when none applies.

    The decision is made purely on the exception's message text.
    """
    message = str(exc)
    if "CosyVoice2" in message:
        # NOTE: the message mentions a CosyVoice 1.0 fallback, but this script
        # does not implement one; the text is kept as-is for output stability.
        return "\n⚠️ CosyVoice2 may not be available. Trying CosyVoice 1.0..."
    if "downloading" in message.lower():
        return "\n⚠️ Model download in progress. Please wait..."
    return None


def synthesize_to_files(cosyvoice, save_audio, text, prompt_text, prompt_speech, tag):
    """Run non-streaming zero-shot synthesis and save every yielded result.

    Args:
        cosyvoice: Model object providing ``inference_zero_shot`` and
            ``sample_rate``.
        save_audio: Callable invoked as ``save_audio(path, speech, sample_rate)``
            (the script passes ``torchaudio.save``).
        text: Text to synthesize.
        prompt_text: Prompt text passed alongside the prompt audio.
        prompt_speech: Prompt audio as returned by ``load_wav``.
        tag: Short label embedded in the output file name.
    """
    results = cosyvoice.inference_zero_shot(text, prompt_text, prompt_speech, stream=False)
    for index, result in enumerate(results):
        output_file = f'cosyvoice2_{tag}_{index}.wav'
        speech = result['tts_speech']
        save_audio(output_file, speech, cosyvoice.sample_rate)
        size_kb = os.path.getsize(output_file) / 1024
        # Duration = number of samples along axis 1 divided by the sample rate.
        duration = speech.shape[1] / cosyvoice.sample_rate
        print(f"✓ Saved: {output_file} ({size_kb:.1f} KB, {duration:.1f}s)")


def main() -> int:
    """Run the whole smoke test and return a process exit status (0 or 1)."""
    print("=== CosyVoice2 TTS Test ===\n")

    # Add path
    sys.path.append('third_party/Matcha-TTS')

    try:
        # Third-party imports stay inside the try block so that a missing
        # package is reported through the same error path as a runtime failure.
        print("Downloading CosyVoice2-0.5B model...")
        from modelscope import snapshot_download

        if model_needs_download(MODEL_DIR):
            snapshot_download(MODEL_ID, local_dir=MODEL_DIR)
            print("Model downloaded!")
        else:
            print("Model already exists!")

        from cosyvoice.cli.cosyvoice import CosyVoice2
        from cosyvoice.utils.file_utils import load_wav
        import torchaudio

        print("\nInitializing CosyVoice2 model...")
        cosyvoice = CosyVoice2(
            MODEL_DIR, load_jit=False, load_trt=False, load_vllm=False, fp16=False
        )

        # Load prompt audio
        prompt_speech_16k = load_wav(PROMPT_WAV, 16000)

        # Tests 1-3: one non-streaming synthesis per language.
        for tag, language, text, prompt_text in ZERO_SHOT_CASES:
            print(f"\nGenerating {language} speech: '{text}'")
            synthesize_to_files(
                cosyvoice, torchaudio.save, text, prompt_text, prompt_speech_16k, tag
            )

        # Test 4: Streaming mode
        print("\nTesting streaming mode...")
        print("Generating chunks:")
        chunk_count = 0  # stays 0 if the generator yields nothing
        stream = cosyvoice.inference_zero_shot(
            STREAM_TEXT, PROMPT_PT, prompt_speech_16k, stream=True
        )
        for chunk_count, chunk in enumerate(stream, start=1):
            seconds = chunk['tts_speech'].shape[1] / cosyvoice.sample_rate
            print(f" Chunk {chunk_count}: {seconds:.2f}s")

        print("\n✓ CosyVoice2 test completed successfully!")
        print(f"Generated {chunk_count} streaming chunks")

        # List generated files
        print("\nGenerated files:")
        for name in sorted(os.listdir('.')):
            if name.startswith('cosyvoice2_') and name.endswith('.wav'):
                print(f" - {name}")
    except Exception as e:
        # Top-level boundary: report everything, then signal failure through
        # the exit status instead of silently finishing with status 0.
        print(f"\nError: {e}")
        traceback.print_exc()
        hint = failure_hint(e)
        if hint is not None:
            print(hint)
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())