cosyvoice / test_cosyvoice2.py
Marcos Remar
Initial CosyVoice code without binary files
0ea7b2a
#!/usr/bin/env python3
import sys
import os
# Local directory the CosyVoice2-0.5B snapshot is downloaded into / loaded from.
MODEL_DIR = 'pretrained_models/CosyVoice2-0.5B'


def ensure_model(model_dir):
    """Download the CosyVoice2-0.5B snapshot into *model_dir* if it is missing.

    The model counts as present when *model_dir* exists and holds at least
    five entries; otherwise ``snapshot_download`` is called to fetch it.

    Raises:
        ImportError: if ``modelscope`` is not installed.
        Exception: whatever ``snapshot_download`` raises on failure.
    """
    print("Downloading CosyVoice2-0.5B model...")
    # Third-party import kept local so importing this file never requires it.
    from modelscope import snapshot_download

    if not os.path.exists(model_dir) or len(os.listdir(model_dir)) < 5:
        snapshot_download('iic/CosyVoice2-0.5B', local_dir=model_dir)
        print("Model downloaded!")
    else:
        print("Model already exists!")


def synthesize_to_files(cosyvoice, text, prompt_text, prompt_speech, file_prefix):
    """Run non-streaming zero-shot synthesis and save each result as a WAV file.

    Args:
        cosyvoice: initialized model object exposing ``inference_zero_shot``
            and ``sample_rate``.
        text: text to synthesize.
        prompt_text: prompt text handed to ``inference_zero_shot``.
        prompt_speech: prompt audio handed to ``inference_zero_shot``.
        file_prefix: output files are named ``{file_prefix}_{index}.wav``.
    """
    import torchaudio

    results = cosyvoice.inference_zero_shot(
        text, prompt_text, prompt_speech, stream=False
    )
    for index, result in enumerate(results):
        output_file = f'{file_prefix}_{index}.wav'
        speech = result['tts_speech']
        torchaudio.save(output_file, speech, cosyvoice.sample_rate)
        size_kb = os.path.getsize(output_file) / 1024
        # Assumes tts_speech is laid out as (channels, samples), so that
        # shape[1] / sample_rate is a duration in seconds -- TODO confirm.
        duration = speech.shape[1] / cosyvoice.sample_rate
        print(f"✓ Saved: {output_file} ({size_kb:.1f} KB, {duration:.1f}s)")


def failure_hint(error):
    """Return a troubleshooting hint for *error*, or ``None`` if none applies.

    The hint is chosen by substring match on ``str(error)``.
    """
    message = str(error)
    if "CosyVoice2" in message:
        # NOTE(review): the text announces a CosyVoice 1.0 fallback, but this
        # script only prints the hint; no fallback is attempted.
        return "\n⚠️ CosyVoice2 may not be available. Trying CosyVoice 1.0..."
    if "downloading" in message.lower():
        return "\n⚠️ Model download in progress. Please wait..."
    return None


def run_tests(model_dir=MODEL_DIR):
    """Download the model if needed, then run the four synthesis checks.

    Three non-streaming zero-shot syntheses (Portuguese, English, Chinese) are
    written to ``cosyvoice2_<lang>_<n>.wav`` in the current directory, followed
    by one streaming synthesis whose chunk durations are printed.

    Raises:
        Exception: any error from the download, model load or inference is
            propagated to the caller.
    """
    ensure_model(model_dir)

    # Import CosyVoice2 (after sys.path has been extended by main()).
    from cosyvoice.cli.cosyvoice import CosyVoice2
    from cosyvoice.utils.file_utils import load_wav
    # Imported up front so a missing dependency fails before the model loads.
    import torchaudio  # noqa: F401

    print("\nInitializing CosyVoice2 model...")
    cosyvoice = CosyVoice2(
        model_dir, load_jit=False, load_trt=False, load_vllm=False, fp16=False
    )

    # Load prompt audio
    prompt_speech_16k = load_wav('asset/zero_shot_prompt.wav', 16000)

    prompt_pt = "Testando síntese de voz."
    # (language label, output file prefix, text to synthesize, prompt text)
    cases = [
        (
            "Portuguese",
            'cosyvoice2_pt',
            "Olá! Este é um teste do CosyVoice dois. Ele suporta múltiplos idiomas incluindo português.",
            prompt_pt,
        ),
        (
            "English",
            'cosyvoice2_en',
            "Hello! This is CosyVoice two point zero. It has improved accuracy and lower latency.",
            "Testing voice synthesis.",
        ),
        (
            "Chinese",
            'cosyvoice2_zh',
            "你好!这是通义语音大模型第二代,支持超低延迟的流式合成。",
            "测试语音合成。",
        ),
    ]
    for language, file_prefix, text, prompt_text in cases:
        print(f"\nGenerating {language} speech: '{text}'")
        synthesize_to_files(cosyvoice, text, prompt_text, prompt_speech_16k, file_prefix)

    # Streaming mode: chunks are only measured, not written to disk.
    print("\nTesting streaming mode...")
    text_stream = "Este é um teste do modo de streaming, que permite síntese em tempo real com latência muito baixa."
    print("Generating chunks:")
    chunk_count = 0  # stays 0 if the generator yields nothing
    chunks = cosyvoice.inference_zero_shot(
        text_stream, prompt_pt, prompt_speech_16k, stream=True
    )
    for chunk_count, chunk in enumerate(chunks, start=1):
        seconds = chunk['tts_speech'].shape[1] / cosyvoice.sample_rate
        print(f" Chunk {chunk_count}: {seconds:.2f}s")

    print("\n✓ CosyVoice2 test completed successfully!")
    print(f"Generated {chunk_count} streaming chunks")

    # List generated files (every matching WAV in the current directory,
    # including any left over from earlier runs).
    print("\nGenerated files:")
    for name in sorted(os.listdir('.')):
        if name.startswith('cosyvoice2_') and name.endswith('.wav'):
            print(f" - {name}")


def main():
    """Run the CosyVoice2 smoke test and return a process exit status.

    Returns:
        0 when every step succeeded, 1 when any step raised. The error, its
        traceback and (when one applies) a troubleshooting hint are printed
        before returning 1, so failures are visible to the shell and to CI.
    """
    print("=== CosyVoice2 TTS Test ===\n")
    # Add path
    sys.path.append('third_party/Matcha-TTS')
    try:
        run_tests()
    except Exception as e:  # top-level boundary: report, then signal failure
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        hint = failure_hint(e)
        if hint is not None:
            print(hint)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())