Spaces:
Configuration error
Configuration error
#!/usr/bin/env python3
"""Smoke test for CosyVoice2 zero-shot text-to-speech.

The script fetches the ``iic/CosyVoice2-0.5B`` snapshot when the local model
directory is missing or holds fewer than five entries, synthesizes one
Portuguese, one English and one Chinese sentence to ``cosyvoice2_<tag>_<n>.wav``
files in the current directory, and finally runs one streaming synthesis,
reporting the duration of every chunk.

All paths are relative to the current working directory (presumably the
CosyVoice repository root -- confirm against how the script is launched).

The process exit status is 0 when every step succeeds and 1 otherwise.
"""
import os
import sys
import traceback

MODEL_ID = 'iic/CosyVoice2-0.5B'
MODEL_DIR = 'pretrained_models/CosyVoice2-0.5B'
PROMPT_WAV = 'asset/zero_shot_prompt.wav'
# A model directory with fewer entries than this is treated as missing or
# incomplete and triggers a (re-)download.
MIN_MODEL_ENTRIES = 5


def main():
    """Run the CosyVoice2 smoke test.

    Returns:
        int: 0 if all syntheses completed, 1 if any exception was raised
        (the error and its traceback are printed before returning).
    """
    print("=== CosyVoice2 TTS Test ===\n")

    # Add path
    sys.path.append('third_party/Matcha-TTS')

    try:
        model_dir = MODEL_DIR
        if not os.path.exists(model_dir) or len(os.listdir(model_dir)) < MIN_MODEL_ENTRIES:
            # modelscope is only needed for the download, so import it lazily;
            # an already-populated model directory then works without it.
            from modelscope import snapshot_download

            print("Downloading CosyVoice2-0.5B model...")
            snapshot_download(MODEL_ID, local_dir=model_dir)
            print("Model downloaded!")
        else:
            print("Model already exists!")

        # Import every heavy dependency before building the model so that a
        # missing package fails fast rather than after model initialization.
        from cosyvoice.cli.cosyvoice import CosyVoice2
        from cosyvoice.utils.file_utils import load_wav
        import torchaudio

        print("\nInitializing CosyVoice2 model...")
        cosyvoice = CosyVoice2(model_dir, load_jit=False, load_trt=False, load_vllm=False, fp16=False)

        # Load prompt audio
        prompt_speech_16k = load_wav(PROMPT_WAV, 16000)

        text_pt = "Olá! Este é um teste do CosyVoice dois. Ele suporta múltiplos idiomas incluindo português."
        prompt_pt = "Testando síntese de voz."
        text_en = "Hello! This is CosyVoice two point zero. It has improved accuracy and lower latency."
        text_zh = "你好!这是通义语音大模型第二代,支持超低延迟的流式合成。"

        # Tests 1-3: (language label, output file tag, text, prompt text).
        cases = (
            ("Portuguese", "pt", text_pt, prompt_pt),
            ("English", "en", text_en, "Testing voice synthesis."),
            ("Chinese", "zh", text_zh, "测试语音合成。"),
        )
        for language, tag, text, prompt_text in cases:
            print(f"\nGenerating {language} speech: '{text}'")
            results = cosyvoice.inference_zero_shot(text, prompt_text, prompt_speech_16k, stream=False)
            for index, result in enumerate(results):
                speech = result['tts_speech']
                output_file = f'cosyvoice2_{tag}_{index}.wav'
                torchaudio.save(output_file, speech, cosyvoice.sample_rate)
                size_kb = os.path.getsize(output_file) / 1024
                # shape[1] is used as the sample count (assumes a
                # (channels, samples) tensor -- TODO confirm).
                duration = speech.shape[1] / cosyvoice.sample_rate
                print(f"✓ Saved: {output_file} ({size_kb:.1f} KB, {duration:.1f}s)")

        # Test 4: Streaming mode
        print("\nTesting streaming mode...")
        text_stream = "Este é um teste do modo de streaming, que permite síntese em tempo real com latência muito baixa."
        print("Generating chunks:")
        # Stays 0 if the generator yields nothing.
        chunk_count = 0
        chunks = cosyvoice.inference_zero_shot(text_stream, prompt_pt, prompt_speech_16k, stream=True)
        for chunk_count, chunk in enumerate(chunks, start=1):
            print(f" Chunk {chunk_count}: {chunk['tts_speech'].shape[1]/cosyvoice.sample_rate:.2f}s")

        print("\n✓ CosyVoice2 test completed successfully!")
        print(f"Generated {chunk_count} streaming chunks")

        # List generated files
        print("\nGenerated files:")
        for name in sorted(os.listdir('.')):
            if name.startswith('cosyvoice2_') and name.endswith('.wav'):
                print(f" - {name}")
    except Exception as e:
        # Top-level boundary: report everything, then signal failure through
        # the return value instead of silently finishing with status 0.
        print(f"\nError: {e}")
        traceback.print_exc()
        message = str(e)
        if "CosyVoice2" in message:
            # NOTE(review): no CosyVoice 1.0 fallback is implemented in this
            # block; the message is kept unchanged -- confirm intent.
            print("\n⚠️ CosyVoice2 may not be available. Trying CosyVoice 1.0...")
        elif "downloading" in message.lower():
            print("\n⚠️ Model download in progress. Please wait...")
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())