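# NOTE: the next three lines appear to be page-banner residue captured along
# with the script, not part of the program; kept as comments so the file parses.
# Spaces:
# Configuration error
# Configuration error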
#!/usr/bin/env python3
"""Smoke test for CosyVoice text-to-speech.

This first part prints a banner, makes the bundled Matcha-TTS sources
importable, and waits for the CosyVoice-300M model files to appear on disk
before any synthesis is attempted.  The script exits with status 1 if the
files are still missing after the timeout.
"""
import sys
import os
import time

print("=== CosyVoice Text-to-Speech Test ===\n")

# Add Matcha-TTS to path (relative to the current working directory).
sys.path.append('third_party/Matcha-TTS')

# Files that must exist before the model is loaded.
required_files = [
    'pretrained_models/CosyVoice-300M/flow.pt',
    'pretrained_models/CosyVoice-300M/speech_tokenizer_v1.onnx',
    'pretrained_models/CosyVoice-300M/campplus.onnx',
]

print("Checking for required model files...")
max_wait = 300  # 5 minutes
poll_interval = 10  # seconds between existence checks

# A monotonic clock keeps the timeout correct even if the wall clock is
# adjusted (NTP sync, manual change) while we are waiting.
start_time = time.monotonic()
while True:
    missing_files = [f for f in required_files if not os.path.exists(f)]
    if not missing_files:
        print("All required files found!")
        break

    # The timeout is checked only after the existence check, so files that
    # arrive right at the deadline are still accepted.
    elapsed = time.monotonic() - start_time
    if elapsed > max_wait:
        print(f"Timeout waiting for files. Missing: {missing_files}")
        sys.exit(1)

    print(f"Waiting for downloads to complete... ({int(elapsed)}s elapsed)")
    time.sleep(poll_interval)
# Paths used by the tests below (relative to the current working directory).
PROMPT_WAV = 'asset/zero_shot_prompt.wav'
ZERO_SHOT_MODEL_DIR = 'pretrained_models/CosyVoice-300M'
SFT_MODEL_DIR = 'pretrained_models/CosyVoice-300M-SFT'

# (language label, text to synthesize, prompt text) for each zero-shot test.
# The lower-cased label is used in the output file name.
# NOTE(review): zero-shot cloning presumably works best when the prompt text
# is the transcript of the prompt audio; these strings are not -- confirm
# against the CosyVoice documentation.
ZERO_SHOT_CASES = [
    (
        "Portuguese",
        "Olá, este é um teste de síntese de voz em português brasileiro.",
        "Testando a conversão de texto para fala.",
    ),
    (
        "Chinese",
        "你好,我是通义生成式语音大模型,很高兴为您服务。",
        "希望这个测试能够成功。",
    ),
    (
        "English",
        "Hello, this is a test of the CosyVoice text-to-speech system. It supports multiple languages.",
        "Testing voice synthesis.",
    ),
]

try:
    # Imported inside the try so that a missing or broken installation is
    # reported by the handler at the bottom instead of a bare traceback.
    from cosyvoice.cli.cosyvoice import CosyVoice
    from cosyvoice.utils.file_utils import load_wav
    import torchaudio

    print("\n1. Testing Zero-Shot Voice Cloning")
    print("-" * 40)

    # Initialize model
    cosyvoice = CosyVoice(ZERO_SHOT_MODEL_DIR, load_jit=False, load_trt=False, fp16=False)

    if os.path.exists(PROMPT_WAV):
        # Load the prompt audio once and reuse it for every language.
        prompt_speech_16k = load_wav(PROMPT_WAV, 16000)

        for case_index, (language, text, prompt_text) in enumerate(ZERO_SHOT_CASES):
            # Every announcement except the first is preceded by a blank line.
            lead = "" if case_index == 0 else "\n"
            print(f"{lead}Generating {language} speech: '{text}'")

            segments = cosyvoice.inference_zero_shot(
                text, prompt_text, prompt_speech_16k, stream=False
            )
            # Each yielded item carries its audio under 'tts_speech'; every
            # item is written to its own numbered file.
            for i, segment in enumerate(segments):
                output_file = f'output_{language.lower()}_{i}.wav'
                torchaudio.save(output_file, segment['tts_speech'], cosyvoice.sample_rate)
                print(f"✓ Saved: {output_file}")
    else:
        # Say so explicitly rather than announcing generations that never run.
        print(f"Prompt audio not found, skipping zero-shot tests: {PROMPT_WAV}")

    print("\n2. Testing SFT Mode (if available)")
    print("-" * 40)

    # Try SFT model if available
    if os.path.exists(f'{SFT_MODEL_DIR}/cosyvoice.yaml'):
        try:
            cosyvoice_sft = CosyVoice(SFT_MODEL_DIR, load_jit=False, load_trt=False, fp16=False)

            # List available speakers
            speakers = cosyvoice_sft.list_available_spks()
            print(f"Available speakers: {speakers[:5]}...")  # Show first 5

            # Generate with first available speaker
            if speakers:
                text = "Este é um teste usando o modelo SFT com falantes pré-definidos."
                speaker = speakers[0]
                print(f"\nGenerating with speaker '{speaker}': '{text}'")
                for i, segment in enumerate(cosyvoice_sft.inference_sft(text, speaker, stream=False)):
                    output_file = f'output_sft_{i}.wav'
                    torchaudio.save(output_file, segment['tts_speech'], cosyvoice_sft.sample_rate)
                    print(f"✓ Saved: {output_file}")
        except Exception as e:
            # Deliberately best-effort: the SFT model is optional, so any
            # failure here is reported and the run still counts as a success.
            print(f"SFT model not ready yet: {e}")

    print("\n=== Test completed successfully! ===")
    print("\nGenerated audio files:")
    # Sorted so the listing is deterministic (os.listdir order is arbitrary).
    for name in sorted(os.listdir('.')):
        if name.startswith('output_') and name.endswith('.wav'):
            size = os.path.getsize(name) / 1024
            print(f" - {name} ({size:.1f} KB)")
except Exception as e:
    print(f"\nError during test: {e}")
    import traceback
    traceback.print_exc()
    # Signal failure to the caller, matching the timeout path's exit status.
    sys.exit(1)