cosyvoice / english_tts_test_timed.py
Marcos Remar
Add test scripts for CosyVoice 1.0 (300M model)
b65e164
#!/usr/bin/env python3
import os
import sys
import time
# Timed smoke test: load the CosyVoice model, synthesize one English passage to
# a WAV file, and report how long model loading and audio generation took.

# Set HF_ENDPOINT to the mirror before the model library is imported below.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# Inputs and outputs of the run, kept together so they are easy to change.
model_path = 'pretrained_models/CosyVoice-300M-direct'
# Text to synthesize. A plain "!" is used: "\!" is not a valid Python escape
# sequence, so the backslash would stay in the text handed to the model (and
# newer interpreters warn about it).
text = "Hello! This is a test of the CosyVoice text-to-speech system. The synthesis is working perfectly and generating high quality audio."
prompt_text = "Welcome to the speech synthesis demonstration."
output_file = "english_test_output.wav"

print("=== CosyVoice English TTS Test with Timing ===")
print()

start_time = time.time()
# Exit status reported when run as a script; set to 1 if the run fails.
exit_code = 0

try:
    from cosyvoice.cli.cosyvoice import CosyVoice
    import torchaudio

    # Stop immediately if the model directory is missing.
    if not os.path.exists(model_path):
        print(f"❌ Error: Model not found at {model_path}")
        sys.exit(1)

    # Measure model loading time.
    load_start = time.time()
    print("Loading CosyVoice model...")
    cosyvoice = CosyVoice(model_path, load_jit=False, load_trt=False, fp16=False)
    load_time = time.time() - load_start
    print(f"✅ Model loaded in {load_time:.2f} seconds")
    print()

    print(f"Text: {text}")
    print(f"Prompt: {prompt_text}")
    print()

    # Measure generation time.
    gen_start = time.time()
    print("Generating audio...")
    # NOTE(review): None is passed as the third argument, which looks like the
    # prompt-audio slot of the zero-shot API -- confirm the model accepts that.
    # Only the first yielded result is saved, exactly as before.
    result = next(
        iter(cosyvoice.inference_zero_shot(text, prompt_text, None, stream=False)),
        None,
    )
    if result is None:
        # Fail loudly instead of printing "generated" with nothing written.
        raise RuntimeError("inference_zero_shot produced no audio")
    torchaudio.save(output_file, result['tts_speech'], cosyvoice.sample_rate)
    gen_time = time.time() - gen_start
    print(f"✅ Audio generated in {gen_time:.2f} seconds")

    # Report statistics about the generated file.
    if os.path.exists(output_file):
        size = os.path.getsize(output_file)
        # Second dimension of the saved tensor divided by the sample rate.
        duration = result['tts_speech'].shape[1] / cosyvoice.sample_rate
        print()
        print("📊 File statistics:")
        print(f" - Filename: {output_file}")
        print(f" - Size: {size/1024:.1f} KB")
        print(f" - Duration: {duration:.2f} seconds")
        print(f" - Sample rate: {cosyvoice.sample_rate} Hz")

    total_time = time.time() - start_time
    print()
    print(f"⏱️ Total execution time: {total_time:.2f} seconds")
    print(f" - Model loading: {load_time:.2f}s ({load_time/total_time*100:.1f}%)")
    print(f" - Audio generation: {gen_time:.2f}s ({gen_time/total_time*100:.1f}%)")
except Exception as e:
    # Top-level boundary of the script: report the failure with a traceback
    # and how long the run lasted before it failed.
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()
    total_time = time.time() - start_time
    print(f"\nTotal time before error: {total_time:.2f} seconds")
    exit_code = 1

# Surface failures through the exit status, matching the missing-model path,
# but only when executed as a script so importing the module never exits here.
if __name__ == "__main__":
    sys.exit(exit_code)