Spaces:

marcosremar2
/

cosyvoice

Configuration error

cosyvoice / final_tts_test.py

Marcos Remar

Initial CosyVoice code without binary files

0ea7b2a about 2 months ago

3.92 kB

	#!/usr/bin/env python3
"""Smoke test for CosyVoice zero-shot text-to-speech.

Loads the CosyVoice model, synthesizes one sentence each in Portuguese,
English and Chinese to WAV files in the current directory, exercises the
streaming mode, and finally lists the generated ``output_*.wav`` files.

The process exit status is 0 when every step succeeds and 1 when any step
raises, so the script can be used from a shell pipeline or CI job.
"""
import os
import sys
import traceback

# Extend the import path before any CosyVoice import happens.
# NOTE(review): presumably CosyVoice needs the Matcha-TTS sources from this
# directory; the path is relative, so run the script from the repo root.
sys.path.append('third_party/Matcha-TTS')

# Model directory and reference prompt used by every test below.
MODEL_PATH = 'pretrained_models/CosyVoice-300M-direct'
PROMPT_WAV = 'asset/zero_shot_prompt.wav'
PROMPT_SAMPLE_RATE = 16000

SEPARATOR = '=' * 50

# One entry per language: text to synthesize, prompt text passed to the
# zero-shot call, and the WAV file the result is written to.
TESTS = (
    {
        'lang': 'Portuguese',
        'text': 'Olá! Bem-vindo ao teste de síntese de voz do CosyVoice. Este sistema converte texto em fala de forma natural.',
        'prompt': 'Teste de voz em português.',
        'output': 'output_portuguese.wav',
    },
    {
        'lang': 'English',
        'text': 'Hello! This is a test of the CosyVoice text-to-speech system. It can generate natural sounding speech.',
        'prompt': 'Testing English speech synthesis.',
        'output': 'output_english.wav',
    },
    {
        'lang': 'Chinese',
        'text': '你好！这是CosyVoice语音合成系统的测试。它可以生成自然的语音。',
        'prompt': '测试中文语音合成。',
        'output': 'output_chinese.wav',
    },
)


def _synthesize_to_file(cosyvoice, text, prompt_text, prompt_speech, output_path):
    """Synthesize *text* and write the complete utterance to *output_path*.

    The zero-shot generator is iterated to exhaustion because it may yield
    more than one result per call (presumably one per text segment -- confirm
    against the CosyVoice documentation). All results are joined along the
    sample axis before saving, so a later result never overwrites an earlier
    one in the output file.

    Returns:
        Duration of the saved audio in seconds.

    Raises:
        RuntimeError: if the generator yields no audio at all.
    """
    import torch
    import torchaudio

    segments = [
        result['tts_speech']
        for result in cosyvoice.inference_zero_shot(
            text, prompt_text, prompt_speech, stream=False
        )
    ]
    if not segments:
        raise RuntimeError(f"No audio was generated for {output_path}")

    # Samples live on dim 1 (the same axis the duration is computed from).
    waveform = segments[0] if len(segments) == 1 else torch.cat(segments, dim=1)
    torchaudio.save(output_path, waveform, cosyvoice.sample_rate)
    return waveform.shape[1] / cosyvoice.sample_rate


def _run_language_tests(cosyvoice, prompt_speech):
    """Synthesize every entry of ``TESTS`` and print a short report for each."""
    for test in TESTS:
        print(f"\n{SEPARATOR}")
        print(f"Testing {test['lang']}:")
        print(f"Text: {test['text']}")
        print("Generating speech...")

        duration = _synthesize_to_file(
            cosyvoice, test['text'], test['prompt'], prompt_speech, test['output']
        )
        size_kb = os.path.getsize(test['output']) / 1024

        print(f"✓ Generated: {test['output']}")
        print(f" Duration: {duration:.2f} seconds")
        print(f" Size: {size_kb:.1f} KB")
        print(f" Sample rate: {cosyvoice.sample_rate} Hz")


def _run_streaming_test(cosyvoice, prompt_speech):
    """Run one streaming synthesis, report each chunk, and save the first one."""
    import torchaudio

    print(f"\n{SEPARATOR}")
    print("Testing streaming mode:")
    text = "Este é um teste do modo streaming, que permite gerar áudio em tempo real com baixa latência."
    print(f"Text: {text}")

    chunk_count = 0
    total_duration = 0
    chunks = cosyvoice.inference_zero_shot(
        text, "Modo streaming.", prompt_speech, stream=True
    )
    for chunk_count, chunk in enumerate(chunks, start=1):
        speech = chunk['tts_speech']
        chunk_duration = speech.shape[1] / cosyvoice.sample_rate
        total_duration += chunk_duration
        print(f" Chunk {chunk_count}: {chunk_duration:.3f}s")

        # Only the first chunk is kept on disk, as an example.
        if chunk_count == 1:
            torchaudio.save('output_streaming_chunk1.wav', speech, cosyvoice.sample_rate)

    print(f"✓ Generated {chunk_count} chunks, total duration: {total_duration:.2f}s")


def _print_generated_files():
    """List every ``output_*.wav`` file in the current directory with its size."""
    print("\n📁 Generated files:")
    for name in sorted(os.listdir('.')):
        if name.startswith('output_') and name.endswith('.wav'):
            size = os.path.getsize(name) / 1024
            print(f" - {name} ({size:.1f} KB)")


def main():
    """Run the full TTS smoke test.

    Returns:
        0 if every step completed, 1 if any step raised. Errors are printed
        together with a traceback rather than propagated.
    """
    print("=== CosyVoice TTS Final Test ===\n")

    try:
        # Imported here so that a missing dependency is reported through the
        # same error path as a runtime failure.
        from cosyvoice.cli.cosyvoice import CosyVoice
        from cosyvoice.utils.file_utils import load_wav

        print(f"Loading model from: {MODEL_PATH}")
        cosyvoice = CosyVoice(MODEL_PATH, load_jit=False, load_trt=False, fp16=False)
        print("✓ Model loaded successfully!")

        prompt_speech_16k = load_wav(PROMPT_WAV, PROMPT_SAMPLE_RATE)
        print("✓ Prompt audio loaded")

        _run_language_tests(cosyvoice, prompt_speech_16k)
        _run_streaming_test(cosyvoice, prompt_speech_16k)

        print(f"\n{SEPARATOR}")
        print("✅ All tests completed successfully!")

        _print_generated_files()

        print("\n💡 To download files to your local machine:")
        # NOTE(review): the user@host part of this hint looks redacted or
        # mangled; confirm the real address before relying on the command.
        print("scp -P 40053 -i ~/.ssh/id_ed25519 [email protected]:~/CosyVoice/output_*.wav .")

    except Exception as e:
        # Top-level boundary of a test script: report everything, then
        # signal failure through the return value.
        print(f"\n❌ Error: {e}")
        traceback.print_exc()
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())