Spaces:

marcosremar2
/

cosyvoice

Configuration error

cosyvoice / test_cosyvoice2.py

Marcos Remar

Initial CosyVoice code without binary files

0ea7b2a about 1 month ago

4.07 kB

	#!/usr/bin/env python3
"""Smoke test for CosyVoice2 zero-shot text-to-speech.

The script makes sure the CosyVoice2-0.5B model directory is populated
(downloading it through ``modelscope`` when it is not), builds a
``CosyVoice2`` instance, synthesizes one Portuguese, one English and one
Chinese sentence to WAV files with ``inference_zero_shot``, and finally runs
the same inference with ``stream=True`` and counts the yielded chunks.

All paths are relative to the current working directory. The process exits
with status 0 on success and 1 when any step raises.
"""
import os
import sys
import traceback

# Identifier passed to modelscope's snapshot_download and where it is stored.
MODEL_ID = 'iic/CosyVoice2-0.5B'
MODEL_DIR = 'pretrained_models/CosyVoice2-0.5B'
# Heuristic: a model directory with fewer entries than this is treated as an
# incomplete download and fetched again.
MIN_MODEL_FILES = 5

# Prompt recording handed to every inference call, and the second argument
# given to load_wav (presumably the target sample rate in Hz -- confirm
# against cosyvoice.utils.file_utils.load_wav).
PROMPT_WAV = 'asset/zero_shot_prompt.wav'
PROMPT_SAMPLE_RATE = 16000

PROMPT_PT = "Testando síntese de voz."

# (language label, output file tag, text to synthesize, prompt text)
ZERO_SHOT_CASES = (
    (
        "Portuguese",
        "pt",
        "Olá! Este é um teste do CosyVoice dois. Ele suporta múltiplos idiomas incluindo português.",
        PROMPT_PT,
    ),
    (
        "English",
        "en",
        "Hello! This is CosyVoice two point zero. It has improved accuracy and lower latency.",
        "Testing voice synthesis.",
    ),
    (
        "Chinese",
        "zh",
        "你好！这是通义语音大模型第二代，支持超低延迟的流式合成。",
        "测试语音合成。",
    ),
)

STREAM_TEXT = (
    "Este é um teste do modo de streaming, que permite síntese em tempo real "
    "com latência muito baixa."
)


def model_is_present(model_dir, min_files=MIN_MODEL_FILES):
    """Return True when *model_dir* is a directory with at least *min_files* entries.

    A path that does not exist, or exists but is not a directory, counts as
    "not present" instead of raising from ``os.listdir``.
    """
    return os.path.isdir(model_dir) and len(os.listdir(model_dir)) >= min_files


def ensure_model(model_dir=MODEL_DIR, model_id=MODEL_ID):
    """Download the model into *model_dir* unless it already looks complete."""
    if model_is_present(model_dir):
        print("Model already exists!")
        return

    # Imported lazily so modelscope is only touched when a download is needed.
    from modelscope import snapshot_download

    print("Downloading CosyVoice2-0.5B model...")
    snapshot_download(model_id, local_dir=model_dir)
    print("Model downloaded!")


def synthesize_to_files(cosyvoice, text, prompt_text, prompt_speech, prefix):
    """Run non-streaming zero-shot inference and save each result as a WAV file.

    Args:
        cosyvoice: object exposing ``inference_zero_shot(...)`` and
            ``sample_rate`` the way they are used below.
        text: text to synthesize.
        prompt_text: prompt text passed as the second inference argument.
        prompt_speech: prompt audio returned by ``load_wav``.
        prefix: output files are named ``{prefix}_{index}.wav``.

    Returns:
        The list of file names written, in generation order.
    """
    import torchaudio

    saved = []
    results = cosyvoice.inference_zero_shot(text, prompt_text, prompt_speech, stream=False)
    for index, result in enumerate(results):
        speech = result['tts_speech']
        output_file = f'{prefix}_{index}.wav'
        torchaudio.save(output_file, speech, cosyvoice.sample_rate)
        size_kb = os.path.getsize(output_file) / 1024
        # shape[1] is divided by the sample rate to report seconds, exactly as
        # the original script did.
        duration = speech.shape[1] / cosyvoice.sample_rate
        print(f"✓ Saved: {output_file} ({size_kb:.1f} KB, {duration:.1f}s)")
        saved.append(output_file)
    return saved


def run_streaming_test(cosyvoice, text, prompt_text, prompt_speech):
    """Run zero-shot inference with ``stream=True`` and report every chunk.

    Returns:
        The number of chunks yielded (0 when the generator yields nothing).
    """
    print("\nTesting streaming mode...")
    print("Generating chunks:")
    chunk_count = 0
    chunks = cosyvoice.inference_zero_shot(text, prompt_text, prompt_speech, stream=True)
    for chunk_count, chunk in enumerate(chunks, start=1):
        seconds = chunk['tts_speech'].shape[1] / cosyvoice.sample_rate
        print(f" Chunk {chunk_count}: {seconds:.2f}s")
    return chunk_count


def failure_hint(exc):
    """Return an extra hint line for a known failure message, or None.

    The match is a plain substring test on ``str(exc)``, as in the original
    script. The CosyVoice2 hint no longer claims that a CosyVoice 1.0
    fallback is being tried, because no such fallback exists here.
    """
    message = str(exc)
    if "CosyVoice2" in message:
        return "\n⚠️ CosyVoice2 may not be available in this installation."
    if "downloading" in message.lower():
        return "\n⚠️ Model download in progress. Please wait..."
    return None


def run_tests():
    """Execute the download, the three language tests and the streaming test."""
    ensure_model(MODEL_DIR)

    # Project imports stay inside the function so that an ImportError is
    # reported by main() like any other failure.
    from cosyvoice.cli.cosyvoice import CosyVoice2
    from cosyvoice.utils.file_utils import load_wav

    print("\nInitializing CosyVoice2 model...")
    cosyvoice = CosyVoice2(
        MODEL_DIR, load_jit=False, load_trt=False, load_vllm=False, fp16=False
    )

    prompt_speech_16k = load_wav(PROMPT_WAV, PROMPT_SAMPLE_RATE)

    generated = []
    for language, tag, text, prompt_text in ZERO_SHOT_CASES:
        print(f"\nGenerating {language} speech: '{text}'")
        generated += synthesize_to_files(
            cosyvoice, text, prompt_text, prompt_speech_16k, f'cosyvoice2_{tag}'
        )

    chunk_count = run_streaming_test(cosyvoice, STREAM_TEXT, PROMPT_PT, prompt_speech_16k)

    print("\n✓ CosyVoice2 test completed successfully!")
    print(f"Generated {chunk_count} streaming chunks")

    # Report only what this run wrote; scanning the directory would also list
    # stale cosyvoice2_*.wav files left over from earlier runs.
    print("\nGenerated files:")
    for name in sorted(generated):
        print(f" - {name}")


def main():
    """Run the smoke test and return a process exit status (0 ok, 1 failed)."""
    print("=== CosyVoice2 TTS Test ===\n")

    # Add path
    sys.path.append('third_party/Matcha-TTS')

    try:
        run_tests()
    except Exception as exc:  # top-level boundary: report, then fail the run
        print(f"\nError: {exc}")
        traceback.print_exc()
        hint = failure_hint(exc)
        if hint:
            print(hint)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())