File size: 4,070 Bytes
0ea7b2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3
import sys
import os

# All paths below are relative, so the script has to be started from the
# directory that contains `third_party/`, `pretrained_models/` and `asset/`.
MODEL_ID = 'iic/CosyVoice2-0.5B'
MODEL_DIR = 'pretrained_models/CosyVoice2-0.5B'
PROMPT_WAV = 'asset/zero_shot_prompt.wav'
# A model directory holding fewer entries than this is treated as a missing
# or incomplete download and triggers a (re-)download.
MIN_MODEL_FILES = 5


def _ensure_model(model_dir: str) -> None:
    """Download the CosyVoice2-0.5B snapshot into *model_dir* if it is not there yet.

    The directory counts as present when it exists and contains at least
    ``MIN_MODEL_FILES`` entries. ``modelscope`` is imported only when a
    download is actually required.
    """
    if os.path.exists(model_dir) and len(os.listdir(model_dir)) >= MIN_MODEL_FILES:
        print("Model already exists!")
        return

    print("Downloading CosyVoice2-0.5B model...")
    from modelscope import snapshot_download

    snapshot_download(MODEL_ID, local_dir=model_dir)
    print("Model downloaded!")


def _synthesize_to_files(cosyvoice, language, tag, text, prompt_text, prompt_speech) -> None:
    """Run non-streaming zero-shot synthesis and save every result as a WAV file.

    Args:
        cosyvoice: Initialised CosyVoice2 model object.
        language: Human-readable language name used in the progress message.
        tag: Short code embedded in the output name ``cosyvoice2_<tag>_<i>.wav``.
        text: Text to synthesize.
        prompt_text: Prompt text passed to ``inference_zero_shot``.
        prompt_speech: Prompt audio as returned by ``load_wav``.
    """
    import torchaudio

    print(f"\nGenerating {language} speech: '{text}'")
    results = cosyvoice.inference_zero_shot(text, prompt_text, prompt_speech, stream=False)
    for index, result in enumerate(results):
        speech = result['tts_speech']
        output_file = f'cosyvoice2_{tag}_{index}.wav'
        torchaudio.save(output_file, speech, cosyvoice.sample_rate)
        size_kb = os.path.getsize(output_file) / 1024
        # Dimension 1 is taken as the sample count (presumably the tensor is
        # (channels, samples) -- confirm against the CosyVoice API).
        duration = speech.shape[1] / cosyvoice.sample_rate
        print(f"✓ Saved: {output_file} ({size_kb:.1f} KB, {duration:.1f}s)")


def _count_stream_chunks(cosyvoice, text, prompt_text, prompt_speech) -> int:
    """Run streaming zero-shot synthesis, print each chunk's duration, return the chunk count."""
    print("Generating chunks:")
    chunk_count = 0  # stays 0 when the generator yields nothing
    chunks = cosyvoice.inference_zero_shot(text, prompt_text, prompt_speech, stream=True)
    for chunk_count, chunk in enumerate(chunks, start=1):
        print(f"  Chunk {chunk_count}: {chunk['tts_speech'].shape[1]/cosyvoice.sample_rate:.2f}s")
    return chunk_count


def _run_tests() -> None:
    """Fetch the model if needed, then run the four synthesis checks in order."""
    _ensure_model(MODEL_DIR)

    # Imported lazily so that a missing or broken installation is reported by
    # the caller's error handler instead of aborting at module import time.
    from cosyvoice.cli.cosyvoice import CosyVoice2
    from cosyvoice.utils.file_utils import load_wav

    print("\nInitializing CosyVoice2 model...")
    cosyvoice = CosyVoice2(MODEL_DIR, load_jit=False, load_trt=False, load_vllm=False, fp16=False)

    # Load prompt audio
    prompt_speech_16k = load_wav(PROMPT_WAV, 16000)

    # Test 1: Portuguese
    text_pt = "Olá! Este é um teste do CosyVoice dois. Ele suporta múltiplos idiomas incluindo português."
    prompt_pt = "Testando síntese de voz."
    _synthesize_to_files(cosyvoice, "Portuguese", "pt", text_pt, prompt_pt, prompt_speech_16k)

    # Test 2: English
    text_en = "Hello! This is CosyVoice two point zero. It has improved accuracy and lower latency."
    _synthesize_to_files(cosyvoice, "English", "en", text_en, "Testing voice synthesis.", prompt_speech_16k)

    # Test 3: Chinese
    text_zh = "你好!这是通义语音大模型第二代,支持超低延迟的流式合成。"
    _synthesize_to_files(cosyvoice, "Chinese", "zh", text_zh, "测试语音合成。", prompt_speech_16k)

    # Test 4: Streaming mode (chunks are only measured, not written to disk)
    print("\nTesting streaming mode...")
    text_stream = "Este é um teste do modo de streaming, que permite síntese em tempo real com latência muito baixa."
    chunk_count = _count_stream_chunks(cosyvoice, text_stream, prompt_pt, prompt_speech_16k)

    print("\n✓ CosyVoice2 test completed successfully!")
    print(f"Generated {chunk_count} streaming chunks")

    # List every matching WAV in the current directory; this can include files
    # left behind by earlier runs.
    print("\nGenerated files:")
    for name in sorted(os.listdir('.')):
        if name.startswith('cosyvoice2_') and name.endswith('.wav'):
            print(f"  - {name}")


def main() -> int:
    """Run the CosyVoice2 smoke test.

    Returns:
        0 when every step completed, 1 when any step raised. The error and its
        traceback are printed before returning, so a failure is visible both
        in the output and in the process exit status.
    """
    print("=== CosyVoice2 TTS Test ===\n")

    # Add path (presumably required by the cosyvoice package's own imports --
    # confirm against the CosyVoice setup instructions).
    sys.path.append('third_party/Matcha-TTS')

    try:
        _run_tests()
    except Exception as e:
        # Top-level boundary of a test script: report everything, then fail.
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()

        message = str(e)
        if "CosyVoice2" in message:
            # NOTE(review): no CosyVoice 1.0 fallback is attempted by this
            # script; the wording is kept in case an outer wrapper relies on it.
            print("\n⚠️  CosyVoice2 may not be available. Trying CosyVoice 1.0...")
        elif "downloading" in message.lower():
            print("\n⚠️  Model download in progress. Please wait...")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())