File size: 3,450 Bytes
0ea7b2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python3
import os
import sys

print("=== Basic CosyVoice TTS Test ===\n")

# Directory (relative to the current working directory) where the git-cloned
# checkpoint is looked up. Later parts of the script read this name too.
git_model = "pretrained_models/CosyVoice-300M-git"

# Inventory step: when the directory exists, report how many entries it holds
# and, for each expected artifact, either its size in MB or that it is missing.
if os.path.exists(git_model):
    print(f"Checking {git_model}...")
    present = os.listdir(git_model)
    print(f"Found {len(present)} files")

    for required in ('speech_tokenizer_v1.onnx', 'flow.pt', 'campplus.onnx', 'cosyvoice.yaml'):
        if required not in present:
            print(f"  ✗ {required} (missing)")
            continue
        # Bytes -> mebibytes for a human-friendly figure.
        megabytes = os.path.getsize(os.path.join(git_model, required)) / (1024*1024)
        print(f"  ✓ {required} ({megabytes:.1f} MB)")

# Import test + zero-shot synthesis smoke test.
#
# Steps:
#   1. Put third_party/Matcha-TTS on the import path (presumably required by
#      the cosyvoice package -- confirm against the project layout).
#   2. Import the runtime dependencies and report success.
#   3. If the checkpoint contains speech_tokenizer_v1.onnx, run one zero-shot
#      inference and write each result to test_basic_<i>.wav.
#   4. On any failure, print the error plus a targeted hint where possible.
sys.path.append('third_party/Matcha-TTS')

try:
    print("\nTesting imports...")
    # torch and onnxruntime are not used below; importing them only verifies
    # that they are installed in the active environment.
    import torch
    import torchaudio
    import onnxruntime
    from cosyvoice.utils.file_utils import load_wav
    print("✓ All imports successful")

    # The speech tokenizer file is used as the marker for a usable checkpoint.
    if os.path.exists(os.path.join(git_model, 'speech_tokenizer_v1.onnx')):
        print(f"\nUsing model from: {git_model}")

        from cosyvoice.cli.cosyvoice import CosyVoice

        # Initialize with git model
        cosyvoice = CosyVoice(git_model, load_jit=False, load_trt=False, fp16=False)

        # Load example audio (second argument is presumably the target sample
        # rate in Hz -- confirm against load_wav).
        prompt_speech = load_wav('asset/zero_shot_prompt.wav', 16000)

        # Simple test
        text = "Teste de síntese de voz. Um, dois, três, testando!"
        prompt = "Olá, teste."

        print(f"\nGenerating: '{text}'")

        # Track every file actually written so the summary below never relies
        # on a loop variable that is unbound when nothing was generated.
        generated_files = []
        for i, j in enumerate(cosyvoice.inference_zero_shot(text, prompt, prompt_speech, stream=False)):
            output_file = f'test_basic_{i}.wav'
            torchaudio.save(output_file, j['tts_speech'], cosyvoice.sample_rate)

            # Get info
            # assumes tts_speech is laid out as (channels, samples) -- TODO confirm
            duration = j['tts_speech'].shape[1] / cosyvoice.sample_rate
            size = os.path.getsize(output_file) / 1024

            print(f"✓ Generated: {output_file}")
            print(f"  Duration: {duration:.2f}s, Size: {size:.1f} KB")
            print(f"  Sample rate: {cosyvoice.sample_rate} Hz")
            generated_files.append(output_file)

        if generated_files:
            print(f"\n✅ Success! Generated {len(generated_files)} audio file(s)")

            # Hints refer to the most recently written file, as before.
            last_output = generated_files[-1]

            # Try to play if possible
            print("\nTo play the audio on server:")
            print(f"  aplay {last_output}")
            print("\nTo download to local machine:")
            # NOTE(review): the user@host part of this hint looks like an
            # obfuscated placeholder; confirm the real SSH target.
            print(f"  scp -P 40053 [email protected]:~/CosyVoice/{last_output} .")
        else:
            # Previously this path printed a success message for 0 files and
            # then failed with a NameError on the unbound loop variable.
            print("\n⚠️  Inference finished but produced no audio output.")

    else:
        print("\n⚠️  Model files still downloading. Please wait...")
        print("You can check download progress with:")
        print("  ps aux | grep modelscope")

except Exception as e:
    # Top-level boundary of a diagnostic script: report the failure and, when
    # the message is recognisable, add a hint instead of a raw traceback.
    print(f"\n❌ Error: {e}")
    error_text = str(e)

    # Specific error handling
    if "speech_tokenizer" in error_text:
        print("\n💡 The speech tokenizer is missing. Checking alternative sources...")

        # List all .onnx files
        print("\nSearching for ONNX files:")
        for root, _dirs, filenames in os.walk('pretrained_models'):
            for f in filenames:
                if f.endswith('.onnx'):
                    print(f"  Found: {os.path.join(root, f)}")

    elif "No module named" in error_text:
        print("\n💡 Missing module. Make sure you're in the virtual environment:")
        print("  source cosyvoice_env/bin/activate")