File size: 3,060 Bytes
0ea7b2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
import sys
sys.path.append('third_party/Matcha-TTS')

import os
import traceback
import zipfile

# Locations and file names this smoke test relies on (relative to the CWD).
MODELS_DIR = 'pretrained_models'
MODEL_PATH = 'pretrained_models/CosyVoice-300M'
REQUIRED_FILES = ('flow.pt', 'speech_tokenizer_v1.onnx', 'campplus.onnx')
PROMPT_WAV = 'asset/zero_shot_prompt.wav'


def list_available_models(models_dir, max_files=10):
    """Print every sub-directory of *models_dir* with up to *max_files* entries.

    The listing is purely informational, so a missing *models_dir* is
    reported on stdout instead of raising.
    """
    print("Available models:")
    if not os.path.isdir(models_dir):
        print(f"  (directory not found: {models_dir})")
        return
    # Sorted so the output (and which entries make the cut) is deterministic.
    for model in sorted(os.listdir(models_dir)):
        model_path = os.path.join(models_dir, model)
        if not os.path.isdir(model_path):
            continue
        print(f"\n{model}:")
        for entry in sorted(os.listdir(model_path))[:max_files]:
            print(f"  - {entry}")


def find_missing_files(model_path, required_files):
    """Return the names in *required_files* that do not exist under *model_path*."""
    return [
        name
        for name in required_files
        if not os.path.exists(os.path.join(model_path, name))
    ]


def extract_zip_archives(model_path):
    """Extract every ``*.zip`` directly inside *model_path* into *model_path*.

    Returns:
        The list of archive file names that were extracted.

    Raises:
        FileNotFoundError: if *model_path* does not exist.
        zipfile.BadZipFile: if an archive is corrupt (e.g. partially downloaded).
    """
    extracted = []
    for name in sorted(os.listdir(model_path)):
        if not name.endswith('.zip'):
            continue
        print(f"Found zip file: {name}")
        with zipfile.ZipFile(os.path.join(model_path, name), 'r') as zip_ref:
            zip_ref.extractall(model_path)
        print(f"Extracted: {name}")
        extracted.append(name)
    return extracted


def failure_hint(message):
    """Return a troubleshooting hint for the error text *message*, or ``None``."""
    if "speech_tokenizer_v1.onnx" in message:
        return "\n⚠️  Model files may still be downloading. Please wait and try again."
    if "No module named" in message:
        return "\n⚠️  Missing dependency. Make sure virtual environment is activated."
    return None


def main():
    """Run the CosyVoice zero-shot smoke test.

    Lists the available models, verifies (and if necessary unpacks) the
    CosyVoice-300M files, synthesizes one sentence and writes the result to
    ``test_output_<i>.wav`` in the current directory.

    Returns:
        0 when synthesis completed, 1 when any step raised.
    """
    print("=== Simple CosyVoice TTS Test ===\n")

    try:
        list_available_models(MODELS_DIR)

        # Imported inside the try block on purpose: a missing dependency is
        # then reported by the handler below together with a hint.
        from cosyvoice.cli.cosyvoice import CosyVoice
        from cosyvoice.utils.file_utils import load_wav
        import torchaudio

        print("\nInitializing CosyVoice-300M model...")

        missing = find_missing_files(MODEL_PATH, REQUIRED_FILES)
        if missing:
            print(f"Missing files in {MODEL_PATH}: {missing}")
            print("\nTrying to extract zip files if present...")
            extract_zip_archives(MODEL_PATH)

            # Re-check so the user can see whether extraction actually helped
            # before the model loader fails with a less specific error.
            still_missing = find_missing_files(MODEL_PATH, REQUIRED_FILES)
            if still_missing:
                print(f"Still missing after extraction: {still_missing}")

        # Attempt to load regardless; the loader's own error is reported below.
        cosyvoice = CosyVoice(MODEL_PATH, load_jit=False, load_trt=False, fp16=False)

        # Reference audio for zero-shot synthesis, loaded at 16 kHz.
        prompt_speech_16k = load_wav(PROMPT_WAV, 16000)

        text = "Olá, este é um teste do CosyVoice. Sistema de síntese de voz funcionando!"
        # NOTE(review): zero-shot synthesis presumably expects prompt_text to
        # be the transcript of the prompt audio -- confirm that this matches
        # what is spoken in asset/zero_shot_prompt.wav.
        prompt_text = "Teste de voz."

        print(f"\nGenerating speech for: '{text}'")

        results = cosyvoice.inference_zero_shot(
            text, prompt_text, prompt_speech_16k, stream=False
        )
        for index, result in enumerate(results):
            output_file = f'test_output_{index}.wav'
            speech = result['tts_speech']
            torchaudio.save(output_file, speech, cosyvoice.sample_rate)
            print(f"✓ Generated: {output_file}")

            # Show file info
            size = os.path.getsize(output_file) / 1024
            duration = speech.shape[1] / cosyvoice.sample_rate
            print(f"  Size: {size:.1f} KB, Duration: {duration:.1f}s")

        print("\n✓ Test completed successfully!")

    except Exception as e:  # top-level boundary: report, hint, signal failure
        print(f"\nError: {e}")
        traceback.print_exc()

        # Try to give more specific help
        hint = failure_hint(str(e))
        if hint is not None:
            print(hint)
        return 1

    return 0


if __name__ == "__main__":
    # Propagate failure through the exit status so shells/CI can detect it.
    sys.exit(main())