Spaces:
Configuration error
Configuration error
Marcos Remar
committed on
Commit
·
08991d5
1
Parent(s):
0f60c53
Add CosyVoice2-0.5B test scripts and download script
Browse files
- compare_models_english.py +32 -0
- download_cosyvoice2.py +8 -0
- test_cosyvoice2_english.py +52 -0
compare_models_english.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
"""Compare English synthesis between CosyVoice-300M and CosyVoice2-0.5B.

Synthesizes one English sentence with the local CosyVoice-300M-direct
checkpoint, writes the first returned chunk to ``english_300m.wav`` and then
reports whether the CosyVoice2-0.5B checkpoint file is present on disk.
"""
import os
import time

print("=== Comparação: CosyVoice-300M vs CosyVoice2-0.5B ===")

# English test sentence. A plain "!" is used on purpose: "\!" is not a Python
# escape sequence, so it would leave a literal backslash in the text handed to
# the model (and triggers a SyntaxWarning on recent Python versions).
test_text = "Hello! This is a comparison test between CosyVoice models. We are testing English pronunciation quality."

# Test with the current model (300M)
print("\n1. Testando modelo CosyVoice-300M-direct...")
try:
    # Imported inside the try so a missing dependency is reported by the
    # handler below instead of aborting the script.
    from cosyvoice.cli.cosyvoice import CosyVoice
    import torchaudio

    model1 = CosyVoice('pretrained_models/CosyVoice-300M-direct', load_jit=False, load_trt=False, fp16=False)

    start = time.time()
    # NOTE(review): None is passed as the third argument (the prompt audio
    # slot in upstream examples) -- confirm the model accepts that.
    outputs = model1.inference_zero_shot(test_text, "Testing speech synthesis.", None, stream=False)

    # Only the first chunk is saved. Fail loudly when nothing was produced
    # rather than printing a success line for a file that was never written.
    first_chunk = next(iter(outputs), None)
    if first_chunk is None:
        raise RuntimeError("nenhum áudio foi gerado")
    torchaudio.save('english_300m.wav', first_chunk['tts_speech'], model1.sample_rate)
    print(f"✅ Gerado em {time.time()-start:.1f}s - Arquivo: english_300m.wav")

except Exception as e:
    # Top-level boundary of a diagnostic script: report and carry on so the
    # second check below still runs.
    print(f"❌ Erro no modelo 300M: {e}")

# Check for the new model (0.5B) -- only reports whether it is available yet
print("\n2. Modelo CosyVoice2-0.5B...")
if os.path.exists('pretrained_models/CosyVoice2-0.5B/llm.pt'):
    print("✅ Modelo baixado! Pronto para testar.")
else:
    print("⏳ Ainda baixando... Execute novamente após o download.")
|
download_cosyvoice2.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Download the CosyVoice2-0.5B checkpoint from ModelScope.

The snapshot is requested into ``pretrained_models/CosyVoice2-0.5B``, with
``./model_cache`` passed as the cache directory.
"""
import os

from modelscope import snapshot_download

print("Baixando CosyVoice2-0.5B (modelo mais recente)...")
snapshot_download('iic/CosyVoice2-0.5B',
                  local_dir='pretrained_models/CosyVoice2-0.5B',
                  cache_dir='./model_cache')
# Plain "!" here: "\!" is not a Python escape and would print a stray backslash.
print("Download concluído!")
|
test_cosyvoice2_english.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
"""Smoke-test English zero-shot synthesis with the CosyVoice2-0.5B checkpoint.

Loads the model from ``pretrained_models/CosyVoice2-0.5B``, synthesizes one
English sentence, saves the first returned chunk to
``cosyvoice2_english_test.wav`` and prints load time, generation time and the
resulting file size.
"""
import os
import sys
import time

print("=== CosyVoice2-0.5B English Test ===")

# Configure the environment
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# Use the new CosyVoice2-0.5B model
model_path = 'pretrained_models/CosyVoice2-0.5B'

# English test input. A plain "!" is used on purpose: "\!" is not a Python
# escape sequence and would leave a literal backslash in the synthesized text.
text = "Hello! This is CosyVoice version two point five B. The new model has better streaming performance and improved pronunciation for English text to speech synthesis."
prompt_text = "Welcome to the demonstration of our advanced speech synthesis system."
output_file = "cosyvoice2_english_test.wav"

try:
    # CosyVoice2 checkpoints are loaded through the dedicated CosyVoice2
    # class; the original CosyVoice class is meant for the 300M models.
    from cosyvoice.cli.cosyvoice import CosyVoice2
    import torchaudio

    # Check that the model exists. The directory alone can already be present
    # while a download is still in progress, so look for the weights file.
    if not os.path.exists(os.path.join(model_path, 'llm.pt')):
        print(f"❌ Modelo ainda não baixado em {model_path}")
        print("Aguarde o download terminar...")
        sys.exit(1)

    print("Carregando CosyVoice2-0.5B...")
    start = time.time()
    cosyvoice = CosyVoice2(model_path, load_jit=False, load_trt=False, fp16=False)
    print(f"✅ Modelo carregado em {time.time()-start:.1f}s")

    # English test
    print(f"\nTexto: {text}")
    print("Gerando áudio...")

    start = time.time()
    # NOTE(review): None is passed as the third argument (the prompt audio
    # slot in upstream examples) -- confirm the model accepts that.
    outputs = cosyvoice.inference_zero_shot(text, prompt_text, None, stream=False)

    # Only the first chunk is saved. Fail loudly when nothing was produced
    # rather than reporting a successful generation.
    first_chunk = next(iter(outputs), None)
    if first_chunk is None:
        raise RuntimeError("nenhum áudio foi gerado")
    torchaudio.save(output_file, first_chunk['tts_speech'], cosyvoice.sample_rate)

    duration = time.time() - start
    print(f"✅ Áudio gerado em {duration:.1f}s")

    if os.path.exists(output_file):
        size = os.path.getsize(output_file) / 1024
        print(f"📊 Arquivo: {output_file} ({size:.1f} KB)")

except Exception as e:
    # Top-level boundary of a diagnostic script: show the message and the
    # full traceback. SystemExit from sys.exit() above is not caught here.
    print(f"❌ Erro: {e}")
    import traceback
    traceback.print_exc()
|