Spaces:
Configuration error
Configuration error
update vc code
Browse files
- README.md +8 -8
- cosyvoice/cli/cosyvoice.py +0 -2
- cosyvoice/cli/frontend.py +0 -2
- cosyvoice/cli/model.py +3 -4
README.md
CHANGED
|
@@ -71,6 +71,7 @@ If you are expert in this field, and you are only interested in training your ow
|
|
| 71 |
# SDK模型下载
|
| 72 |
from modelscope import snapshot_download
|
| 73 |
snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
|
|
|
|
| 74 |
snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
|
| 75 |
snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
|
| 76 |
snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
|
|
@@ -80,6 +81,7 @@ snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice
|
|
| 80 |
# git模型下载,请确保已安装git lfs
|
| 81 |
mkdir -p pretrained_models
|
| 82 |
git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
|
|
|
|
| 83 |
git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
|
| 84 |
git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
|
| 85 |
git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
|
|
@@ -118,7 +120,7 @@ print(cosyvoice.list_avaliable_spks())
|
|
| 118 |
for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
|
| 119 |
torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)
|
| 120 |
|
| 121 |
-
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz')
|
| 122 |
# zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
|
| 123 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
| 124 |
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
|
|
@@ -127,18 +129,16 @@ for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来
|
|
| 127 |
prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
|
| 128 |
for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
|
| 129 |
torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
|
| 130 |
-
|
| 131 |
-
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
|
| 132 |
-
# instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
|
| 133 |
-
for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
|
| 134 |
-
torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)
|
| 135 |
-
|
| 136 |
-
cosyvoice = CosyVoice('pretrained_models/CosyVoice-VC')
|
| 137 |
# vc usage
|
| 138 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
| 139 |
source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
|
| 140 |
for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
|
| 141 |
torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
```
|
| 143 |
|
| 144 |
**Start web demo**
|
|
|
|
| 71 |
# SDK模型下载
|
| 72 |
from modelscope import snapshot_download
|
| 73 |
snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
|
| 74 |
+
snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz')
|
| 75 |
snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
|
| 76 |
snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
|
| 77 |
snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
|
|
|
|
| 81 |
# git模型下载,请确保已安装git lfs
|
| 82 |
mkdir -p pretrained_models
|
| 83 |
git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
|
| 84 |
+
git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz
|
| 85 |
git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
|
| 86 |
git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
|
| 87 |
git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
|
|
|
|
| 120 |
for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
|
| 121 |
torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)
|
| 122 |
|
| 123 |
+
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz') # or change to pretrained_models/CosyVoice-300M for 50Hz inference
|
| 124 |
# zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
|
| 125 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
| 126 |
for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
|
|
|
|
| 129 |
prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
|
| 130 |
for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
|
| 131 |
torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
# vc usage
|
| 133 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
| 134 |
source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
|
| 135 |
for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
|
| 136 |
torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)
|
| 137 |
+
|
| 138 |
+
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
|
| 139 |
+
# instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
|
| 140 |
+
for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
|
| 141 |
+
torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)
|
| 142 |
```
|
| 143 |
|
| 144 |
**Start web demo**
|
cosyvoice/cli/cosyvoice.py
CHANGED
|
@@ -25,7 +25,6 @@ class CosyVoice:
|
|
| 25 |
|
| 26 |
def __init__(self, model_dir, load_jit=True, load_onnx=False):
|
| 27 |
instruct = True if '-Instruct' in model_dir else False
|
| 28 |
-
vc = True if '-VC' in model_dir else False
|
| 29 |
self.model_dir = model_dir
|
| 30 |
if not os.path.exists(model_dir):
|
| 31 |
model_dir = snapshot_download(model_dir)
|
|
@@ -37,7 +36,6 @@ class CosyVoice:
|
|
| 37 |
'{}/speech_tokenizer_v1.onnx'.format(model_dir),
|
| 38 |
'{}/spk2info.pt'.format(model_dir),
|
| 39 |
instruct,
|
| 40 |
-
vc,
|
| 41 |
configs['allowed_special'])
|
| 42 |
self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
|
| 43 |
self.model.load('{}/llm.pt'.format(model_dir),
|
|
|
|
| 25 |
|
| 26 |
def __init__(self, model_dir, load_jit=True, load_onnx=False):
|
| 27 |
instruct = True if '-Instruct' in model_dir else False
|
|
|
|
| 28 |
self.model_dir = model_dir
|
| 29 |
if not os.path.exists(model_dir):
|
| 30 |
model_dir = snapshot_download(model_dir)
|
|
|
|
| 36 |
'{}/speech_tokenizer_v1.onnx'.format(model_dir),
|
| 37 |
'{}/spk2info.pt'.format(model_dir),
|
| 38 |
instruct,
|
|
|
|
| 39 |
configs['allowed_special'])
|
| 40 |
self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
|
| 41 |
self.model.load('{}/llm.pt'.format(model_dir),
|
cosyvoice/cli/frontend.py
CHANGED
|
@@ -42,7 +42,6 @@ class CosyVoiceFrontEnd:
|
|
| 42 |
speech_tokenizer_model: str,
|
| 43 |
spk2info: str = '',
|
| 44 |
instruct: bool = False,
|
| 45 |
-
vc: bool = False,
|
| 46 |
allowed_special: str = 'all'):
|
| 47 |
self.tokenizer = get_tokenizer()
|
| 48 |
self.feat_extractor = feat_extractor
|
|
@@ -59,7 +58,6 @@ class CosyVoiceFrontEnd:
|
|
| 59 |
else:
|
| 60 |
self.spk2info = {}
|
| 61 |
self.instruct = instruct
|
| 62 |
-
self.vc = vc
|
| 63 |
self.allowed_special = allowed_special
|
| 64 |
self.inflect_parser = inflect.engine()
|
| 65 |
self.use_ttsfrd = use_ttsfrd
|
|
|
|
| 42 |
speech_tokenizer_model: str,
|
| 43 |
spk2info: str = '',
|
| 44 |
instruct: bool = False,
|
|
|
|
| 45 |
allowed_special: str = 'all'):
|
| 46 |
self.tokenizer = get_tokenizer()
|
| 47 |
self.feat_extractor = feat_extractor
|
|
|
|
| 58 |
else:
|
| 59 |
self.spk2info = {}
|
| 60 |
self.instruct = instruct
|
|
|
|
| 61 |
self.allowed_special = allowed_special
|
| 62 |
self.inflect_parser = inflect.engine()
|
| 63 |
self.use_ttsfrd = use_ttsfrd
|
cosyvoice/cli/model.py
CHANGED
|
@@ -54,10 +54,9 @@ class CosyVoiceModel:
|
|
| 54 |
self.hift_cache_dict = {}
|
| 55 |
|
| 56 |
def load(self, llm_model, flow_model, hift_model):
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
self.llm.half()
|
| 61 |
self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
|
| 62 |
self.flow.to(self.device).eval()
|
| 63 |
self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
|
|
|
|
| 54 |
self.hift_cache_dict = {}
|
| 55 |
|
| 56 |
def load(self, llm_model, flow_model, hift_model):
|
| 57 |
+
self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
|
| 58 |
+
self.llm.to(self.device).eval()
|
| 59 |
+
self.llm.half()
|
|
|
|
| 60 |
self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
|
| 61 |
self.flow.to(self.device).eval()
|
| 62 |
self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))
|