Spaces:

tanbw
/

CosyVoice

Configuration error

App Files Community

CosyVoice committed on Sep 26, 2024

Commit

06934c3

1 Parent(s): d52358f

update vc code

Browse files

Files changed (4) hide show

README.md +8 -8
cosyvoice/cli/cosyvoice.py +0 -2
cosyvoice/cli/frontend.py +0 -2
cosyvoice/cli/model.py +3 -4

README.md CHANGED Viewed

@@ -71,6 +71,7 @@ If you are expert in this field, and you are only interested in training your ow
 # SDK模型下载
 from modelscope import snapshot_download
 snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
 snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
 snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
 snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
@@ -80,6 +81,7 @@ snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice
 # git模型下载，请确保已安装git lfs
 mkdir -p pretrained_models
 git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
 git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
 git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
 git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
@@ -118,7 +120,7 @@ print(cosyvoice.list_avaliable_spks())
 for i, j in enumerate(cosyvoice.inference_sft('你好，我是通义生成式语音大模型，请问有什么可以帮您的吗？', '中文女', stream=False)):
     torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz')
 # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
 prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
@@ -127,18 +129,16 @@ for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来
 prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
     torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
-# instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
-for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时，他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
-    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)
-cosyvoice = CosyVoice('pretrained_models/CosyVoice-VC')
 # vc usage
 prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
 source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
     torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)
 ```
 **Start web demo**

 # SDK模型下载
 from modelscope import snapshot_download
 snapshot_download('iic/CosyVoice-300M', local_dir='pretrained_models/CosyVoice-300M')
+snapshot_download('iic/CosyVoice-300M-25Hz', local_dir='pretrained_models/CosyVoice-300M-25Hz')
 snapshot_download('iic/CosyVoice-300M-SFT', local_dir='pretrained_models/CosyVoice-300M-SFT')
 snapshot_download('iic/CosyVoice-300M-Instruct', local_dir='pretrained_models/CosyVoice-300M-Instruct')
 snapshot_download('iic/CosyVoice-ttsfrd', local_dir='pretrained_models/CosyVoice-ttsfrd')
 # git模型下载，请确保已安装git lfs
 mkdir -p pretrained_models
 git clone https://www.modelscope.cn/iic/CosyVoice-300M.git pretrained_models/CosyVoice-300M
+git clone https://www.modelscope.cn/iic/CosyVoice-300M-25Hz.git pretrained_models/CosyVoice-300M-25Hz
 git clone https://www.modelscope.cn/iic/CosyVoice-300M-SFT.git pretrained_models/CosyVoice-300M-SFT
 git clone https://www.modelscope.cn/iic/CosyVoice-300M-Instruct.git pretrained_models/CosyVoice-300M-Instruct
 git clone https://www.modelscope.cn/iic/CosyVoice-ttsfrd.git pretrained_models/CosyVoice-ttsfrd
 for i, j in enumerate(cosyvoice.inference_sft('你好，我是通义生成式语音大模型，请问有什么可以帮您的吗？', '中文女', stream=False)):
     torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)
+cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz') # or change to pretrained_models/CosyVoice-300M for 50Hz inference
 # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
 prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
 prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
     torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
 # vc usage
 prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
 source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
 for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
     torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)
+cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
+# instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
+for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时，他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
+    torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)
 ```
 **Start web demo**

cosyvoice/cli/cosyvoice.py CHANGED Viewed

@@ -25,7 +25,6 @@ class CosyVoice:
     def __init__(self, model_dir, load_jit=True, load_onnx=False):
         instruct = True if '-Instruct' in model_dir else False
-        vc = True if '-VC' in model_dir else False
         self.model_dir = model_dir
         if not os.path.exists(model_dir):
             model_dir = snapshot_download(model_dir)
@@ -37,7 +36,6 @@ class CosyVoice:
                                           '{}/speech_tokenizer_v1.onnx'.format(model_dir),
                                           '{}/spk2info.pt'.format(model_dir),
                                           instruct,
-                                          vc,
                                           configs['allowed_special'])
         self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
         self.model.load('{}/llm.pt'.format(model_dir),

     def __init__(self, model_dir, load_jit=True, load_onnx=False):
         instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         if not os.path.exists(model_dir):
             model_dir = snapshot_download(model_dir)
                                           '{}/speech_tokenizer_v1.onnx'.format(model_dir),
                                           '{}/spk2info.pt'.format(model_dir),
                                           instruct,
                                           configs['allowed_special'])
         self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'])
         self.model.load('{}/llm.pt'.format(model_dir),

cosyvoice/cli/frontend.py CHANGED Viewed

@@ -42,7 +42,6 @@ class CosyVoiceFrontEnd:
                  speech_tokenizer_model: str,
                  spk2info: str = '',
                  instruct: bool = False,
-                 vc: bool = False,
                  allowed_special: str = 'all'):
         self.tokenizer = get_tokenizer()
         self.feat_extractor = feat_extractor
@@ -59,7 +58,6 @@ class CosyVoiceFrontEnd:
         else:
             self.spk2info = {}
         self.instruct = instruct
-        self.vc = vc
         self.allowed_special = allowed_special
         self.inflect_parser = inflect.engine()
         self.use_ttsfrd = use_ttsfrd

                  speech_tokenizer_model: str,
                  spk2info: str = '',
                  instruct: bool = False,
                  allowed_special: str = 'all'):
         self.tokenizer = get_tokenizer()
         self.feat_extractor = feat_extractor
         else:
             self.spk2info = {}
         self.instruct = instruct
         self.allowed_special = allowed_special
         self.inflect_parser = inflect.engine()
         self.use_ttsfrd = use_ttsfrd

cosyvoice/cli/model.py CHANGED Viewed

@@ -54,10 +54,9 @@ class CosyVoiceModel:
         self.hift_cache_dict = {}
     def load(self, llm_model, flow_model, hift_model):
-        if self.llm is not None:
-            self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
-            self.llm.to(self.device).eval()
-            self.llm.half()
         self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
         self.flow.to(self.device).eval()
         self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))

         self.hift_cache_dict = {}
     def load(self, llm_model, flow_model, hift_model):
+        self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
+        self.llm.to(self.device).eval()
+        self.llm.half()
         self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
         self.flow.to(self.device).eval()
         self.hift.load_state_dict(torch.load(hift_model, map_location=self.device))