Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -10,11 +10,13 @@ import numpy as np | |
| 10 | 
             
            from pydub import AudioSegment
         | 
| 11 |  | 
| 12 | 
             
            # Load model and configuration
         | 
| 13 | 
            -
            device =  | 
| 14 |  | 
| 15 | 
             
            dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
         | 
| 16 | 
             
                                                            "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
         | 
| 17 | 
             
                                                            "config_dit_mel_seed_uvit_whisper_small_wavenet.yml")
         | 
|  | |
|  | |
| 18 | 
             
            config = yaml.safe_load(open(dit_config_path, 'r'))
         | 
| 19 | 
             
            model_params = recursive_munch(config['model_params'])
         | 
| 20 | 
             
            model = build_model(model_params, stage='DiT')
         | 
| @@ -46,6 +48,19 @@ bigvgan_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_22khz_80band_ | |
| 46 | 
             
            bigvgan_model.remove_weight_norm()
         | 
| 47 | 
             
            bigvgan_model = bigvgan_model.eval().to(device)
         | 
| 48 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 49 | 
             
            # whisper
         | 
| 50 | 
             
            from transformers import AutoFeatureExtractor, WhisperModel
         | 
| 51 |  | 
| @@ -119,16 +134,12 @@ def adjust_f0_semitones(f0_sequence, n_semitones): | |
| 119 | 
             
            def crossfade(chunk1, chunk2, overlap):
         | 
| 120 | 
             
                fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
         | 
| 121 | 
             
                fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
         | 
| 122 | 
            -
                 | 
| 123 | 
            -
                    chunk2[:overlap] = chunk2[:overlap] * fade_in[:len(chunk2)] + (chunk1[-overlap:] * fade_out)[:len(chunk2)]
         | 
| 124 | 
            -
                else:
         | 
| 125 | 
            -
                    chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
         | 
| 126 | 
             
                return chunk2
         | 
| 127 |  | 
| 128 | 
             
            # streaming and chunk processing related params
         | 
| 129 | 
            -
            overlap_frame_len = 16
         | 
| 130 | 
             
            bitrate = "320k"
         | 
| 131 | 
            -
             | 
| 132 | 
             
            @spaces.GPU
         | 
| 133 | 
             
            @torch.no_grad()
         | 
| 134 | 
             
            @torch.inference_mode()
         | 
| @@ -232,8 +243,8 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c | |
| 232 | 
             
                style2 = campplus_model(feat2.unsqueeze(0))
         | 
| 233 |  | 
| 234 | 
             
                if f0_condition:
         | 
| 235 | 
            -
                    F0_ori = rmvpe.infer_from_audio(ref_waves_16k[0], thred=0. | 
| 236 | 
            -
                    F0_alt = rmvpe.infer_from_audio(converted_waves_16k[0], thred=0. | 
| 237 |  | 
| 238 | 
             
                    F0_ori = torch.from_numpy(F0_ori).to(device)[None]
         | 
| 239 | 
             
                    F0_alt = torch.from_numpy(F0_alt).to(device)[None]
         | 
| @@ -272,7 +283,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c | |
| 272 | 
             
                    chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
         | 
| 273 | 
             
                    is_last_chunk = processed_frames + max_source_window >= cond.size(1)
         | 
| 274 | 
             
                    cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
         | 
| 275 | 
            -
                    with torch.autocast(device_type= | 
| 276 | 
             
                        # Voice Conversion
         | 
| 277 | 
             
                        vc_target = inference_module.cfm.inference(cat_condition,
         | 
| 278 | 
             
                                                                   torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
         | 
| @@ -326,7 +337,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c | |
| 326 |  | 
| 327 |  | 
| 328 | 
             
            if __name__ == "__main__":
         | 
| 329 | 
            -
                description = (" | 
| 330 | 
             
                               "for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
         | 
| 331 | 
             
                               "If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
         | 
| 332 | 
             
                               "无需训练的 zero-shot 语音/歌声转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
         | 
| @@ -334,7 +345,7 @@ if __name__ == "__main__": | |
| 334 | 
             
                inputs = [
         | 
| 335 | 
             
                    gr.Audio(type="filepath", label="Source Audio / 源音频"),
         | 
| 336 | 
             
                    gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
         | 
| 337 | 
            -
                    gr.Slider(minimum=1, maximum=200, value= | 
| 338 | 
             
                    gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整", info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
         | 
| 339 | 
             
                    gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate", info="has subtle influence / 有微小影响"),
         | 
| 340 | 
             
                    gr.Checkbox(label="Use F0 conditioned model / 启用F0输入", value=False, info="Must set to true for singing voice conversion / 歌声转换时必须勾选"),
         | 
| @@ -344,11 +355,11 @@ if __name__ == "__main__": | |
| 344 | 
             
                ]
         | 
| 345 |  | 
| 346 | 
             
                examples = [["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, False, True, 0],
         | 
| 347 | 
            -
                            ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7,  | 
| 348 | 
             
                            ["examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav",
         | 
| 349 | 
            -
                             "examples/reference/ | 
| 350 | 
             
                            ["examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav",
         | 
| 351 | 
            -
                             "examples/reference/trump_0.wav",  | 
| 352 | 
             
                            ]
         | 
| 353 |  | 
| 354 | 
             
                outputs = [gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
         | 
|  | |
| 10 | 
             
            from pydub import AudioSegment
         | 
| 11 |  | 
| 12 | 
             
            # Load model and configuration
         | 
| 13 | 
            +
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         | 
| 14 |  | 
| 15 | 
             
            dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
         | 
| 16 | 
             
                                                            "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
         | 
| 17 | 
             
                                                            "config_dit_mel_seed_uvit_whisper_small_wavenet.yml")
         | 
| 18 | 
            +
            # dit_checkpoint_path = "E:/DiT_epoch_00018_step_801000.pth"
         | 
| 19 | 
            +
            # dit_config_path = "configs/config_dit_mel_seed_uvit_whisper_small_encoder_wavenet.yml"
         | 
| 20 | 
             
            config = yaml.safe_load(open(dit_config_path, 'r'))
         | 
| 21 | 
             
            model_params = recursive_munch(config['model_params'])
         | 
| 22 | 
             
            model = build_model(model_params, stage='DiT')
         | 
|  | |
| 48 | 
             
            bigvgan_model.remove_weight_norm()
         | 
| 49 | 
             
            bigvgan_model = bigvgan_model.eval().to(device)
         | 
| 50 |  | 
| 51 | 
            +
            ckpt_path, config_path = load_custom_model_from_hf("Plachta/FAcodec", 'pytorch_model.bin', 'config.yml')
         | 
| 52 | 
            +
             | 
| 53 | 
            +
            codec_config = yaml.safe_load(open(config_path))
         | 
| 54 | 
            +
            codec_model_params = recursive_munch(codec_config['model_params'])
         | 
| 55 | 
            +
            codec_encoder = build_model(codec_model_params, stage="codec")
         | 
| 56 | 
            +
             | 
| 57 | 
            +
            ckpt_params = torch.load(ckpt_path, map_location="cpu")
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            for key in codec_encoder:
         | 
| 60 | 
            +
                codec_encoder[key].load_state_dict(ckpt_params[key], strict=False)
         | 
| 61 | 
            +
            _ = [codec_encoder[key].eval() for key in codec_encoder]
         | 
| 62 | 
            +
            _ = [codec_encoder[key].to(device) for key in codec_encoder]
         | 
| 63 | 
            +
             | 
| 64 | 
             
            # whisper
         | 
| 65 | 
             
            from transformers import AutoFeatureExtractor, WhisperModel
         | 
| 66 |  | 
|  | |
| 134 | 
             
            def crossfade(chunk1, chunk2, overlap):
         | 
| 135 | 
             
                fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
         | 
| 136 | 
             
                fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
         | 
| 137 | 
            +
                chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
         | 
|  | |
|  | |
|  | |
| 138 | 
             
                return chunk2
         | 
| 139 |  | 
| 140 | 
             
            # streaming and chunk processing related params
         | 
|  | |
| 141 | 
             
            bitrate = "320k"
         | 
| 142 | 
            +
            overlap_frame_len = 16
         | 
| 143 | 
             
            @spaces.GPU
         | 
| 144 | 
             
            @torch.no_grad()
         | 
| 145 | 
             
            @torch.inference_mode()
         | 
|  | |
| 243 | 
             
                style2 = campplus_model(feat2.unsqueeze(0))
         | 
| 244 |  | 
| 245 | 
             
                if f0_condition:
         | 
| 246 | 
            +
                    F0_ori = rmvpe.infer_from_audio(ref_waves_16k[0], thred=0.5)
         | 
| 247 | 
            +
                    F0_alt = rmvpe.infer_from_audio(converted_waves_16k[0], thred=0.5)
         | 
| 248 |  | 
| 249 | 
             
                    F0_ori = torch.from_numpy(F0_ori).to(device)[None]
         | 
| 250 | 
             
                    F0_alt = torch.from_numpy(F0_alt).to(device)[None]
         | 
|  | |
| 283 | 
             
                    chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
         | 
| 284 | 
             
                    is_last_chunk = processed_frames + max_source_window >= cond.size(1)
         | 
| 285 | 
             
                    cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
         | 
| 286 | 
            +
                    with torch.autocast(device_type='cuda', dtype=torch.float16):
         | 
| 287 | 
             
                        # Voice Conversion
         | 
| 288 | 
             
                        vc_target = inference_module.cfm.inference(cat_condition,
         | 
| 289 | 
             
                                                                   torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
         | 
|  | |
| 337 |  | 
| 338 |  | 
| 339 | 
             
            if __name__ == "__main__":
         | 
| 340 | 
            +
                description = ("State-of-the-Art zero-shot voice conversion/singing voice conversion. For local deployment please check [GitHub repository](https://github.com/Plachtaa/seed-vc) "
         | 
| 341 | 
             
                               "for details and updates.<br>Note that any reference audio will be forcefully clipped to 25s if beyond this length.<br> "
         | 
| 342 | 
             
                               "If total duration of source and reference audio exceeds 30s, source audio will be processed in chunks.<br> "
         | 
| 343 | 
             
                               "无需训练的 zero-shot 语音/歌声转换模型,若需本地部署查看[GitHub页面](https://github.com/Plachtaa/seed-vc)<br>"
         | 
|  | |
| 345 | 
             
                inputs = [
         | 
| 346 | 
             
                    gr.Audio(type="filepath", label="Source Audio / 源音频"),
         | 
| 347 | 
             
                    gr.Audio(type="filepath", label="Reference Audio / 参考音频"),
         | 
| 348 | 
            +
                    gr.Slider(minimum=1, maximum=200, value=25, step=1, label="Diffusion Steps / 扩散步数", info="25 by default, 50~100 for best quality / 默认为 25,50~100 为最佳质量"),
         | 
| 349 | 
             
                    gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length Adjust / 长度调整", info="<1.0 for speed-up speech, >1.0 for slow-down speech / <1.0 加速语速,>1.0 减慢语速"),
         | 
| 350 | 
             
                    gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG Rate", info="has subtle influence / 有微小影响"),
         | 
| 351 | 
             
                    gr.Checkbox(label="Use F0 conditioned model / 启用F0输入", value=False, info="Must set to true for singing voice conversion / 歌声转换时必须勾选"),
         | 
|  | |
| 355 | 
             
                ]
         | 
| 356 |  | 
| 357 | 
             
                examples = [["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, False, True, 0],
         | 
| 358 | 
            +
                            ["examples/source/jay_0.wav", "examples/reference/azuma_0.wav", 25, 1.0, 0.7, False, True, 0],
         | 
| 359 | 
             
                            ["examples/source/Wiz Khalifa,Charlie Puth - See You Again [vocals]_[cut_28sec].wav",
         | 
| 360 | 
            +
                             "examples/reference/kobe_0.wav", 50, 1.0, 0.7, True, False, -6],
         | 
| 361 | 
             
                            ["examples/source/TECHNOPOLIS - 2085 [vocals]_[cut_14sec].wav",
         | 
| 362 | 
            +
                             "examples/reference/trump_0.wav", 50, 1.0, 0.7, True, False, -12],
         | 
| 363 | 
             
                            ]
         | 
| 364 |  | 
| 365 | 
             
                outputs = [gr.Audio(label="Stream Output Audio / 流式输出", streaming=True, format='mp3'),
         | 
