Update app.py
app.py CHANGED

@@ -77,8 +77,8 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
     ref_audio = librosa.load(target, sr=sr)[0]
 
     # Process audio
-    source_audio = torch.tensor(source_audio[:sr * 30]).unsqueeze(0).float()
-    ref_audio = torch.tensor(ref_audio[:sr * 30]).unsqueeze(0).float()
+    source_audio = torch.tensor(source_audio[:sr * 30]).unsqueeze(0).float().to(device)
+    ref_audio = torch.tensor(ref_audio[:sr * 30]).unsqueeze(0).float().to(device)
 
     # Resample
     source_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
@@ -88,8 +88,8 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
     S_alt = cosyvoice_frontend.extract_speech_token(source_waves_16k)[0]
     S_ori = cosyvoice_frontend.extract_speech_token(ref_waves_16k)[0]
 
-    mel = to_mel(source_audio.float())
-    mel2 = to_mel(ref_audio.float())
+    mel = to_mel(source_audio.to(device).float())
+    mel2 = to_mel(ref_audio.to(device).float())
 
     target_lengths = torch.LongTensor([int(mel.size(2) * length_adjust)]).to(mel.device)
     target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
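For readers following along, the change above moves both input waveforms onto `device` (and keeps the `to_mel` inputs there) so that resampling, speech-token extraction, and mel computation run on the same device as the models, avoiding CPU/CUDA tensor-mismatch errors. Below is a minimal, self-contained sketch of that pattern; the `device`, `sr`, and `to_mel` names mirror app.py, but the mel-spectrogram parameters and file paths are illustrative assumptions, not the app's actual configuration.

import librosa
import torch
import torchaudio

# Assumption: app.py defines `device` once, roughly like this.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

sr = 22050  # illustrative sample rate; app.py defines its own `sr`

# Illustrative stand-in for app.py's `to_mel` transform, kept on `device`.
to_mel = torchaudio.transforms.MelSpectrogram(sample_rate=sr, n_mels=80).to(device)

def load_clip(path, max_seconds=30):
    """Load audio, cap it at `max_seconds` of samples, and move it to `device`."""
    wave = librosa.load(path, sr=sr)[0]
    return torch.tensor(wave[: sr * max_seconds]).unsqueeze(0).float().to(device)

# Both tensors now live on `device`, so downstream ops stay on one device.
source_audio = load_clip("source.wav")     # hypothetical input path
ref_audio = load_clip("reference.wav")     # hypothetical reference path

source_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
mel = to_mel(source_audio.float())  # already on `device`, no extra transfer needed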