Spaces:
Runtime error
Runtime error
File size: 2,588 Bytes
c7e9d11 112c36b c7e9d11 112c36b c7e9d11 112c36b c7e9d11 112c36b c7e9d11 112c36b c7e9d11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
from pathlib import Path
import librosa
import torch
from huggingface_hub import hf_hub_download
from .models.s3tokenizer import S3_SR
from .models.s3gen import S3GEN_SR, S3Gen
# Hugging Face Hub repository id passed to `hf_hub_download` when fetching the
# checkpoint files in `OratorVC.from_pretrained`.
REPO_ID = "ResembleAI/Orator"
class OratorVC:
    """Voice-conversion wrapper around an ``S3Gen`` model.

    Holds the model, the device it runs on and an optional reference
    ("target voice") conditioning dict, and exposes ``generate`` to convert an
    audio file using that reference.
    """

    # Conditioning lengths expressed in samples (seconds * sample rate).
    # ENC_COND_LEN is not referenced inside this class; DEC_COND_LEN caps the
    # reference clip in `set_target_voice`.
    ENC_COND_LEN = 6 * S3_SR
    DEC_COND_LEN = 10 * S3GEN_SR

    def __init__(
        self,
        s3gen: S3Gen,
        device: str,
        ref_dict: dict = None,
    ):
        """Store the model and, if given, move the reference tensors to ``device``.

        Args:
            s3gen: The model used for tokenization, reference embedding and
                inference.
            device: Device that reference tensors and input audio are moved to.
            ref_dict: Optional reference conditioning. Top-level tensor values
                are moved to ``device``; every other value is kept as-is.
        """
        self.sr = S3GEN_SR
        self.s3gen = s3gen
        self.device = device
        if ref_dict is None:
            self.ref_dict = None
        else:
            self.ref_dict = {
                k: v.to(device) if torch.is_tensor(v) else v
                for k, v in ref_dict.items()
            }

    @classmethod
    def from_local(cls, ckpt_dir, device) -> 'OratorVC':
        """Build an instance from a checkpoint directory on disk.

        Args:
            ckpt_dir: Directory containing ``s3gen.pt`` and, optionally,
                ``conds.pt`` (whose ``'gen'`` entry is used as the built-in
                reference voice).
            device: Device the model is moved to.

        Returns:
            A ready-to-use ``OratorVC`` with the model in eval mode.
        """
        ckpt_dir = Path(ckpt_dir)

        # Deserialize onto the CPU first so that checkpoints saved from a CUDA
        # process still load on hosts without CUDA. The model is moved to
        # `device` below and `__init__` moves the reference tensors.
        ref_dict = None
        if (builtin_voice := ckpt_dir / "conds.pt").exists():
            # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
            states = torch.load(builtin_voice, map_location="cpu")
            ref_dict = states['gen']

        s3gen = S3Gen()
        s3gen.load_state_dict(
            torch.load(ckpt_dir / "s3gen.pt", map_location="cpu")
        )
        s3gen.to(device).eval()

        return cls(s3gen, device, ref_dict=ref_dict)

    @classmethod
    def from_pretrained(cls, device) -> 'OratorVC':
        """Download the checkpoint files from ``REPO_ID`` and load them.

        Args:
            device: Device the model is moved to.

        Returns:
            An ``OratorVC`` built via ``from_local``.
        """
        # Assumes both files are placed in the same local directory, so the
        # parent of the last downloaded path is used as the checkpoint dir.
        for fpath in ["s3gen.pt", "conds.pt"]:
            local_path = hf_hub_download(repo_id=REPO_ID, filename=fpath)

        return cls.from_local(Path(local_path).parent, device)

    def set_target_voice(self, wav_fpath):
        """Compute and store the reference conditioning from an audio file.

        Args:
            wav_fpath: Path to the reference recording. It is loaded at
                ``S3GEN_SR`` and truncated to ``DEC_COND_LEN`` samples.
        """
        s3gen_ref_wav, _sr = librosa.load(wav_fpath, sr=S3GEN_SR)
        s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
        self.ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)

    def generate(
        self,
        audio,
        target_voice_path=None,
    ):
        """Convert ``audio`` using the current (or newly supplied) target voice.

        Args:
            audio: Path (``str`` or ``pathlib.Path``) of the audio file to
                convert. Other input types are not supported.
            target_voice_path: Optional path to a reference recording. When
                given, it replaces the stored reference before conversion.

        Returns:
            The generated waveform as a detached CPU tensor.

        Raises:
            AssertionError: If no reference voice is available.
            NotImplementedError: If ``audio`` is not a file path.
        """
        if target_voice_path:
            self.set_target_voice(target_voice_path)
        elif self.ref_dict is None:
            # Raised explicitly (rather than via `assert`) so the check is not
            # stripped under `python -O`; the exception type is kept for
            # callers that already catch AssertionError.
            raise AssertionError(
                "Please call `set_target_voice` first or specify `target_voice_path`"
            )

        if not isinstance(audio, (str, Path)):
            raise NotImplementedError(
                f"`audio` must be a file path (str or pathlib.Path), got {type(audio).__name__}"
            )

        with torch.inference_mode():
            # The tokenizer input is loaded at S3_SR and given a leading batch axis.
            audio_16, _ = librosa.load(audio, sr=S3_SR)
            audio_16 = torch.from_numpy(audio_16).float().to(self.device)[None]

            s3_tokens, _ = self.s3gen.tokenizer(audio_16)
            wav, _ = self.s3gen.inference(
                speech_tokens=s3_tokens,
                ref_dict=self.ref_dict,
            )
            return wav.detach().cpu()
|