Spaces:
Runtime error
Runtime error
from pathlib import Path | |
import librosa | |
import torch | |
from huggingface_hub import hf_hub_download | |
from .models.s3tokenizer import S3_SR | |
from .models.s3gen import S3GEN_SR, S3Gen | |
# Hugging Face Hub repository that `hf_hub_download` pulls the pretrained
# checkpoint files ("s3gen.pt", "conds.pt") from.
REPO_ID = "ResembleAI/Orator"
class OratorVC:
    """Voice-conversion wrapper around an ``S3Gen`` model.

    The source audio is tokenized with ``s3gen.tokenizer`` and re-synthesized
    with ``s3gen.inference`` conditioned on a reference ("target voice")
    dictionary. That dictionary either comes from a ``conds.pt`` file shipped
    with the checkpoint or is computed from a wav file by
    :meth:`set_target_voice`.
    """

    # Not used inside this class; presumably the encoder conditioning length
    # in samples (6 x the tokenizer sample rate) -- TODO confirm with callers.
    ENC_COND_LEN = 6 * S3_SR
    # Maximum number of reference samples kept by `set_target_voice`; the
    # reference is loaded at S3GEN_SR, so this corresponds to 10 seconds.
    DEC_COND_LEN = 10 * S3GEN_SR

    def __init__(
        self,
        s3gen: S3Gen,
        device: str,
        ref_dict: dict=None,
    ):
        """Store the model and optionally a pre-computed target voice.

        Args:
            s3gen: Model used for tokenization, reference embedding and
                synthesis.
            device: Device that tensors in ``ref_dict`` and the input audio
                are moved to.
            ref_dict: Optional target-voice conditioning. Top-level tensor
                values are moved to ``device``; other values are kept as-is.
        """
        self.sr = S3GEN_SR
        self.s3gen = s3gen
        self.device = device
        if ref_dict is None:
            self.ref_dict = None
        else:
            self.ref_dict = {
                k: v.to(device) if torch.is_tensor(v) else v
                for k, v in ref_dict.items()
            }

    @classmethod
    def from_local(cls, ckpt_dir, device) -> 'OratorVC':
        """Build an instance from a local checkpoint directory.

        Args:
            ckpt_dir: Directory containing ``s3gen.pt`` and, optionally,
                ``conds.pt`` (whose ``'gen'`` entry is used as the built-in
                target voice).
            device: Device the model and conditioning tensors are moved to.

        Returns:
            A ready-to-use instance with the model in eval mode.
        """
        ckpt_dir = Path(ckpt_dir)

        # NOTE(security): torch.load unpickles the file; only point this at
        # checkpoint directories you trust.
        # Checkpoints are loaded onto the CPU first so that files saved on a
        # CUDA machine can still be opened on hosts without CUDA; the model
        # is moved below and `__init__` moves the conditioning tensors.
        ref_dict = None
        if (builtin_voice := ckpt_dir / "conds.pt").exists():
            states = torch.load(builtin_voice, map_location="cpu")
            ref_dict = states['gen']

        s3gen = S3Gen()
        s3gen.load_state_dict(
            torch.load(ckpt_dir / "s3gen.pt", map_location="cpu")
        )
        s3gen.to(device).eval()

        return cls(s3gen, device, ref_dict=ref_dict)

    @classmethod
    def from_pretrained(cls, device) -> 'OratorVC':
        """Download the checkpoint files from ``REPO_ID`` and load them.

        Args:
            device: Device the model and conditioning tensors are moved to.

        Returns:
            An instance built by :meth:`from_local` from the downloaded files.
        """
        for fpath in ["s3gen.pt", "conds.pt"]:
            local_path = hf_hub_download(repo_id=REPO_ID, filename=fpath)

        # The directory of the last downloaded file is used as the checkpoint
        # directory for `from_local`.
        return cls.from_local(Path(local_path).parent, device)

    def set_target_voice(self, wav_fpath):
        """Compute and store the target-voice conditioning from a wav file.

        Args:
            wav_fpath: Path of the reference recording. It is loaded at
                ``S3GEN_SR`` and truncated to ``DEC_COND_LEN`` samples.
        """
        s3gen_ref_wav, _sr = librosa.load(wav_fpath, sr=S3GEN_SR)
        s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
        self.ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)

    def generate(
        self,
        audio,
        target_voice_path=None,
    ):
        """Convert ``audio`` to the target voice.

        Args:
            audio: Path (``str`` or ``pathlib.Path``) of the source audio
                file. It is loaded at ``S3_SR`` for tokenization.
            target_voice_path: Optional reference wav; when given it replaces
                the currently stored target voice before conversion.

        Returns:
            The first output of ``s3gen.inference``, detached and on the CPU.

        Raises:
            AssertionError: If no target voice is available.
            NotImplementedError: If ``audio`` is not a file path.
        """
        if target_voice_path:
            self.set_target_voice(target_voice_path)
        elif self.ref_dict is None:
            # Explicit raise (rather than `assert`) so the check survives
            # `python -O` while keeping the exception type unchanged.
            raise AssertionError(
                "Please call `set_target_voice` first or specify `target_voice_path`"
            )

        with torch.inference_mode():
            if not isinstance(audio, (str, Path)):
                raise NotImplementedError()

            audio_16, _ = librosa.load(audio, sr=S3_SR)
            # Add a leading dimension of size 1 before tokenization.
            audio_16 = torch.from_numpy(audio_16).float().to(self.device).unsqueeze(0)

            s3_tokens, _ = self.s3gen.tokenizer(audio_16)
            wav, _ = self.s3gen.inference(
                speech_tokens=s3_tokens,
                ref_dict=self.ref_dict,
            )
            return wav.detach().cpu()