Spaces:
Running
Running
import os | |
import re | |
import sys | |
import codecs | |
import librosa | |
import logging | |
import numpy as np | |
import soundfile as sf | |
from pydub import AudioSegment, silence | |
sys.path.append(os.getcwd()) | |
from main.tools import huggingface | |
from main.configs.config import Config | |
# Raise the log threshold of the HTTP client loggers to ERROR.
for logger_name in ("httpx", "httpcore"):
    http_logger = logging.getLogger(logger_name)
    http_logger.setLevel(logging.ERROR)

# Message table taken from the project configuration; looked up by key below.
translations = Config().translations
def check_predictors(method, f0_onnx=False):
    """Download the predictor model file(s) that ``method`` needs, if missing.

    Args:
        method: A predictor name (for example ``"rmvpe"``) or a hybrid spec
            written as ``"hybrid[name1+name2+...]"``.
        f0_onnx: When true, ``"-onnx"`` is appended to ``method`` before the
            lookup, except for ``"harvestw"`` and ``"diow"``.

    Names without an entry in the model table are ignored. A hybrid spec that
    carries no bracketed list downloads nothing instead of raising.
    """
    if f0_onnx and method not in ["harvestw", "diow"]:
        method += "-onnx"

    def download(predictors):
        # Fetch only when the file is not already in the local model folder.
        target = os.path.join("assets", "models", "predictors", predictors)
        if not os.path.exists(target):
            # The base URL is stored rot13-encoded and decoded at call time.
            base_url = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cerqvpgbef/", "rot13")
            huggingface.HF_download_file(base_url + predictors, target)

    # Predictor name -> model file name.
    model_dict = {
        "rmvpe": "rmvpe.pt",
        "rmvpe-legacy": "rmvpe.pt",
        "rmvpe-onnx": "rmvpe.onnx",
        "rmvpe-legacy-onnx": "rmvpe.onnx",
        "fcpe": "fcpe.pt",
        "fcpe-legacy": "fcpe_legacy.pt",
        "fcpe-onnx": "fcpe.onnx",
        "fcpe-legacy-onnx": "fcpe_legacy.onnx",
        "harvestw": "world.pth",
        "diow": "world.pth",
    }
    # Every crepe size exists as "crepe-*" and "mangio-crepe-*", each with an
    # "-onnx" twin; both prefixes share the same file.
    for size in ("full", "large", "medium", "small", "tiny"):
        for prefix in ("crepe", "mangio-crepe"):
            model_dict[f"{prefix}-{size}"] = f"crepe_{size}.pth"
            model_dict[f"{prefix}-{size}-onnx"] = f"crepe_{size}.onnx"

    if "hybrid" in method:
        # NOTE(review): with f0_onnx the "-onnx" suffix ends up after the
        # closing bracket, so the individual names are looked up without it.
        # Confirm that this is intended.
        bracketed = re.search(r"hybrid\[(.+)\]", method)
        # No bracketed list means there is nothing to download.
        names = [name.strip() for name in bracketed.group(1).split("+")] if bracketed else []

        for name in names:
            if name in model_dict:
                download(model_dict[name])
    elif method in model_dict:
        download(model_dict[method])
def check_embedders(hubert, embedders_onnx=False):
    """Download one of the known embedder models when it is not on disk yet.

    Args:
        hubert: Embedder name. Only the names listed below are handled; any
            other value makes the function return without doing anything.
        embedders_onnx: Selects the ``.onnx`` file from the ``onnx/`` remote
            folder instead of the ``.pt`` file from ``fairseq/``.
    """
    known_embedders = (
        "contentvec_base",
        "hubert_base",
        "japanese_hubert_base",
        "korean_hubert_base",
        "chinese_hubert_base",
        "portuguese_hubert_base",
    )
    if hubert not in known_embedders:
        return

    if embedders_onnx:
        hubert += ".onnx"
    else:
        hubert += ".pt"

    model_path = os.path.join("assets", "models", "embedders", hubert)
    if os.path.exists(model_path):
        return

    # The base URL is stored rot13-encoded and decoded at call time.
    base_url = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/rzorqqref/", "rot13")
    remote_folder = "onnx/" if embedders_onnx else "fairseq/"
    huggingface.HF_download_file(base_url + remote_folder + hubert, model_path)
def load_audio(logger, file, sample_rate=16000, formant_shifting=False, formant_qfrency=0.8, formant_timbre=0.8):
    """Read an audio file and return it as a flattened array at ``sample_rate``.

    Args:
        logger: Logger that receives debug messages about the reader in use.
        file: Path to the audio file; surrounding spaces, double quotes and
            newlines are stripped before use.
        sample_rate: Target sample rate; the audio is resampled when the
            file's own rate differs.
        formant_shifting: When true, the audio is passed through
            ``StftPitchShift`` after resampling.
        formant_qfrency: Scaled by ``1e-3`` and passed as ``quefrency``.
        formant_timbre: Passed as ``distortion``.

    Returns:
        The loaded audio, flattened with ``.flatten()``.

    Raises:
        RuntimeError: For any failure while loading, including a missing file.
    """
    try:
        file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        if not os.path.isfile(file):
            raise FileNotFoundError(translations["not_found"].format(name=file))

        try:
            logger.debug(translations['read_sf'])
            audio, sr = sf.read(file)
        except Exception:
            # soundfile could not read the file: fall back to librosa.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
            # SystemExit propagate instead of triggering the fallback.
            logger.debug(translations['read_librosa'])
            audio, sr = librosa.load(file, sr=None)

        # More than one dimension means several channels: mix down to mono.
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.T)
        if sr != sample_rate:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate, res_type="soxr_vhq")

        if formant_shifting:
            # Imported here so the module is only needed when the option is on.
            from main.library.algorithm.stftpitchshift import StftPitchShift

            pitchshifter = StftPitchShift(1024, 32, sample_rate)
            audio = pitchshifter.shiftpitch(audio, factors=1, quefrency=formant_qfrency * 1e-3, distortion=formant_timbre)
    except Exception as e:
        raise RuntimeError(f"{translations['errors_loading_audio']}: {e}")

    return audio.flatten()
def process_audio(logger, file_path, output_path):
    """Split an audio file into its non-silent segments, one WAV per segment.

    Args:
        logger: Logger for per-chunk debug lines and the final chunk count.
        file_path: Audio file to split.
        output_path: Directory that receives ``chunk{i}.wav`` files; an
            existing file with the same name is removed first.

    Returns:
        ``(cut_files, time_stamps)``: the written chunk paths and the matching
        ``(start, end)`` pairs as reported by ``silence.detect_nonsilent``.

    Raises:
        RuntimeError: For any failure while loading, splitting or exporting.
    """
    try:
        song = pydub_convert(pydub_load(file_path))
        nonsilent_ranges = silence.detect_nonsilent(song, min_silence_len=250, silence_thresh=-60)

        cut_files = []
        time_stamps = []
        for index, (begin, finish) in enumerate(nonsilent_ranges):
            segment = song[begin:finish]
            target = os.path.join(output_path, f"chunk{index}.wav")
            logger.debug(f"{target}: {len(segment)}")

            # Drop any leftover file before writing the new chunk.
            if os.path.exists(target):
                os.remove(target)
            segment.export(target, format="wav")

            cut_files.append(target)
            time_stamps.append((begin, finish))

        logger.info(f"{translations['split_total']}: {len(cut_files)}")
        return cut_files, time_stamps
    except Exception as e:
        raise RuntimeError(f"{translations['process_audio_error']}: {e}")
def merge_audio(files_list, time_stamps, original_file_path, output_path, format):
    """Stitch chunk files back together, padding the gaps with silence.

    Args:
        files_list: Paths of the chunk files to join.
        time_stamps: ``(start, end)`` pairs, paired with the sorted files in
            order.
        original_file_path: File whose length sets the total output length.
        output_path: Destination of the merged audio.
        format: Export format handed to ``AudioSegment.export``.

    Returns:
        ``output_path``.

    Raises:
        RuntimeError: For any failure while loading, merging or exporting.
    """
    try:
        number_pattern = re.compile(r'_(\d+)')

        def extract_number(filename):
            # Sort key: the first "_<digits>" run in the string, else 0.
            # NOTE(review): names without such a run all get key 0 and keep
            # their input order (sorted() is stable). Confirm this matches how
            # callers name the chunk files.
            found = number_pattern.search(filename)
            if found is None:
                return 0
            return int(found.group(1))

        total_duration = len(pydub_load(original_file_path))
        ordered_files = sorted(files_list, key=extract_number)

        combined = AudioSegment.empty()
        current_position = 0
        for chunk_path, stamp in zip(ordered_files, time_stamps):
            begin, finish = stamp

            # Fill the distance from the previous chunk's end with silence.
            gap = begin - current_position
            if gap > 0:
                combined += AudioSegment.silent(duration=gap)

            combined += pydub_load(chunk_path)
            current_position = finish

        # Pad the tail so the result is as long as the original file.
        remaining = total_duration - current_position
        if remaining > 0:
            combined += AudioSegment.silent(duration=remaining)

        combined.export(output_path, format=format)
        return output_path
    except Exception as e:
        raise RuntimeError(f"{translations['merge_error']}: {e}")
def pydub_convert(audio):
    """Return a 16-bit ``AudioSegment`` built from ``audio``.

    The original reinterpreted the raw bytes as int16 regardless of the
    segment's sample width, and its dtype check could never be true because
    ``frombuffer`` was already called with ``dtype=np.int16``. Segments that
    are not 16-bit are now converted with ``set_sample_width(2)`` first, so
    the bytes really are int16 samples. 16-bit input is handled as before.

    Args:
        audio: A pydub ``AudioSegment``.

    Returns:
        A new ``AudioSegment`` with the same frame rate and channel count and
        a sample width of 2 bytes.
    """
    if audio.sample_width != 2:
        audio = audio.set_sample_width(2)

    samples = np.frombuffer(audio.raw_data, dtype=np.int16)
    return AudioSegment(samples.tobytes(), frame_rate=audio.frame_rate, sample_width=samples.dtype.itemsize, channels=audio.channels)
def pydub_load(input_path):
    """Load ``input_path`` as an ``AudioSegment``, picking the reader by suffix.

    ``.wav``, ``.mp3`` and ``.ogg`` paths use the dedicated constructors;
    anything else goes through ``AudioSegment.from_file``.
    """
    suffix_readers = (
        (".wav", "from_wav"),
        (".mp3", "from_mp3"),
        (".ogg", "from_ogg"),
    )
    for suffix, reader_name in suffix_readers:
        if input_path.endswith(suffix):
            return getattr(AudioSegment, reader_name)(input_path)

    return AudioSegment.from_file(input_path)