Spaces:

CCockrum
/

Tune-Splitter

Sleeping

File size: 8,409 Bytes

52685e3

# pipeline.py

import json
import os
import time
import traceback

import librosa
import numpy as np
import torch

from utils import logger, remove_directory_contents, create_directories
from inference import run_mdx, run_mdx_beta, convert_to_stereo_and_wav, get_hash, random_sleep
from effects import add_vocal_effects, add_instrumental_effects


def _run_mdx_with_fallback(*args, **kwargs):
    """Call ``run_mdx``; if it fails, log the error and retry with ``run_mdx_beta``.

    Both callables receive exactly the same arguments. The result is unpacked
    into two values inside the ``try`` so that an unexpectedly shaped result
    from ``run_mdx`` also triggers the fallback.
    """
    try:
        first_path, second_path = run_mdx(*args, **kwargs)
    except Exception as error:
        # Keep the best-effort retry, but do not hide why the first attempt failed.
        logger.warning(f"run_mdx failed ({error}); retrying with run_mdx_beta.")
        first_path, second_path = run_mdx_beta(*args, **kwargs)
    return first_path, second_path


def process_uvr_task(
    orig_song_path: str,
    main_vocals: bool = False,
    dereverb: bool = True,
    song_id: str = "mdx",
    only_voiceless: bool = False,
    remove_files_output_dir: bool = False,
    mdx_models_dir: str = "mdx_models",
    output_dir: str = "clean_song_output",
):
    """Separate a song into stems by running a chain of MDX ONNX models.

    Args:
        orig_song_path: Path of the input audio. It is first passed through
            ``convert_to_stereo_and_wav`` and the converted path is used.
        main_vocals: When True, run ``UVR_MDXNET_KARA_2.onnx`` on the isolated
            vocals to produce "Backup" and "Main" outputs.
        dereverb: When True, run ``Reverb_HQ_By_FoxJoy.onnx`` on the main
            vocals to produce a "DeReverb" output.
        song_id: Name of the sub-directory of ``output_dir`` that receives
            the separated files.
        only_voiceless: When True, run only ``UVR-MDX-NET-Inst_HQ_4.onnx``
            and return its result directly.
        remove_files_output_dir: When True, clear ``output_dir`` before
            processing.
        mdx_models_dir: Directory holding ``data.json`` (model parameters)
            and the ``.onnx`` model files.
        output_dir: Root directory for converted and separated audio.

    Returns:
        If ``only_voiceless`` is True, whatever ``run_mdx`` returns for the
        instrumental model (``sound_separate`` unpacks it as two values).
        Otherwise a 5-tuple ``(vocals_path, instrumentals_path,
        backup_vocals_path, main_vocals_path, vocals_dereverb_path)`` where
        ``backup_vocals_path`` is None and ``main_vocals_path`` equals
        ``vocals_path`` unless ``main_vocals`` is True, and
        ``vocals_dereverb_path`` equals ``main_vocals_path`` unless
        ``dereverb`` is True.
    """
    device_base = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Device: {device_base}")

    if remove_files_output_dir:
        remove_directory_contents(output_dir)

    with open(os.path.join(mdx_models_dir, "data.json"), encoding="utf-8") as infile:
        mdx_model_params = json.load(infile)

    song_output_dir = os.path.join(output_dir, song_id)
    create_directories(song_output_dir)
    orig_song_path = convert_to_stereo_and_wav(orig_song_path, output_dir)

    # The device report is purely diagnostic, so a missing onnxruntime module
    # must not abort the separation itself.
    try:
        import onnxruntime as ort
    except ImportError:
        logger.warning("onnxruntime is not importable; skipping ONNX Runtime device report.")
    else:
        logger.info(f"ONNX Runtime Device >> {ort.get_device()}")

    if only_voiceless:
        logger.info("Voiceless Track Separation...")
        return run_mdx(
            mdx_model_params,
            song_output_dir,
            os.path.join(mdx_models_dir, "UVR-MDX-NET-Inst_HQ_4.onnx"),
            orig_song_path,
            suffix="Voiceless",
            denoise=False,
            keep_orig=True,
            exclude_inversion=True,
            device_base=device_base,
        )

    logger.info("Vocal Track Isolation...")
    vocals_path, instrumentals_path = run_mdx(
        mdx_model_params,
        song_output_dir,
        os.path.join(mdx_models_dir, "UVR-MDX-NET-Voc_FT.onnx"),
        orig_song_path,
        denoise=True,
        keep_orig=True,
        device_base=device_base,
    )

    # Without the optional main/backup split the full vocal stem is the "main" one.
    backup_vocals_path, main_vocals_path = None, vocals_path

    if main_vocals:
        random_sleep()
        backup_vocals_path, main_vocals_path = _run_mdx_with_fallback(
            mdx_model_params,
            song_output_dir,
            os.path.join(mdx_models_dir, "UVR_MDXNET_KARA_2.onnx"),
            vocals_path,
            suffix="Backup",
            invert_suffix="Main",
            denoise=True,
            device_base=device_base,
        )

    # Without de-reverb the caller still gets a usable path in the last slot.
    vocals_dereverb_path = main_vocals_path
    if dereverb:
        random_sleep()
        _, vocals_dereverb_path = _run_mdx_with_fallback(
            mdx_model_params,
            song_output_dir,
            os.path.join(mdx_models_dir, "Reverb_HQ_By_FoxJoy.onnx"),
            main_vocals_path,
            invert_suffix="DeReverb",
            exclude_main=True,
            denoise=True,
            device_base=device_base,
        )

    return vocals_path, instrumentals_path, backup_vocals_path, main_vocals_path, vocals_dereverb_path


def _effects_output_path(media_dir, audio_path):
    """Build the path of the "_effects" variant of ``audio_path``.

    ``file_name`` is derived from an absolute path, and ``os.path.join``
    discards earlier components when a later one is absolute, so the result
    sits next to ``audio_path`` rather than inside ``media_dir``. The
    expression is kept as-is so output locations do not change.
    """
    file_name, file_extension = os.path.splitext(os.path.abspath(audio_path))
    return os.path.join(media_dir, f"{file_name}_effects{file_extension}")


def sound_separate(media_file, stem, main, dereverb,
                   vocal_effects=True, background_effects=True,
                   vocal_reverb_room_size=0.6, vocal_reverb_damping=0.6, vocal_reverb_dryness=0.8, vocal_reverb_wet_level=0.35,
                   vocal_delay_seconds=0.4, vocal_delay_mix=0.25,
                   vocal_compressor_threshold_db=-25, vocal_compressor_ratio=3.5,
                   vocal_compressor_attack_ms=10, vocal_compressor_release_ms=60,
                   vocal_gain_db=4,
                   background_highpass_freq=120, background_lowpass_freq=11000,
                   background_reverb_room_size=0.5, background_reverb_damping=0.5, background_reverb_wet_level=0.25,
                   background_compressor_threshold_db=-20, background_compressor_ratio=2.5,
                   background_compressor_attack_ms=15, background_compressor_release_ms=80,
                   background_gain_db=3):
    """Extract one stem from ``media_file`` and optionally apply effects to it.

    Args:
        media_file: Path of the audio file to process.
        stem: ``"vocal"`` or ``"background"``; selects which stem is produced.
        main: Forwarded to ``process_uvr_task`` as ``main_vocals``
            (vocal stem only).
        dereverb: Forwarded to ``process_uvr_task`` as ``dereverb``
            (vocal stem only).
        vocal_effects: When True, post-process the vocal stem with
            ``add_vocal_effects``.
        background_effects: When True, post-process the background stem with
            ``add_instrumental_effects``.
        vocal_*: Settings forwarded to ``add_vocal_effects``.
        background_*: Settings forwarded to ``add_instrumental_effects``.

    Returns:
        A list with the path of the resulting audio file.

    Raises:
        ValueError: If ``media_file`` is empty or ``stem`` is not one of the
            two supported values.
        Exception: "Error in sound separation." if vocal processing failed;
            the underlying error is attached as the cause. Errors raised
            while producing the background stem propagate unchanged.
    """
    if not media_file:
        raise ValueError("The audio path is missing.")
    # Reject empty and unknown stems up front instead of doing work and then
    # failing with a generic error.
    if stem not in ("vocal", "background"):
        raise ValueError("Please select 'vocal' or 'background' stem.")

    hash_audio = str(get_hash(media_file))
    media_dir = os.path.dirname(media_file)
    outputs = []
    failure = None

    start_time = time.time()

    # The duration itself is not used; the call only surfaces problems
    # reading the file in the log before the separation starts.
    try:
        librosa.get_duration(filename=media_file)
    except Exception as e:
        logger.warning(str(e))

    if stem == "vocal":
        try:
            _, _, _, _, vocal_audio = process_uvr_task(
                orig_song_path=media_file,
                song_id=hash_audio + "mdx",
                main_vocals=main,
                dereverb=dereverb,
                remove_files_output_dir=False,
            )

            if vocal_effects:
                out_effects_path = _effects_output_path(media_dir, vocal_audio)
                add_vocal_effects(vocal_audio, out_effects_path,
                                  reverb_room_size=vocal_reverb_room_size,
                                  reverb_damping=vocal_reverb_damping,
                                  vocal_reverb_dryness=vocal_reverb_dryness,
                                  reverb_wet_level=vocal_reverb_wet_level,
                                  delay_seconds=vocal_delay_seconds,
                                  delay_mix=vocal_delay_mix,
                                  compressor_threshold_db=vocal_compressor_threshold_db,
                                  compressor_ratio=vocal_compressor_ratio,
                                  compressor_attack_ms=vocal_compressor_attack_ms,
                                  compressor_release_ms=vocal_compressor_release_ms,
                                  gain_db=vocal_gain_db)
                vocal_audio = out_effects_path

            outputs.append(vocal_audio)

        except Exception as error:
            logger.error(str(error))
            traceback.print_exc()
            # Remember the cause so the generic error raised below can carry it.
            failure = error

    if stem == "background":
        background_audio, _ = process_uvr_task(
            orig_song_path=media_file,
            song_id=hash_audio + "voiceless",
            only_voiceless=True,
            remove_files_output_dir=False,
        )

        if background_effects:
            out_effects_path = _effects_output_path(media_dir, background_audio)
            add_instrumental_effects(background_audio, out_effects_path,
                                     highpass_freq=background_highpass_freq,
                                     lowpass_freq=background_lowpass_freq,
                                     reverb_room_size=background_reverb_room_size,
                                     reverb_damping=background_reverb_damping,
                                     reverb_wet_level=background_reverb_wet_level,
                                     compressor_threshold_db=background_compressor_threshold_db,
                                     compressor_ratio=background_compressor_ratio,
                                     compressor_attack_ms=background_compressor_attack_ms,
                                     compressor_release_ms=background_compressor_release_ms,
                                     gain_db=background_gain_db)
            background_audio = out_effects_path

        outputs.append(background_audio)

    logger.info(f"Execution time: {time.time() - start_time:.2f} seconds")

    if not outputs:
        raise Exception("Error in sound separation.") from failure

    return outputs