|
|
|
|
|
import os |
|
from pathlib import Path |
|
from typing import Final |
|
|
|
|
|
from huggingface_hub import hf_hub_download |
|
|
|
|
|
import numpy as np |
|
import librosa |
|
import audioread |
|
from piano_transcription_inference import utilities |
|
|
|
|
|
|
|
|
|
# File name of the model file requested from the Hugging Face Hub repository;
# also used as the file name inside the local "models" directory.
MODEL_NAME: Final[str] = "CRNN_note_F1=0.9677_pedal_F1=0.9186.pth"

# Hugging Face Hub repository id passed to hf_hub_download as `repo_id`.
REPO_ID: Final[str] = "Genius-Society/piano_trans"
|
|
|
|
|
|
|
|
|
def download_model_from_hf_if_needed():
    """Make sure the model file is present locally, downloading it if needed.

    Requests ``MODEL_NAME`` from the Hugging Face Hub repository ``REPO_ID``
    into the ``models`` directory that sits next to this file's parent
    directory. ``hf_hub_download`` handles caching and existence checks
    automatically, so no manual "file already there?" test is done here.

    Failures are reported on stdout and are not re-raised; the function
    always returns ``None``.
    """
    # <directory of this file>/../models/<MODEL_NAME>
    model_dir = Path(__file__).parent.parent / "models"
    model_path = model_dir / MODEL_NAME

    print(f"Checking for model '{MODEL_NAME}' from Hugging Face Hub repo '{REPO_ID}'...")

    try:
        hf_hub_download(
            repo_id=REPO_ID,
            filename=MODEL_NAME,
            local_dir=model_dir,
        )
    except OSError as e:
        # Hub HTTP errors (missing repo/file, auth), connection failures and
        # local filesystem problems are OSError subclasses. The previous
        # `except AttributeError` never matched them, so this hint was
        # unreachable for the very failures it describes.
        print("Error downloading from Hugging Face Hub. Please check your network connection and the repo/filename.")
        print(f"Details: {e}")
    except Exception as e:
        # Best-effort start-up: report anything else without crashing.
        print(f"An unexpected error occurred: {e}")
    else:
        # Only announce success when the download call returned normally.
        print(f"Model is available at '{model_path}'")
|
|
|
|
|
|
|
|
|
|
|
def _fixed_load_audio(path, sr=22050, mono=True, offset=0.0, duration=None,
                      dtype=np.float32, res_type='kaiser_best',
                      backends=[audioread.ffdec.FFmpegAudioFile]):
    """Decode an audio file via audioread and return ``(samples, sample_rate)``.

    A patched version of ``load_audio`` that uses the function paths of
    newer librosa versions (``librosa.util.buf_to_float``,
    ``librosa.to_mono`` and ``librosa.resample`` with ``orig_sr`` /
    ``target_sr`` passed by keyword). It is intended to replace the original
    function in the ``piano_transcription_inference`` library.

    Args:
        path: Audio file to decode; resolved with ``os.path.realpath``.
        sr: Target sample rate. ``None`` keeps the file's native rate.
        mono: Down-mix multi-channel audio to a single channel.
        offset: Start reading this many seconds into the file.
        duration: Read at most this many seconds; ``None`` reads to the end.
        dtype: dtype of the decoded and returned samples.
        res_type: Resampling method forwarded to ``librosa.resample``.
        backends: audioread backend classes forwarded to
            ``audioread.audio_open``. The default list is only read here,
            never mutated.

    Returns:
        A tuple ``(y, sr)``: the samples as a contiguous array of ``dtype``
        and the sample rate they are expressed in. If nothing was decoded,
        ``y`` is empty and ``sr`` is the requested rate, or the file's native
        rate when ``sr`` was ``None``.
    """
    chunks = []
    with audioread.audio_open(os.path.realpath(path), backends=backends) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        # Positions are counted in interleaved samples (frames * channels),
        # matching the flat buffers yielded by the decoder.
        s_start = int(np.round(sr_native * offset)) * n_channels
        if duration is None:
            s_end = np.inf
        else:
            s_end = s_start + (int(np.round(sr_native * duration)) * n_channels)

        n = 0
        for frame in input_file:
            frame = librosa.util.buf_to_float(frame, dtype=dtype)
            n_prev = n
            n = n + len(frame)

            if n < s_start:
                # The requested offset lies beyond this buffer; keep reading.
                continue

            if s_end < n_prev:
                # The requested window ended before this buffer; stop.
                break

            if s_end < n:
                # The window ends inside this buffer; crop its tail.
                frame = frame[:s_end - n_prev]

            if n_prev <= s_start <= n:
                # The window starts inside this buffer; crop its head.
                frame = frame[(s_start - n_prev):]

            chunks.append(frame)

    if chunks:
        y = np.concatenate(chunks)

        if n_channels > 1:
            # De-interleave into shape (channels, samples).
            y = y.reshape((-1, n_channels)).T
            if mono:
                y = librosa.to_mono(y)

        if sr is not None:
            y = librosa.resample(y, orig_sr=sr_native, target_sr=sr, res_type=res_type)
    else:
        y = chunks

    # Resolve the native-rate fallback even when no audio was decoded, so the
    # returned sample rate is never None (previously it was only resolved
    # inside the non-empty branch).
    if sr is None:
        sr = sr_native

    # Final clean-up for dtype and memory layout.
    y = np.ascontiguousarray(y, dtype=dtype)

    return (y, sr)
|
|
|
|
|
def apply_monkey_patch():
    """Install the librosa-compatible audio loader into the library.

    Rebinds the ``load_audio`` attribute of the
    ``piano_transcription_inference.utilities`` module to
    ``_fixed_load_audio`` and prints a notice while doing so.

    NOTE(review): only lookups that go through ``utilities.load_audio`` see
    the replacement; a name bound earlier with
    ``from ... import load_audio`` keeps pointing at the old function.
    Confirm callers resolve it through ``utilities``.
    """
    print("Applying librosa compatibility patch...")
    setattr(utilities, "load_audio", _fixed_load_audio)
|
|
|
|
|
|
|
|
|
def initialize_app():
    """Run the application's start-up sequence; call once before anything else.

    Prints a start banner, fetches the model from Hugging Face, applies the
    librosa compatibility patch, and prints a completion banner.
    """
    print("--- Initializing Application ---")

    # Order matters only for the log output: download first, then patch.
    startup_steps = (download_model_from_hf_if_needed, apply_monkey_patch)
    for run_step in startup_steps:
        run_step()

    print("--- Initialization Complete ---")
|
|