import os
import torch
import random
import shutil
import librosa
import warnings
import numpy as np
import gradio as gr
import librosa.display
import matplotlib.pyplot as plt
from utils import get_modelist, find_wav_files, embed_img, TEMP_DIR
from model import EvalNet
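
# Map each playing-technique class label to its bilingual (Chinese + pinyin) display name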
TRANSLATE = {
    "vibrato": "揉弦 Rou xian",
    "trill": "颤音 Chan yin",
    "tremolo": "颤弓 Chan gong",
    "staccato": "顿弓 Dun gong",
    "ricochet": "抛弓 Pao gong",
    "pizzicato": "拨弦 Bo xian",
    "percussive": "击弓 Ji gong",
    "legato_slide_glissando": "连滑音 Lian hua yin",
    "harmonic": "泛音 Fan yin",
    "diangong": "垫弓 Dian gong",
    "detache": "分弓 Fen gong",
}
CLASSES = list(TRANSLATE.keys())
SAMPLE_RATE = 44100
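

# Loop (circularly pad) a clip shorter than `dur` seconds by repeating it, then truncate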
def circular_padding(y: np.ndarray, sr: int, dur=3):
    if len(y) >= sr * dur:
        return y[: sr * dur]

    size = sr * dur // len(y) + int((sr * dur) % len(y) > 0)
    arrays = []
    for _ in range(size):
        arrays.append(y)

    y = np.hstack(arrays)
    return y[: sr * dur]
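

# Render a log-scaled Mel spectrogram of the padded audio and save it as
# TEMP_DIR/output.jpg, the image later fed to the classification model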
def wav2mel(audio_path: str):
    os.makedirs(TEMP_DIR, exist_ok=True)
    try:
        y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
        y = circular_padding(y, sr)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
        log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
        librosa.display.specshow(log_mel_spec)
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/output.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()

    except Exception as e:
        print(f"Error converting {audio_path} : {e}")
def wav2cqt(audio_path: str):
    os.makedirs(TEMP_DIR, exist_ok=True)
    try:
        y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
        y = circular_padding(y, sr)
        cqt_spec = librosa.cqt(y=y, sr=sr)
        log_cqt_spec = librosa.power_to_db(np.abs(cqt_spec) ** 2, ref=np.max)
        librosa.display.specshow(log_cqt_spec)
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/output.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()

    except Exception as e:
        print(f"Error converting {audio_path} : {e}")
def wav2chroma(audio_path: str):
    os.makedirs(TEMP_DIR, exist_ok=True)
    try:
        y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
        y = circular_padding(y, sr)
        chroma_spec = librosa.feature.chroma_stft(y=y, sr=sr)
        log_chroma_spec = librosa.power_to_db(np.abs(chroma_spec) ** 2, ref=np.max)
        librosa.display.specshow(log_chroma_spec)
        plt.axis("off")
        plt.savefig(
            f"{TEMP_DIR}/output.jpg",
            bbox_inches="tight",
            pad_inches=0.0,
        )
        plt.close()

    except Exception as e:
        print(f"Error converting {audio_path} : {e}")
def infer(wav_path: str, log_name: str, folder_path=TEMP_DIR):
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)

    if not wav_path:
        return None, "请输入音频 Please input an audio!"

    try:
        model = EvalNet(log_name, len(TRANSLATE)).model
    except Exception as e:
        return None, f"{e}"

    spec = log_name.split("_")[-3]
    eval("wav2%s" % spec)(wav_path)
    img = embed_img(f"{folder_path}/output.jpg")
    output: torch.Tensor = model(img)
    pred_id = torch.max(output.data, 1)[1]
    return (
        os.path.basename(wav_path),
        f"{TRANSLATE[CLASSES[pred_id]]} ({CLASSES[pred_id].capitalize()})",
    )
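

# Build and launch the Gradio demo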
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    models = get_modelist()
    examples = []
    example_wavs = find_wav_files()
    model_num = len(models)
    for wav in example_wavs:
        examples.append([wav, models[random.randint(0, model_num - 1)]])

    with gr.Blocks() as demo:
        gr.Interface(
            fn=infer,
            inputs=[
                gr.Audio(label="上传录音 Upload a recording", type="filepath"),
                gr.Dropdown(
                    choices=models, label="选择模型 Select a model", value=models[0]
                ),
            ],
            outputs=[
                gr.Textbox(label="音频文件名 Audio filename", show_copy_button=True),
                gr.Textbox(
                    label="演奏技法识别 Playing tech recognition", show_copy_button=True
                ),
            ],
            examples=examples,
            cache_examples=False,
            allow_flagging="never",
            title="建议录音时长保持在 3s 左右<br>It is recommended to keep the recording length around 3s.",
        )
        gr.Markdown(
            """
# 引用 Cite
```bibtex
@dataset{zhaorui_liu_2021_5676893,
  author    = {Monan Zhou, Shenyang Xu, Zhaorui Liu, Zhaowen Wang, Feng Yu, Wei Li and Baoqiang Han},
  title     = {CCMusic: an Open and Diverse Database for Chinese and General Music Information Retrieval Research},
  month     = {mar},
  year      = {2024},
  publisher = {HuggingFace},
  version   = {1.2},
  url       = {https://huggingface.co/ccmusic-database}
}
```"""
        )

    demo.launch()