Upload with huggingface_hub
- DESCRIPTION.md +1 -0
- README.md +6 -7
- data_setups.py +80 -0
- requirements.txt +6 -0
- run.py +50 -0
DESCRIPTION.md
ADDED
@@ -0,0 +1 @@
This demo identifies musical instruments from an audio file. It uses Gradio's Audio and Label components.
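That one-sentence description maps directly onto the two Gradio pieces it names: an Audio input component and a Label output component. As a rough sketch of the wiring (the classify function here is a hypothetical stand-in; run.py further down is the real app):

import gradio as gr

def classify(audio_path):
    # hypothetical stand-in: a real model would map the uploaded clip to class probabilities
    return {"Piano": 0.9, "Violin": 0.1}

demo = gr.Interface(fn=classify,
                    inputs=gr.Audio(type="filepath"),     # upload or record a clip
                    outputs=gr.Label(num_top_classes=3))  # show the top predicted instruments
demo.launch()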
README.md
CHANGED
@@ -1,12 +1,11 @@
+
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: musical_instrument_identification_main
+emoji: 🔥
+colorFrom: indigo
+colorTo: indigo
 sdk: gradio
 sdk_version: 3.6
-app_file:
+app_file: run.py
 pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
data_setups.py
ADDED
@@ -0,0 +1,80 @@
# Audio preprocessing utilities for the instrument classification demo
import os
import librosa
import torch
import numpy as np
from torchaudio.transforms import Resample

SAMPLE_RATE = 44100
AUDIO_LEN = 2.90

# Parameters to control the MelSpec generation
N_MELS = 128
F_MIN = 20
F_MAX = 16000
N_FFT = 1024
HOP_LEN = 512

# Make function to find classes in target directory
def find_classes(directory: str):
    # 1. Get the class names by scanning the target directory
    classes = sorted(entry.name for entry in os.scandir(directory) if entry.is_dir())
    # 2. Raise an error if class names not found
    if not classes:
        raise FileNotFoundError(f"Couldn't find any classes in {directory}.")
    # 3. Create a dictionary of index labels (computers prefer numerical rather than string labels)
    class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
    return classes, class_to_idx

def resample(wav, sample_rate, new_sample_rate):
    # Mix multi-channel audio down to mono
    if wav.shape[0] >= 2:
        wav = torch.mean(wav, dim=0)
    else:
        wav = wav.squeeze(0)
    # Only downsample; clips already at or below the target rate are left unchanged
    if sample_rate > new_sample_rate:
        resampler = Resample(sample_rate, new_sample_rate)
        wav = resampler(wav)
    return wav

def mono_to_color(X, eps=1e-6, mean=None, std=None):
    # Stack the spectrogram into 3 identical channels so it looks like an RGB image
    X = np.stack([X, X, X], axis=-1)
    # Standardize
    mean = mean or X.mean()
    std = std or X.std()
    X = (X - mean) / (std + eps)
    # Normalize to [0, 255]
    _min, _max = X.min(), X.max()
    if (_max - _min) > eps:
        V = np.clip(X, _min, _max)
        V = 255 * (V - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        V = np.zeros_like(X, dtype=np.uint8)
    return V

def normalize(image, mean=None, std=None):
    # Scale to [0, 1], optionally standardize, and move channels first (C, H, W)
    image = image / 255.0
    if mean is not None and std is not None:
        image = (image - mean) / std
    return np.moveaxis(image, 2, 0).astype(np.float32)

def compute_melspec(wav, sample_rate=SAMPLE_RATE):
    # Mel spectrogram converted to decibel scale
    melspec = librosa.feature.melspectrogram(
        y=wav,
        sr=sample_rate,
        n_fft=N_FFT,
        fmin=F_MIN,
        fmax=F_MAX,
        n_mels=N_MELS,
        hop_length=HOP_LEN
    )
    melspec = librosa.power_to_db(melspec).astype(np.float32)
    return melspec

def audio_preprocess(wav, sample_rate):
    # Waveform tensor -> mel spectrogram -> normalized 3-channel (C, H, W) tensor
    wav = wav.numpy()
    melspec = compute_melspec(wav, sample_rate)
    image = mono_to_color(melspec)
    image = normalize(image, mean=None, std=None)
    image = torch.from_numpy(image)
    return image
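Taken together, audio_preprocess is the whole inference-time pipeline: mix a clip down to mono at 44.1 kHz, compute a mel spectrogram, and pack it into a normalized 3-channel, channels-first tensor. A short sketch of driving these helpers directly, assuming some local clip.wav exists:

import torchaudio
from data_setups import resample, audio_preprocess, SAMPLE_RATE, AUDIO_LEN

waveform, sr = torchaudio.load("clip.wav")             # assumed local file
wav = resample(waveform, sr, SAMPLE_RATE)              # mono, downsampled to 44.1 kHz if needed
wav = wav[:int(AUDIO_LEN * SAMPLE_RATE)]               # fixed 2.90 s window
img = audio_preprocess(wav, SAMPLE_RATE).unsqueeze(0)  # add a batch dimension
print(img.shape)                                       # (1, 3, n_mels, time frames)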
requirements.txt
ADDED
@@ -0,0 +1,6 @@
torch==1.12.0
torchvision==0.13.0
torchaudio==0.12.0
gradio==3.1.4
librosa==0.9.2
gdown
https://gradio-main-build.s3.amazonaws.com/c3bec6153737855510542e8154391f328ac72606/gradio-3.6-py3-none-any.whl
run.py
ADDED
@@ -0,0 +1,50 @@
import gradio as gr
import torch, torchaudio
from timeit import default_timer as timer
from data_setups import audio_preprocess, resample
import gdown

# Download an example clip and the trained model checkpoint from Google Drive
url = 'https://drive.google.com/uc?id=1X5CR18u0I-ZOi_8P0cNptCe5JGk9Ro0C'
output = 'piano.wav'
gdown.download(url, output, quiet=False)
url = 'https://drive.google.com/uc?id=1W-8HwmGR5SiyDbUcGAZYYDKdCIst07__'
output = 'torch_efficientnet_fold2_CNN.pth'
gdown.download(url, output, quiet=False)

device = "cuda" if torch.cuda.is_available() else "cpu"
SAMPLE_RATE = 44100
AUDIO_LEN = 2.90
model = torch.load("torch_efficientnet_fold2_CNN.pth", map_location=torch.device('cpu'))
LABELS = [
    "Cello", "Clarinet", "Flute", "Acoustic Guitar", "Electric Guitar", "Organ", "Piano", "Saxophone", "Trumpet", "Violin", "Voice"
]
example_list = [
    ["piano.wav"]
]


def predict(audio_path):
    start_time = timer()
    waveform, sample_rate = torchaudio.load(audio_path)
    # Mix down to mono and resample to the 44.1 kHz rate the model expects
    wav = resample(waveform, sample_rate, SAMPLE_RATE)
    # Trim to a fixed 2.90 s window; bail out on clips that are too short
    if len(wav) > int(AUDIO_LEN * SAMPLE_RATE):
        wav = wav[:int(AUDIO_LEN * SAMPLE_RATE)]
    else:
        print(f"input length {len(wav)} too small! need over {int(AUDIO_LEN * SAMPLE_RATE)} samples")
        return
    # Convert the waveform to a spectrogram "image" batch and run the CNN
    img = audio_preprocess(wav, SAMPLE_RATE).unsqueeze(0)
    model.eval()
    with torch.inference_mode():
        pred_probs = torch.softmax(model(img), dim=1)
    pred_labels_and_probs = {LABELS[i]: float(pred_probs[0][i]) for i in range(len(LABELS))}
    pred_time = round(timer() - start_time, 5)
    return pred_labels_and_probs, pred_time

demo = gr.Interface(fn=predict,
                    inputs=gr.Audio(type="filepath"),
                    outputs=[gr.Label(num_top_classes=11, label="Predictions"),
                             gr.Number(label="Prediction time (s)")],
                    examples=example_list,
                    cache_examples=False
                    )

demo.launch(debug=False)
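Outside the Spaces UI, the same predict function doubles as a quick smoke test of the whole path (downloads, preprocessing, model). A minimal sketch, assuming the gdown calls above have already fetched piano.wav and the checkpoint:

# quick local smoke test, after the downloads above have completed
probs, seconds = predict("piano.wav")
top = max(probs, key=probs.get)
print(f"top prediction: {top} ({probs[top]:.2f}), inference time {seconds}s")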