import gradio as gr
import numpy as np
from PIL import Image
from scipy.io import wavfile
import torch
import torchaudio


def convert(audio):
    # read the uploaded WAV file
    rate, data = wavfile.read(audio)

    # resample from 48000 to 44100
    # from scipy.signal import resample
    # data = resample(data, int(data.shape[0] * 44100 / 48000))

    # convert to mono (wavfile.read returns shape (samples, channels) for stereo)
    if data.ndim > 1:
        data = np.mean(data, axis=1)

    # convert to float32
    data = data.astype(np.float32)

    # take a fixed 7 second slice of the audio (seconds 7 to 14)
    data = data[rate * 7 : rate * 14]

    spectrogram = spectrogram_from_waveform(
        waveform=data,
        sample_rate=rate,
        # width=768,
        n_fft=8192,
        hop_length=512,
        win_length=8192,
    )

    spec = image_from_spectrogram(spectrogram)

    return spec


def spectrogram_from_waveform(
    waveform: np.ndarray,
    sample_rate: int,
    n_fft: int,
    hop_length: int,
    win_length: int,
    mel_scale: bool = True,
    n_mels: int = 512,
) -> np.ndarray:
    """
    Compute a spectrogram magnitude array from a waveform.
    """
    spectrogram_func = torchaudio.transforms.Spectrogram(
        n_fft=n_fft,
        power=None,
        hop_length=hop_length,
        win_length=win_length,
    )

    # complex STFT, then magnitude
    waveform_tensor = torch.from_numpy(waveform.astype(np.float32)).reshape(1, -1)
    Sxx_complex = spectrogram_func(waveform_tensor).numpy()[0]
    Sxx_mag = np.abs(Sxx_complex)

    if mel_scale:
        mel_scaler = torchaudio.transforms.MelScale(
            n_mels=n_mels,
            sample_rate=sample_rate,
            f_min=0,
            f_max=10000,
            n_stft=n_fft // 2 + 1,
            norm=None,
            mel_scale="htk",
        )

        Sxx_mag = mel_scaler(torch.from_numpy(Sxx_mag)).numpy()

    return Sxx_mag


def image_from_spectrogram(
    spectrogram: np.ndarray, max_volume: float = 50, power_for_image: float = 0.25
) -> Image.Image:
    """
    Compute a spectrogram image from a spectrogram magnitude array.
    """
    # Apply the power curve to compress the dynamic range
    data = np.power(spectrogram, power_for_image)

    # Rescale to 0-255
    data = data * 255 / max_volume

    # Invert so that louder values render darker
    data = 255 - data

    # Convert to a PIL image
    image = Image.fromarray(data.astype(np.uint8))

    # Flip Y so low frequencies are at the bottom of the image
    image = image.transpose(Image.FLIP_TOP_BOTTOM)

    # Convert to RGB
    image = image.convert("RGB")

    return image


gr.Interface(
    fn=convert,
    inputs=[gr.Audio(source="upload", type="filepath")],
    outputs=[gr.Image()],
).launch()
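
# A minimal usage sketch without the Gradio UI: convert() can be called directly on a
# local WAV file path and returns a PIL image. "example_clip.wav" and "spectrogram.png"
# are hypothetical file names, not part of the original script.
#
#   image = convert("example_clip.wav")
#   image.save("spectrogram.png")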