|
r'''#=================================================================================================================== |
|
# |
|
# MIDI to Colab Audio Python Module
|
# |
|
# Converts any MIDI file to raw audio which is compatible |
|
# with Google Colab or Hugging Face Gradio
|
# |
|
# Version 2.0 |
|
# |
|
# Includes full source code of MIDI and pyfluidsynth |
|
# |
|
# Original source code for all modules was retrieved on 07/31/2025 |
|
# |
|
# Project Los Angeles |
|
# Tegridy Code 2025 |
|
# |
|
#=================================================================================================================== |
|
# |
|
# Critical dependencies |
|
# |
|
# pip install numpy |
|
# sudo apt install fluidsynth |
|
# |
|
#=================================================================================================================== |
|
# |
|
# Example usage: |
|
# |
|
# from midi_to_colab_audio import midi_to_colab_audio |
|
# from IPython.display import display, Audio |
|
# |
|
# raw_audio = midi_to_colab_audio('/content/input.mid') |
|
# |
|
# display(Audio(raw_audio, rate=16000, normalize=False)) |
|
# |
|
#=================================================================================================================== |
|
''' |
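
#===================================================================================================================
#
# Hugging Face Gradio usage (illustrative sketch; assumes the gradio package
# is installed -- gr.Audio accepts a (sample_rate, int16 array) tuple):
#
#   import gradio as gr
#
#   raw = midi_to_colab_audio('/content/input.mid', output_for_gradio=True)
#   gr.Interface(fn=lambda: (16000, raw), inputs=None, outputs=gr.Audio()).launch()
#
#===================================================================================================================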
|
|
|
import fluidsynth |
|
from src import MIDI |
|
|
|
|
|
|
|
import numpy as np |
|
import wave |
|
|
|
|
|
|
|
def normalize_audio(audio: np.ndarray, |
|
method: str = 'peak', |
|
target_level_db: float = -1.0, |
|
per_channel: bool = False, |
|
eps: float = 1e-9 |
|
) -> np.ndarray: |
|
|
|
""" |
|
Normalize audio to a target dBFS level. |
|
|
|
Parameters |
|
---------- |
|
audio : np.ndarray |
|
Float-valued array in range [-1, 1] with shape (channels, samples) |
|
or (samples,) for mono. |
|
    method : {'peak', 'rms'}
        - 'peak': scale so that max(|audio|) equals the linear target level
        - 'rms' : scale so that RMS(audio) equals the linear target level
    target_level_db : float
        Desired output level, in dBFS (0 dBFS = digital full scale). The
        linear target is 10 ** (target_level_db / 20); e.g. -1.0 dBFS
        corresponds to ~0.8913.
|
per_channel : bool |
|
If True, normalize each channel independently. Otherwise, use a |
|
global measure across all channels. |
|
eps : float |
|
Small constant to avoid division by zero. |
|
|
|
Returns |
|
------- |
|
normalized : np.ndarray |
|
Audio array of same shape, scaled so that levels meet the target. |
|
""" |
|
|
|
|
|
target_lin = 10 ** (target_level_db / 20.0) |
|
|
|
|
|
audio = audio.astype(np.float32) |
|
|
|
|
|
    was_mono = (audio.ndim == 1)
    if was_mono:
        audio = audio[np.newaxis, :]
|
|
|
|
|
axis = 1 if per_channel else None |
|
|
|
if method == 'peak': |
|
|
|
peak = np.max(np.abs(audio), axis=axis, keepdims=True) |
|
peak = np.maximum(peak, eps) |
|
scales = target_lin / peak |
|
|
|
elif method == 'rms': |
|
|
|
rms = np.sqrt(np.mean(audio ** 2, axis=axis, keepdims=True)) |
|
rms = np.maximum(rms, eps) |
|
scales = target_lin / rms |
|
|
|
else: |
|
raise ValueError(f"Unsupported method '{method}'; choose 'peak' or 'rms'.") |
|
|
|
|
|
normalized = audio * scales |
|
|
|
|
|
    normalized = np.clip(normalized, -1.0, 1.0)

    # Restore the original mono shape if the input was 1-D
    return normalized[0] if was_mono else normalized
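
# Illustrative check (hypothetical values): peak-normalizing a quiet stereo
# buffer to -1 dBFS scales its maximum absolute sample to 10 ** (-1 / 20):
#
#   t = np.linspace(0, 1, 16000)
#   quiet = np.vstack([0.25 * np.sin(2 * np.pi * 440 * t),
#                      0.25 * np.sin(2 * np.pi * 220 * t)])
#   loud = normalize_audio(quiet, method='peak', target_level_db=-1.0)
#   assert np.isclose(np.abs(loud).max(), 10 ** (-1 / 20), atol=1e-4)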
|
|
|
|
|
|
|
def midi_opus_to_colab_audio(midi_opus, |
|
soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2', |
|
sample_rate=16000, |
|
volume_level_db=-1, |
|
trim_silence=True, |
|
silence_threshold=0.1, |
|
output_for_gradio=False, |
|
write_audio_to_WAV='' |
|
): |
|
|
|
    if len(midi_opus) > 1 and midi_opus[1]:
|
|
|
ticks_per_beat, *tracks = midi_opus |
|
if not tracks: |
|
return None |
|
|
|
|
|
events = [] |
|
for track in tracks: |
|
abs_t = 0 |
|
for name, dt, *data in track: |
|
abs_t += dt |
|
events.append([name, abs_t, *data]) |
|
events.sort(key=lambda e: e[1]) |
|
|
|
|
|
fl = fluidsynth.Synth(samplerate=float(sample_rate)) |
|
sfid = fl.sfload(soundfont_path) |
|
for chan in range(16): |
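            # Channel 10 (index 9) carries GM percussion, served from bank 128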
|
|
|
fl.program_select(chan, sfid, 128 if chan == 9 else 0, 0) |
|
|
|
|
|
        # Default tempo: 120 BPM = 500,000 microseconds per quarter note
        tempo = int((60 / 120) * 1e6)
|
last_t = 0 |
|
        audio_chunks = []
|
|
|
for name, cur_t, *data in events: |
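            # tempo is in microseconds per quarter note: convert the ticks
            # elapsed since the previous event to seconds, then to samples;
            # e.g. 480 ticks at ticks_per_beat=480 and tempo=500000 gives
            # 0.5 s, i.e. 8000 samples at 16 kHz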
|
|
|
delta_ticks = cur_t - last_t |
|
last_t = cur_t |
|
dt_seconds = (delta_ticks / ticks_per_beat) * (tempo / 1e6) |
|
sample_len = int(dt_seconds * sample_rate) |
|
            if sample_len > 0:
                buf = fl.get_samples(sample_len).reshape(-1, 2)
                audio_chunks.append(buf)
|
|
|
|
|
if name == "note_on" and data[2] > 0: |
|
chan, note, vel = data |
|
fl.noteon(chan, note, vel) |
|
|
|
elif name == "note_off" or (name == "note_on" and data[2] == 0): |
|
chan, note = data[:2] |
|
fl.noteoff(chan, note) |
|
|
|
elif name == "patch_change": |
|
chan, patch = data[:2] |
|
bank = 128 if chan == 9 else 0 |
|
fl.program_select(chan, sfid, bank, patch) |
|
|
|
elif name == "control_change": |
|
chan, ctrl, val = data[:3] |
|
fl.cc(chan, ctrl, val) |
|
|
|
elif name == "key_after_touch": |
|
chan, note, vel = data |
|
fl.key_pressure(chan, note, vel) |
|
|
|
elif name == "channel_after_touch": |
|
chan, vel = data |
|
fl.channel_pressure(chan, vel) |
|
|
|
elif name == "pitch_wheel_change": |
|
chan, wheel = data |
|
fl.pitch_bend(chan, wheel) |
|
|
|
elif name == "song_position": |
|
|
|
pass |
|
|
|
elif name == "song_select": |
|
|
|
pass |
|
|
|
elif name == "tune_request": |
|
|
|
pass |
|
|
|
elif name in ("sysex_f0", "sysex_f7"): |
|
raw_bytes = data[0] |
|
fl.sysex(raw_bytes) |
|
|
|
|
|
elif name in ( |
|
"set_tempo", |
|
"end_track", |
|
"text_event", "text_event_08", "text_event_09", "text_event_0a", |
|
"text_event_0b", "text_event_0c", "text_event_0d", "text_event_0e", "text_event_0f", |
|
"copyright_text_event", "track_name", "instrument_name", |
|
"lyric", "marker", "cue_point", |
|
"smpte_offset", "time_signature", "key_signature", |
|
"sequencer_specific", "raw_meta_event" |
|
): |
|
if name == "set_tempo": |
|
tempo = data[0] |
|
|
|
continue |
|
|
|
else: |
|
|
|
continue |
|
|
|
|
|
fl.delete() |
|
|
|
        ss = np.concatenate(audio_chunks, axis=0) if audio_chunks else np.empty((0, 2), dtype=np.int16)

        if ss.size:
|
maxv = np.abs(ss).max() |
|
if maxv: |
|
ss = (ss / maxv) * np.iinfo(np.int16).max |
|
ss = ss.astype(np.int16) |
|
|
|
|
|
        if trim_silence and ss.size:
            # Trim trailing silence using a threshold relative to the
            # overall signal spread
            thresh = np.std(np.abs(ss)) * silence_threshold
            idx = np.where(np.abs(ss) > thresh)[0]
            if idx.size:
                ss = ss[: idx[-1] + 1]
|
|
|
|
|
if output_for_gradio: |
|
return ss |
|
|
|
|
|
        if not ss.size:
            return None

        ss = ss.T
        raw_audio = normalize_audio(ss, target_level_db=volume_level_db)
|
|
|
|
|
        if write_audio_to_WAV:
            # write_audio_to_WAV is used as the output file path here, since
            # this function receives an opus rather than a MIDI file name
            wav_name = write_audio_to_WAV if write_audio_to_WAV.endswith('.wav') else write_audio_to_WAV + '.wav'
            peak = np.max(np.abs(raw_audio))
            pcm = np.int16(raw_audio.T / peak * 32767) if peak > 0 else np.int16(raw_audio.T * 32767)
            with wave.open(wav_name, 'wb') as wf:
                wf.setframerate(sample_rate)
                wf.setsampwidth(2)
                wf.setnchannels(pcm.shape[1])
                wf.writeframes(pcm.tobytes())
|
|
|
return raw_audio |
|
|
|
else: |
|
return None |
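
# Example (illustrative): render an opus you have already parsed with
# MIDI.midi2opus, then play it in a notebook.
#
#   from IPython.display import display, Audio
#
#   with open('/content/input.mid', 'rb') as f:
#       opus = MIDI.midi2opus(f.read())
#
#   raw_audio = midi_opus_to_colab_audio(opus)
#   display(Audio(raw_audio, rate=16000, normalize=False))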
|
|
|
|
|
|
|
def midi_to_colab_audio(midi_file, |
|
soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2', |
|
sample_rate=16000, |
|
volume_level_db=-1, |
|
trim_silence=True, |
|
silence_threshold=0.1, |
|
output_for_gradio=False, |
|
write_audio_to_WAV=False |
|
): |
|
""" |
|
Returns raw audio to pass to IPython.disaply.Audio func |
|
|
|
Example usage: |
|
|
|
from IPython.display import Audio |
|
|
|
display(Audio(raw_audio, rate=16000, normalize=False)) |
|
""" |
|
|
|
|
|
if isinstance(midi_file, str): |
|
|
|
try: |
|
with open(midi_file, 'rb') as f: |
|
midi_bytes = f.read() |
|
except FileNotFoundError: |
|
print(f"Error: Could not find or open the file at {midi_file}") |
|
return None |
|
elif isinstance(midi_file, bytes): |
|
|
|
midi_bytes = midi_file |
|
else: |
|
        raise TypeError("midi_file must be a file path (str) or file content (bytes)")
|
|
|
|
|
ticks_per_beat, *tracks = MIDI.midi2opus(midi_bytes) |
|
if not tracks: |
|
return None |
|
|
|
|
|
events = [] |
|
for track in tracks: |
|
abs_t = 0 |
|
for name, dt, *data in track: |
|
abs_t += dt |
|
events.append([name, abs_t, *data]) |
|
events.sort(key=lambda e: e[1]) |
|
|
|
|
|
fl = fluidsynth.Synth(samplerate=float(sample_rate)) |
|
sfid = fl.sfload(soundfont_path) |
|
for chan in range(16): |
|
|
|
fl.program_select(chan, sfid, 128 if chan == 9 else 0, 0) |
|
|
|
|
|
    # Default tempo: 120 BPM = 500,000 microseconds per quarter note
    tempo = int((60 / 120) * 1e6)
|
last_t = 0 |
|
|
|
|
|
audio_chunks = [] |
|
|
|
for name, cur_t, *data in events: |
|
|
|
delta_ticks = cur_t - last_t |
|
last_t = cur_t |
|
dt_seconds = (delta_ticks / ticks_per_beat) * (tempo / 1e6) |
|
sample_len = int(dt_seconds * sample_rate) |
|
|
|
if sample_len > 0: |
|
buf = fl.get_samples(sample_len).reshape(-1, 2) |
|
|
|
audio_chunks.append(buf) |
|
|
|
|
|
if name == "note_on" and data[2] > 0: |
|
chan, note, vel = data |
|
fl.noteon(chan, note, vel) |
|
|
|
elif name == "note_off" or (name == "note_on" and data[2] == 0): |
|
chan, note = data[:2] |
|
fl.noteoff(chan, note) |
|
|
|
elif name == "patch_change": |
|
chan, patch = data[:2] |
|
bank = 128 if chan == 9 else 0 |
|
fl.program_select(chan, sfid, bank, patch) |
|
|
|
elif name == "control_change": |
|
chan, ctrl, val = data[:3] |
|
fl.cc(chan, ctrl, val) |
|
|
|
elif name == "key_after_touch": |
|
chan, note, vel = data |
|
fl.key_pressure(chan, note, vel) |
|
|
|
elif name == "channel_after_touch": |
|
chan, vel = data |
|
fl.channel_pressure(chan, vel) |
|
|
|
elif name == "pitch_wheel_change": |
|
chan, wheel = data |
|
fl.pitch_bend(chan, wheel) |
|
|
|
elif name == "song_position": |
|
|
|
pass |
|
|
|
elif name == "song_select": |
|
|
|
pass |
|
|
|
elif name == "tune_request": |
|
|
|
pass |
|
|
|
elif name in ("sysex_f0", "sysex_f7"): |
|
raw_bytes = data[0] |
|
fl.sysex(raw_bytes) |
|
|
|
|
|
elif name in ( |
|
"set_tempo", |
|
"end_track", |
|
"text_event", "text_event_08", "text_event_09", "text_event_0a", |
|
"text_event_0b", "text_event_0c", "text_event_0d", "text_event_0e", "text_event_0f", |
|
"copyright_text_event", "track_name", "instrument_name", |
|
"lyric", "marker", "cue_point", |
|
"smpte_offset", "time_signature", "key_signature", |
|
"sequencer_specific", "raw_meta_event" |
|
): |
|
if name == "set_tempo": |
|
tempo = data[0] |
|
|
|
continue |
|
|
|
else: |
|
|
|
continue |
|
|
|
|
|
|
|
    # Render a short tail so the final notes and reverb can decay
    tail_len_seconds = 2
    tail_buf = fl.get_samples(int(sample_rate * tail_len_seconds)).reshape(-1, 2)
|
audio_chunks.append(tail_buf) |
|
|
|
|
|
fl.delete() |
|
|
|
|
|
if not audio_chunks: |
|
return None |
|
ss = np.concatenate(audio_chunks, axis=0) |
|
|
|
|
|
|
|
    if trim_silence and ss.size:
        # Trim leading and trailing silence with a fixed threshold of 0.5%
        # of full scale (independent of the silence_threshold argument)
        dtype_max = np.iinfo(ss.dtype).max
        fixed_threshold = int(dtype_max * 0.005)
|
|
|
|
|
indices = np.where(np.abs(ss) > fixed_threshold)[0] |
|
if indices.size > 0: |
|
|
|
first_idx = indices[0] |
|
last_idx = indices[-1] |
|
ss = ss[first_idx : last_idx + 1] |
|
else: |
|
|
|
ss = np.empty((0, 2), dtype=ss.dtype) |
|
|
|
if ss.size: |
|
maxv = np.abs(ss).max() |
|
if maxv: |
|
ss = (ss / maxv) * np.iinfo(np.int16).max |
|
ss = ss.astype(np.int16) |
|
|
|
|
|
if output_for_gradio: |
|
return ss |
|
|
|
|
|
    if not ss.size:
        return None

    ss = ss.T
    raw_audio = normalize_audio(ss, target_level_db=volume_level_db)
|
|
|
|
|
if write_audio_to_WAV and isinstance(midi_file, str): |
|
wav_name = midi_file.rsplit('.', 1)[0] + '.wav' |
|
|
|
if np.max(np.abs(raw_audio)) > 0: |
|
pcm = np.int16(raw_audio.T / np.max(np.abs(raw_audio)) * 32767) |
|
else: |
|
pcm = np.int16(raw_audio.T * 32767) |
|
|
|
with wave.open(wav_name, 'wb') as wf: |
|
wf.setframerate(sample_rate) |
|
wf.setsampwidth(2) |
|
wf.setnchannels(pcm.shape[1]) |
|
wf.writeframes(pcm.tobytes()) |
|
|
|
return raw_audio |
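
# Minimal command-line sketch (illustrative): render the MIDI file passed as
# the first argument to a WAV file next to it, assuming the default
# FluidR3_GM soundfont is installed.
if __name__ == '__main__':
    import sys

    audio = midi_to_colab_audio(sys.argv[1], write_audio_to_WAV=True)

    if audio is not None:
        print('Rendered', audio.shape[1], 'samples per channel')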
|
|
|
|