r'''#===================================================================================================================
#
# MIDI to Colab Audio Python Module
#
# Converts any MIDI file to raw audio which is compatible
# with Google Colab or Hugging Face Gradio
#
# Version 2.0
#
# Includes full source code of MIDI and pyfluidsynth
#
# Original source code for all modules was retrieved on 07/31/2025
#
# Project Los Angeles
# Tegridy Code 2025
#
#===================================================================================================================
#
# Critical dependencies
#
# pip install numpy
# sudo apt install fluidsynth
#
#===================================================================================================================
#
# Example usage:
#
# from midi_to_colab_audio import midi_to_colab_audio
# from IPython.display import display, Audio
#
# raw_audio = midi_to_colab_audio('/content/input.mid')
#
# display(Audio(raw_audio, rate=16000, normalize=False))
#
#===================================================================================================================
'''
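
#===================================================================================================================
#
# A minimal Gradio-oriented sketch (the gr.Audio call below is an assumption
# about the caller's app, not part of this module's API):
#
# import gradio as gr
#
# raw_pcm = midi_to_colab_audio('/content/input.mid', output_for_gradio=True)
# gr.Audio(value=(16000, raw_pcm)) # Gradio accepts (sample_rate, int16 array) tuples
#
#===================================================================================================================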
import fluidsynth
from src import MIDI
#===============================================================================
import numpy as np
import wave
#===============================================================================
def normalize_audio(audio: np.ndarray,
                    method: str = 'peak',
                    target_level_db: float = -1.0,
                    per_channel: bool = False,
                    eps: float = 1e-9
                    ) -> np.ndarray:

    """
    Normalize audio to a target dBFS level.

    Parameters
    ----------
    audio : np.ndarray
        Float-valued array in range [-1, 1] with shape (channels, samples)
        or (samples,) for mono.
    method : {'peak', 'rms'}
        - 'peak': scale so that max(|audio|) equals the linear target level
        - 'rms' : scale so that RMS(audio) equals the linear target level
    target_level_db : float
        Desired output level, in dBFS (0 dBFS = max digital full scale).
        e.g. -1.0 dBFS corresponds to ~0.8913 linear gain.
    per_channel : bool
        If True, normalize each channel independently. Otherwise, use a
        global measure across all channels.
    eps : float
        Small constant to avoid division by zero.

    Returns
    -------
    normalized : np.ndarray
        Audio array of the same shape, scaled so that levels meet the target.
    """

    # Convert target dB to linear gain: 10 ** (dB / 20), e.g. -1.0 dB -> ~0.8913
    target_lin = 10 ** (target_level_db / 20.0)

    # Ensure audio is float
    audio = audio.astype(np.float32)

    # If mono, make it (1, N)
    if audio.ndim == 1:
        audio = audio[np.newaxis, :]

    # Choose measurement axis: per channel (axis=1) or global (axis=None)
    axis = 1 if per_channel else None

    if method == 'peak':
        # Compute peak per channel or global
        peak = np.max(np.abs(audio), axis=axis, keepdims=True)
        peak = np.maximum(peak, eps)
        scales = target_lin / peak

    elif method == 'rms':
        # Compute RMS per channel or global
        rms = np.sqrt(np.mean(audio ** 2, axis=axis, keepdims=True))
        rms = np.maximum(rms, eps)
        scales = target_lin / rms

    else:
        raise ValueError(f"Unsupported method '{method}'; choose 'peak' or 'rms'.")

    # Broadcast scales back to audio shape
    normalized = audio * scales

    # Clip just in case of rounding
    return np.clip(normalized, -1.0, 1.0)
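
# A minimal usage sketch for normalize_audio (the random buffer below is a
# placeholder input, not part of this module):
#
# stereo = np.random.uniform(-0.5, 0.5, (2, 16000)).astype(np.float32)
# peaked = normalize_audio(stereo, method='peak', target_level_db=-1.0)
# leveled = normalize_audio(stereo, method='rms', target_level_db=-20.0, per_channel=True)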
#===============================================================================
def midi_opus_to_colab_audio(midi_opus,
                             soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2',
                             sample_rate=16000, # 44100
                             volume_level_db=-1,
                             trim_silence=True,
                             silence_threshold=0.1,
                             output_for_gradio=False,
                             write_audio_to_WAV=''
                             ):
    if midi_opus[1]:

        ticks_per_beat, *tracks = midi_opus
        if not tracks:
            return None

        # Flatten & convert delta-times to absolute-time
        events = []
        for track in tracks:
            abs_t = 0
            for name, dt, *data in track:
                abs_t += dt
                events.append([name, abs_t, *data])
        events.sort(key=lambda e: e[1])

        # Setup FluidSynth
        fl = fluidsynth.Synth(samplerate=float(sample_rate))
        sfid = fl.sfload(soundfont_path)
        for chan in range(16):
            # channel 9 = percussion (GM bank 128)
            fl.program_select(chan, sfid, 128 if chan == 9 else 0, 0)

        # Playback vars
        tempo = int((60 / 120) * 1e6) # default 120 bpm = 500000 microseconds per quarter note
        last_t = 0
        ss = np.empty((0, 2), dtype=np.int16)

        for name, cur_t, *data in events:
            # Compute how many samples have passed since the last event
            delta_ticks = cur_t - last_t
            last_t = cur_t
            dt_seconds = (delta_ticks / ticks_per_beat) * (tempo / 1e6)
            sample_len = int(dt_seconds * sample_rate)
            if sample_len > 0:
                buf = fl.get_samples(sample_len).reshape(-1, 2)
                ss = np.concatenate([ss, buf], axis=0)

            # Dispatch every known event
            if name == "note_on" and data[2] > 0:
                chan, note, vel = data
                fl.noteon(chan, note, vel)
            elif name == "note_off" or (name == "note_on" and data[2] == 0):
                chan, note = data[:2]
                fl.noteoff(chan, note)
            elif name == "patch_change":
                chan, patch = data[:2]
                bank = 128 if chan == 9 else 0
                fl.program_select(chan, sfid, bank, patch)
            elif name == "control_change":
                chan, ctrl, val = data[:3]
                fl.cc(chan, ctrl, val)
            elif name == "key_after_touch":
                chan, note, vel = data
                fl.key_pressure(chan, note, vel)
            elif name == "channel_after_touch":
                chan, vel = data
                fl.channel_pressure(chan, vel)
            elif name == "pitch_wheel_change":
                chan, wheel = data
                fl.pitch_bend(chan, wheel)
            elif name == "song_position":
                # song_pos = data[0] # often not needed for playback
                pass
            elif name == "song_select":
                # song_number = data[0]
                pass
            elif name == "tune_request":
                # typically resets tuning; FluidSynth handles this internally
                pass
            elif name in ("sysex_f0", "sysex_f7"):
                raw_bytes = data[0]
                fl.sysex(raw_bytes)
            # Meta events & others have no direct audio effect, so we skip or log them
            elif name in (
                "set_tempo", # handled below
                "end_track",
                "text_event", "text_event_08", "text_event_09", "text_event_0a",
                "text_event_0b", "text_event_0c", "text_event_0d", "text_event_0e", "text_event_0f",
                "copyright_text_event", "track_name", "instrument_name",
                "lyric", "marker", "cue_point",
                "smpte_offset", "time_signature", "key_signature",
                "sequencer_specific", "raw_meta_event"
            ):
                if name == "set_tempo":
                    tempo = data[0]
                # else: skip all other meta & text; you could hook in logging here
                continue
            else:
                # unknown event type
                continue

        # Cleanup synth
        fl.delete()
        if ss.size:
            maxv = np.abs(ss).max()
            if maxv:
                ss = (ss / maxv) * np.iinfo(np.int16).max
                ss = ss.astype(np.int16)

        # Optional trimming of trailing silence
        if trim_silence and ss.size:
            thresh = np.std(np.abs(ss)) * silence_threshold
            idx = np.where(np.abs(ss) > thresh)[0]
            if idx.size:
                ss = ss[: idx[-1] + 1]

        # For Gradio you might want raw int16 PCM
        if output_for_gradio:
            return ss

        # Swap to (channels, samples) and normalize for playback
        ss = ss.T
        raw_audio = normalize_audio(ss, target_level_db=volume_level_db)
        # Optionally write WAV to disk
        if write_audio_to_WAV:
            # write_audio_to_WAV is used as the output file path here
            wav_name = write_audio_to_WAV.rsplit('.', 1)[0] + '.wav'
            # raw_audio is float; convert back to int16 PCM for the WAV container,
            # guarding against division by zero for all-silent audio
            peak = np.max(np.abs(raw_audio))
            if peak > 0:
                pcm = np.int16(raw_audio.T / peak * 32767)
            else:
                pcm = np.int16(raw_audio.T * 32767)
            with wave.open(wav_name, 'wb') as wf:
                wf.setframerate(sample_rate)
                wf.setsampwidth(2)
                wf.setnchannels(pcm.shape[1])
                wf.writeframes(pcm.tobytes())

        return raw_audio

    else:
        return None
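
# A minimal sketch of the opus-level entry point (the input path below is a
# placeholder, not part of this module):
#
# midi_bytes = open('/content/input.mid', 'rb').read()
# raw_audio = midi_opus_to_colab_audio(MIDI.midi2opus(midi_bytes), sample_rate=16000)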
#===============================================================================
def midi_to_colab_audio(midi_file,
                        soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2',
                        sample_rate=16000,
                        volume_level_db=-1,
                        trim_silence=True,
                        silence_threshold=0.1,
                        output_for_gradio=False,
                        write_audio_to_WAV=False
                        ):

    """
    Returns raw audio to pass to the IPython.display.Audio function.

    Example usage:

    from IPython.display import display, Audio

    display(Audio(raw_audio, rate=16000, normalize=False))
    """
    # Check if midi_file is a path (string) or file content (bytes)
    if isinstance(midi_file, str):
        # It's a file path; open and read it.
        try:
            with open(midi_file, 'rb') as f:
                midi_bytes = f.read()
        except FileNotFoundError:
            print(f"Error: Could not find or open the file at {midi_file}")
            return None # Or handle the error appropriately
    elif isinstance(midi_file, bytes):
        # It's already the file content.
        midi_bytes = midi_file
    else:
        raise TypeError("midi_file must be a file path (str) or file content (bytes)")

    # Read and decode the MIDI bytes into an opus event list
    ticks_per_beat, *tracks = MIDI.midi2opus(midi_bytes)
    if not tracks:
        return None
    # Flatten & convert delta-times to absolute-time
    events = []
    for track in tracks:
        abs_t = 0
        for name, dt, *data in track:
            abs_t += dt
            events.append([name, abs_t, *data])
    events.sort(key=lambda e: e[1])

    # Setup FluidSynth
    fl = fluidsynth.Synth(samplerate=float(sample_rate))
    sfid = fl.sfload(soundfont_path)
    for chan in range(16):
        # channel 9 = percussion (GM bank 128)
        fl.program_select(chan, sfid, 128 if chan == 9 else 0, 0)

    # Playback vars
    tempo = int((60 / 120) * 1e6) # default 120 bpm = 500000 microseconds per quarter note
    last_t = 0

    # Initialize a Python list to store audio chunks
    audio_chunks = []

    for name, cur_t, *data in events:
        # Compute how many samples have passed since the last event
        delta_ticks = cur_t - last_t
        last_t = cur_t
        dt_seconds = (delta_ticks / ticks_per_beat) * (tempo / 1e6)
        sample_len = int(dt_seconds * sample_rate)
        if sample_len > 0:
            buf = fl.get_samples(sample_len).reshape(-1, 2)
            # Append the audio chunk to the list
            audio_chunks.append(buf)

        # Dispatch every known event
        if name == "note_on" and data[2] > 0:
            chan, note, vel = data
            fl.noteon(chan, note, vel)
        elif name == "note_off" or (name == "note_on" and data[2] == 0):
            chan, note = data[:2]
            fl.noteoff(chan, note)
        elif name == "patch_change":
            chan, patch = data[:2]
            bank = 128 if chan == 9 else 0
            fl.program_select(chan, sfid, bank, patch)
        elif name == "control_change":
            chan, ctrl, val = data[:3]
            fl.cc(chan, ctrl, val)
        elif name == "key_after_touch":
            chan, note, vel = data
            fl.key_pressure(chan, note, vel)
        elif name == "channel_after_touch":
            chan, vel = data
            fl.channel_pressure(chan, vel)
        elif name == "pitch_wheel_change":
            chan, wheel = data
            fl.pitch_bend(chan, wheel)
        elif name == "song_position":
            # song_pos = data[0] # often not needed for playback
            pass
        elif name == "song_select":
            # song_number = data[0]
            pass
        elif name == "tune_request":
            # typically resets tuning; FluidSynth handles this internally
            pass
        elif name in ("sysex_f0", "sysex_f7"):
            raw_bytes = data[0]
            fl.sysex(raw_bytes)
        # Meta events & others have no direct audio effect, so we skip or log them
        elif name in (
            "set_tempo", # handled below
            "end_track",
            "text_event", "text_event_08", "text_event_09", "text_event_0a",
            "text_event_0b", "text_event_0c", "text_event_0d", "text_event_0e", "text_event_0f",
            "copyright_text_event", "track_name", "instrument_name",
            "lyric", "marker", "cue_point",
            "smpte_offset", "time_signature", "key_signature",
            "sequencer_specific", "raw_meta_event"
        ):
            if name == "set_tempo":
                tempo = data[0]
            # else: skip all other meta & text; you could hook in logging here
            continue
        else:
            # unknown event type
            continue

    # Render a short tail so the last notes can decay naturally.
    # We render an extra 2 seconds of audio; a shorter time like 1 second might be sufficient.
    tail_len_seconds = 2
    tail_buf = fl.get_samples(int(sample_rate * tail_len_seconds)).reshape(-1, 2)
    audio_chunks.append(tail_buf)

    # Cleanup synth
    fl.delete()

    # After the loop finishes, concatenate all audio chunks in a single operation
    if not audio_chunks:
        return None # No audio was generated

    ss = np.concatenate(audio_chunks, axis=0)
    # Optimized silence-trimming logic
    if trim_silence and ss.size:
        # Use a fixed amplitude threshold based on the data type's max value.
        # This is more robust than a standard-deviation threshold for trimming the tail.
        dtype_max = np.iinfo(ss.dtype).max
        fixed_threshold = int(dtype_max * 0.005) # 0.5% of max amplitude
        # Find the first and last samples exceeding the threshold.
        indices = np.where(np.abs(ss) > fixed_threshold)[0]
        if indices.size > 0:
            # Trim from the start as well, in case of leading silence
            first_idx = indices[0]
            last_idx = indices[-1]
            ss = ss[first_idx : last_idx + 1]
        else:
            # All silence: nothing to normalize or play back
            return None

    if ss.size:
        maxv = np.abs(ss).max()
        if maxv:
            ss = (ss / maxv) * np.iinfo(np.int16).max
            ss = ss.astype(np.int16)
    # For Gradio you might want raw int16 PCM
    if output_for_gradio:
        return ss

    # Swap to (channels, samples) and normalize for playback
    ss = ss.T
    raw_audio = normalize_audio(ss, target_level_db=volume_level_db)

    # Optionally write WAV to disk
    if write_audio_to_WAV and isinstance(midi_file, str):
        wav_name = midi_file.rsplit('.', 1)[0] + '.wav'
        # Note: raw_audio is float and needs conversion back to int16 for the WAV format.
        if np.max(np.abs(raw_audio)) > 0:
            pcm = np.int16(raw_audio.T / np.max(np.abs(raw_audio)) * 32767)
        else:
            pcm = np.int16(raw_audio.T * 32767)
        with wave.open(wav_name, 'wb') as wf:
            wf.setframerate(sample_rate)
            wf.setsampwidth(2)
            wf.setnchannels(pcm.shape[1])
            wf.writeframes(pcm.tobytes())

    return raw_audio
#===================================================================================================================
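
if __name__ == '__main__':
    # A minimal smoke-test sketch: the input path below is a placeholder, and the
    # default FluidR3_GM soundfont is assumed to be installed (see dependencies above).
    audio = midi_to_colab_audio('/content/input.mid', write_audio_to_WAV=True)
    if audio is not None:
        print('Rendered audio with shape', audio.shape)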