r'''#=================================================================================================================== # # MIDI to Colab AUdio Python Module # # Converts any MIDI file to raw audio which is compatible # with Google Colab or HUgging Face Gradio # # Version 2.0 # # Includes full source code of MIDI and pyfluidsynth # # Original source code for all modules was retrieved on 07/31/2025 # # Project Los Angeles # Tegridy Code 2025 # #=================================================================================================================== # # Critical dependencies # # pip install numpy # sudo apt install fluidsynth # #=================================================================================================================== # # Example usage: # # from midi_to_colab_audio import midi_to_colab_audio # from IPython.display import display, Audio # # raw_audio = midi_to_colab_audio('/content/input.mid') # # display(Audio(raw_audio, rate=16000, normalize=False)) # #=================================================================================================================== ''' import fluidsynth from src import MIDI #=============================================================================== import numpy as np import wave #=============================================================================== def normalize_audio(audio: np.ndarray, method: str = 'peak', target_level_db: float = -1.0, per_channel: bool = False, eps: float = 1e-9 ) -> np.ndarray: """ Normalize audio to a target dBFS level. Parameters ---------- audio : np.ndarray Float-valued array in range [-1, 1] with shape (channels, samples) or (samples,) for mono. method : {'peak', 'rms'} - 'peak': scale so that max(|audio|) = target_level_lin - 'rms' : scale so that RMS(audio) = target_level_lin target_level_db : float Desired output level, in dBFS (0 dBFS = max digital full scale). e.g. -1.0 dBFS means ~0.8913 linear gain. per_channel : bool If True, normalize each channel independently. Otherwise, use a global measure across all channels. eps : float Small constant to avoid division by zero. Returns ------- normalized : np.ndarray Audio array of same shape, scaled so that levels meet the target. """ # Convert target dB to linear gain target_lin = 10 ** (target_level_db / 20.0) # Ensure audio is float audio = audio.astype(np.float32) # if mono, make it (1, N) if audio.ndim == 1: audio = audio[np.newaxis, :] # Choose measurement axis axis = 1 if per_channel else None if method == 'peak': # Compute peak per channel or global peak = np.max(np.abs(audio), axis=axis, keepdims=True) peak = np.maximum(peak, eps) scales = target_lin / peak elif method == 'rms': # Compute RMS per channel or global rms = np.sqrt(np.mean(audio ** 2, axis=axis, keepdims=True)) rms = np.maximum(rms, eps) scales = target_lin / rms else: raise ValueError(f"Unsupported method '{method}'; choose 'peak' or 'rms'.") # Broadcast scales back to audio shape normalized = audio * scales # Clip just in case of rounding return np.clip(normalized, -1.0, 1.0) #=============================================================================== def midi_opus_to_colab_audio(midi_opus, soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2', sample_rate=16000, # 44100 volume_level_db=-1, trim_silence=True, silence_threshold=0.1, output_for_gradio=False, write_audio_to_WAV='' ): if midi_opus[1]: ticks_per_beat, *tracks = midi_opus if not tracks: return None # Flatten & convert delta-times to absolute-time events = [] for track in tracks: abs_t = 0 for name, dt, *data in track: abs_t += dt events.append([name, abs_t, *data]) events.sort(key=lambda e: e[1]) # Setup FluidSynth fl = fluidsynth.Synth(samplerate=float(sample_rate)) sfid = fl.sfload(soundfont_path) for chan in range(16): # channel 9 = percussion GM bank 128 fl.program_select(chan, sfid, 128 if chan == 9 else 0, 0) # Playback vars tempo = int((60 / 120) * 1e6) # default 120bpm last_t = 0 ss = np.empty((0, 2), dtype=np.int16) for name, cur_t, *data in events: # compute how many samples have passed since the last event delta_ticks = cur_t - last_t last_t = cur_t dt_seconds = (delta_ticks / ticks_per_beat) * (tempo / 1e6) sample_len = int(dt_seconds * sample_rate) if sample_len > 0: buf = fl.get_samples(sample_len).reshape(-1, 2) ss = np.concatenate([ss, buf], axis=0) # Dispatch every known event if name == "note_on" and data[2] > 0: chan, note, vel = data fl.noteon(chan, note, vel) elif name == "note_off" or (name == "note_on" and data[2] == 0): chan, note = data[:2] fl.noteoff(chan, note) elif name == "patch_change": chan, patch = data[:2] bank = 128 if chan == 9 else 0 fl.program_select(chan, sfid, bank, patch) elif name == "control_change": chan, ctrl, val = data[:3] fl.cc(chan, ctrl, val) elif name == "key_after_touch": chan, note, vel = data fl.key_pressure(chan, note, vel) elif name == "channel_after_touch": chan, vel = data fl.channel_pressure(chan, vel) elif name == "pitch_wheel_change": chan, wheel = data fl.pitch_bend(chan, wheel) elif name == "song_position": # song_pos = data[0]; # often not needed for playback pass elif name == "song_select": # song_number = data[0] pass elif name == "tune_request": # typically resets tuning; FS handles internally pass elif name in ("sysex_f0", "sysex_f7"): raw_bytes = data[0] fl.sysex(raw_bytes) # Meta events & others—no direct audio effect, so we skip or log elif name in ( "set_tempo", # handled below "end_track", "text_event", "text_event_08", "text_event_09", "text_event_0a", "text_event_0b", "text_event_0c", "text_event_0d", "text_event_0e", "text_event_0f", "copyright_text_event", "track_name", "instrument_name", "lyric", "marker", "cue_point", "smpte_offset", "time_signature", "key_signature", "sequencer_specific", "raw_meta_event" ): if name == "set_tempo": tempo = data[0] # else: skip all other meta & text; you could hook in logging here continue else: # unknown event type continue # Cleanup synth fl.delete() if ss.size: maxv = np.abs(ss).max() if maxv: ss = (ss / maxv) * np.iinfo(np.int16).max ss = ss.astype(np.int16) # Optional trimming of trailing silence if trim_silence and ss.size: thresh = np.std(np.abs(ss)) * silence_threshold idx = np.where(np.abs(ss) > thresh)[0] if idx.size: ss = ss[: idx[-1] + 1] # For Gradio you might want raw int16 PCM if output_for_gradio: return ss # Swap to (channels, samples) and normalize for playback ss = ss.T raw_audio = normalize_audio(ss, target_level_db=volume_level_db) # Optionally write WAV to disk if write_audio_to_WAV: wav_name = midi_file.rsplit('.', 1)[0] + '.wav' pcm = np.int16(raw_audio.T / np.max(np.abs(raw_audio)) * 32767) with wave.open(wav_name, 'wb') as wf: wf.setframerate(sample_rate) wf.setsampwidth(2) wf.setnchannels(pcm.shape[1]) wf.writeframes(pcm.tobytes()) return raw_audio else: return None #=============================================================================== def midi_to_colab_audio(midi_file, soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2', sample_rate=16000, volume_level_db=-1, trim_silence=True, silence_threshold=0.1, output_for_gradio=False, write_audio_to_WAV=False ): """ Returns raw audio to pass to IPython.disaply.Audio func Example usage: from IPython.display import Audio display(Audio(raw_audio, rate=16000, normalize=False)) """ # Check if midi_input is a path (string) or file content (bytes) if isinstance(midi_file, str): # It's a file path, open and read it. try: with open(midi_file, 'rb') as f: midi_bytes = f.read() except FileNotFoundError: print(f"Error: Could not find or open the file at {midi_file}") return None # Or handle the error appropriately elif isinstance(midi_file, bytes): # It's already the file content. midi_bytes = midi_file else: raise TypeError("midi_input must be a file path (str) or file content (bytes)") # Read and decode MIDI → opus event list from bytes ticks_per_beat, *tracks = MIDI.midi2opus(midi_bytes) if not tracks: return None # Flatten & convert delta-times to absolute-time events = [] for track in tracks: abs_t = 0 for name, dt, *data in track: abs_t += dt events.append([name, abs_t, *data]) events.sort(key=lambda e: e[1]) # Setup FluidSynth fl = fluidsynth.Synth(samplerate=float(sample_rate)) sfid = fl.sfload(soundfont_path) for chan in range(16): # channel 9 = percussion GM bank 128 fl.program_select(chan, sfid, 128 if chan == 9 else 0, 0) # Playback vars tempo = int((60 / 120) * 1e6) # default 120bpm last_t = 0 # Initialize a Python list to store audio chunks audio_chunks = [] for name, cur_t, *data in events: # compute how many samples have passed since the last event delta_ticks = cur_t - last_t last_t = cur_t dt_seconds = (delta_ticks / ticks_per_beat) * (tempo / 1e6) sample_len = int(dt_seconds * sample_rate) if sample_len > 0: buf = fl.get_samples(sample_len).reshape(-1, 2) # Append the audio chunk to the list audio_chunks.append(buf) # Dispatch every known event if name == "note_on" and data[2] > 0: chan, note, vel = data fl.noteon(chan, note, vel) elif name == "note_off" or (name == "note_on" and data[2] == 0): chan, note = data[:2] fl.noteoff(chan, note) elif name == "patch_change": chan, patch = data[:2] bank = 128 if chan == 9 else 0 fl.program_select(chan, sfid, bank, patch) elif name == "control_change": chan, ctrl, val = data[:3] fl.cc(chan, ctrl, val) elif name == "key_after_touch": chan, note, vel = data fl.key_pressure(chan, note, vel) elif name == "channel_after_touch": chan, vel = data fl.channel_pressure(chan, vel) elif name == "pitch_wheel_change": chan, wheel = data fl.pitch_bend(chan, wheel) elif name == "song_position": # song_pos = data[0]; # often not needed for playback pass elif name == "song_select": # song_number = data[0] pass elif name == "tune_request": # typically resets tuning; FS handles internally pass elif name in ("sysex_f0", "sysex_f7"): raw_bytes = data[0] fl.sysex(raw_bytes) # Meta events & others—no direct audio effect, so we skip or log elif name in ( "set_tempo", # handled below "end_track", "text_event", "text_event_08", "text_event_09", "text_event_0a", "text_event_0b", "text_event_0c", "text_event_0d", "text_event_0e", "text_event_0f", "copyright_text_event", "track_name", "instrument_name", "lyric", "marker", "cue_point", "smpte_offset", "time_signature", "key_signature", "sequencer_specific", "raw_meta_event" ): if name == "set_tempo": tempo = data[0] # else: skip all other meta & text; you could hook in logging here continue else: # unknown event type continue # This captures the sound of the last notes, allowing them to decay naturally. # We render an extra 2 seconds of audio. A shorter time like 1 second might be sufficient. tail_len_seconds = 2 tail_buf = fl.get_samples(int(sample_rate * tail_len_seconds)).reshape(-1, 2) audio_chunks.append(tail_buf) # Cleanup synth fl.delete() # After the loop finishes, concatenate all audio chunks in a single operation if not audio_chunks: return None # No audio was generated ss = np.concatenate(audio_chunks, axis=0) # Optimized silence trimming logic if trim_silence and ss.size: # Using a fixed amplitude threshold based on the data type's max value. # This is more robust than using standard deviation for trimming the tail. dtype_max = np.iinfo(ss.dtype).max fixed_threshold = int(dtype_max * 0.005) # 0.5% of max amplitude # Find the first and last samples exceeding the threshold. indices = np.where(np.abs(ss) > fixed_threshold)[0] if indices.size > 0: # We trim from the start as well in case of leading silence first_idx = indices[0] last_idx = indices[-1] ss = ss[first_idx : last_idx + 1] else: # If it's all silence, return an empty array. ss = np.empty((0, 2), dtype=ss.dtype) if ss.size: maxv = np.abs(ss).max() if maxv: ss = (ss / maxv) * np.iinfo(np.int16).max ss = ss.astype(np.int16) # For Gradio you might want raw int16 PCM if output_for_gradio: return ss # Swap to (channels, samples) and normalize for playback ss = ss.T raw_audio = normalize_audio(ss, target_level_db=volume_level_db) # Optionally write WAV to disk if write_audio_to_WAV and isinstance(midi_file, str): wav_name = midi_file.rsplit('.', 1)[0] + '.wav' # Note: raw_audio is float, needs conversion back to int16 for WAV format. if np.max(np.abs(raw_audio)) > 0: pcm = np.int16(raw_audio.T / np.max(np.abs(raw_audio)) * 32767) else: pcm = np.int16(raw_audio.T * 32767) with wave.open(wav_name, 'wb') as wf: wf.setframerate(sample_rate) wf.setsampwidth(2) wf.setnchannels(pcm.shape[1]) wf.writeframes(pcm.tobytes()) return raw_audio #===================================================================================================================