r'''#===================================================================================================================
#
# MIDI to Colab Audio Python Module
#
# Converts any MIDI file to raw audio compatible
# with Google Colab or Hugging Face Gradio
#
# Version 2.0
#
# Includes full source code of MIDI and pyfluidsynth
# 
# Original source code for all modules was retrieved on 07/31/2025
#
# Project Los Angeles
# Tegridy Code 2025
#
#===================================================================================================================
#
# Critical dependencies
#
# pip install numpy
# sudo apt install fluidsynth
#
#===================================================================================================================
# 
# Example usage:
#
# from midi_to_colab_audio import midi_to_colab_audio
# from IPython.display import display, Audio
#
# raw_audio = midi_to_colab_audio('/content/input.mid')
#
# display(Audio(raw_audio, rate=16000, normalize=False))
#
#===================================================================================================================
'''

import fluidsynth
from src import MIDI

#===============================================================================

import numpy as np
import wave

#===============================================================================

def normalize_audio(audio: np.ndarray,
                    method: str = 'peak',
                    target_level_db: float = -1.0,
                    per_channel: bool = False,
                    eps: float = 1e-9
                   ) -> np.ndarray:
    
    """
    Normalize audio to a target dBFS level.

    Parameters
    ----------
    audio : np.ndarray
        Float-valued array in range [-1, 1] with shape (channels, samples)
        or (samples,) for mono.
    method : {'peak', 'rms'}
        - 'peak': scale so that max(|audio|) equals the linear target level
        - 'rms' : scale so that RMS(audio) equals the linear target level
    target_level_db : float
        Desired output level, in dBFS (0 dBFS = max digital full scale).
        e.g. -1.0 dBFS means ~0.8913 linear gain.
    per_channel : bool
        If True, normalize each channel independently. Otherwise, use a
        global measure across all channels.
    eps : float
        Small constant to avoid division by zero.

    Returns
    -------
    normalized : np.ndarray
        Audio array scaled so that levels meet the target. Mono input is
        returned with shape (1, samples).
    """
    
    # Convert target dB to linear gain
    target_lin = 10 ** (target_level_db / 20.0)

    # Ensure audio is float
    audio = audio.astype(np.float32)

    # if mono, make it (1, N)
    if audio.ndim == 1:
        audio = audio[np.newaxis, :]

    # Choose measurement axis
    axis = 1 if per_channel else None

    if method == 'peak':
        # Compute peak per channel or global
        peak = np.max(np.abs(audio), axis=axis, keepdims=True)
        peak = np.maximum(peak, eps)
        scales = target_lin / peak

    elif method == 'rms':
        # Compute RMS per channel or global
        rms = np.sqrt(np.mean(audio ** 2, axis=axis, keepdims=True))
        rms = np.maximum(rms, eps)
        scales = target_lin / rms

    else:
        raise ValueError(f"Unsupported method '{method}'; choose 'peak' or 'rms'.")

    # Broadcast scales back to audio shape
    normalized = audio * scales

    # Clip just in case of rounding
    return np.clip(normalized, -1.0, 1.0)
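
# Example usage of normalize_audio (a sketch with synthetic data; the 1 kHz
# test tone below is illustrative and not part of the module):
#
#   import numpy as np
#
#   t = np.linspace(0, 1, 16000, endpoint=False)
#   quiet_tone = 0.1 * np.sin(2 * np.pi * 1000 * t)   # mono, peak amplitude 0.1
#
#   peaked = normalize_audio(quiet_tone, method='peak', target_level_db=-1.0)
#   # max(|peaked|) is now ~0.891, i.e. -1 dBFS
#
#   leveled = normalize_audio(quiet_tone, method='rms', target_level_db=-20.0)
#   # RMS(leveled) is now ~0.1, i.e. -20 dBFS
#
# Note that mono input comes back with shape (1, 16000).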

#===============================================================================

def midi_opus_to_colab_audio(midi_opus, 
                              soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2', 
                              sample_rate=16000, # 44100
                              volume_level_db=-1,
                              trim_silence=True,
                              silence_threshold=0.1,
                              output_for_gradio=False,
                              write_audio_to_WAV=''
                              ):

    if len(midi_opus) > 1 and midi_opus[1]:

        ticks_per_beat, *tracks = midi_opus
        if not tracks:
            return None
    
        # Flatten & convert delta-times to absolute-time
        events = []
        for track in tracks:
            abs_t = 0
            for name, dt, *data in track:
                abs_t += dt
                events.append([name, abs_t, *data])
        events.sort(key=lambda e: e[1])
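        # Python's sort is stable, so simultaneous events keep their
        # original track order after the merge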
    
        # Setup FluidSynth
        fl = fluidsynth.Synth(samplerate=float(sample_rate))
        sfid = fl.sfload(soundfont_path)
        for chan in range(16):
            # channel 9 = percussion GM bank 128
            fl.program_select(chan, sfid, 128 if chan == 9 else 0, 0)
    
        # Playback vars
        tempo = int((60 / 120) * 1e6)  # default 120bpm
        last_t = 0
        ss = np.empty((0, 2), dtype=np.int16)
    
        for name, cur_t, *data in events:
            # compute how many samples have passed since the last event
            delta_ticks = cur_t - last_t
            last_t = cur_t
            dt_seconds = (delta_ticks / ticks_per_beat) * (tempo / 1e6)
            sample_len = int(dt_seconds * sample_rate)
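            # e.g. with the default tempo of 500,000 us per beat (120 bpm) and
            # ticks_per_beat = 480, a delta of 480 ticks is exactly 0.5 s,
            # i.e. 8,000 samples at a 16 kHz sample rate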
            if sample_len > 0:
                buf = fl.get_samples(sample_len).reshape(-1, 2)
                ss = np.concatenate([ss, buf], axis=0)
    
            # Dispatch every known event
            if name == "note_on" and data[2] > 0:
                chan, note, vel = data
                fl.noteon(chan, note, vel)
    
            elif name == "note_off" or (name == "note_on" and data[2] == 0):
                chan, note = data[:2]
                fl.noteoff(chan, note)
    
            elif name == "patch_change":
                chan, patch = data[:2]
                bank = 128 if chan == 9 else 0
                fl.program_select(chan, sfid, bank, patch)
    
            elif name == "control_change":
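                # common controllers: 7 = channel volume, 10 = pan, 64 = sustain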
                chan, ctrl, val = data[:3]
                fl.cc(chan, ctrl, val)
    
            elif name == "key_after_touch":
                chan, note, vel = data
                fl.key_pressure(chan, note, vel)
    
            elif name == "channel_after_touch":
                chan, vel = data
                fl.channel_pressure(chan, vel)
    
            elif name == "pitch_wheel_change":
                chan, wheel = data
                fl.pitch_bend(chan, wheel)
    
            elif name == "song_position":
                # song_pos = data[0];  # often not needed for playback
                pass
    
            elif name == "song_select":
                # song_number = data[0]
                pass
    
            elif name == "tune_request":
                # typically resets tuning; FS handles internally
                pass
    
            elif name in ("sysex_f0", "sysex_f7"):
                raw_bytes = data[0]
                fl.sysex(raw_bytes)
    
            # Meta events & others—no direct audio effect, so we skip or log
            elif name in (
                "set_tempo",       # handled below
                "end_track",
                "text_event", "text_event_08", "text_event_09", "text_event_0a",
                "text_event_0b", "text_event_0c", "text_event_0d", "text_event_0e", "text_event_0f",
                "copyright_text_event", "track_name", "instrument_name",
                "lyric", "marker", "cue_point",
                "smpte_offset", "time_signature", "key_signature",
                "sequencer_specific", "raw_meta_event"
            ):
                if name == "set_tempo":
                    tempo = data[0]
                # else: skip all other meta & text; you could hook in logging here
                continue
    
            else:
                # unknown event type
                continue
    
        # Render a short tail so the last notes can decay naturally
        # (mirrors the tail rendering in midi_to_colab_audio below)
        tail_buf = fl.get_samples(int(sample_rate * 2)).reshape(-1, 2)
        ss = np.concatenate([ss, tail_buf], axis=0)

        # Cleanup synth
        fl.delete()
    
        if ss.size:
            maxv = np.abs(ss).max()
            if maxv:
                ss = (ss / maxv) * np.iinfo(np.int16).max
        ss = ss.astype(np.int16)
    
        # Optional trimming of trailing silence
        if trim_silence and ss.size:
            thresh = np.std(np.abs(ss)) * silence_threshold
            idx = np.where(np.abs(ss) > thresh)[0]
            if idx.size:
                ss = ss[: idx[-1] + 1]
    
        # For Gradio you might want raw int16 PCM
        if output_for_gradio:
            return ss
    
        # Swap to (channels, samples) and normalize for playback
        ss = ss.T
        raw_audio = normalize_audio(ss, target_level_db=volume_level_db)
    
        # Optionally write WAV to disk (write_audio_to_WAV is the output file path)
        if write_audio_to_WAV:
            peak = np.max(np.abs(raw_audio))
            pcm = np.int16(raw_audio.T / peak * 32767) if peak > 0 else np.int16(raw_audio.T)
            with wave.open(write_audio_to_WAV, 'wb') as wf:
                wf.setframerate(sample_rate)
                wf.setsampwidth(2)
                wf.setnchannels(pcm.shape[1])
                wf.writeframes(pcm.tobytes())
    
        return raw_audio
  
    else:
        return None

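# Example usage of midi_opus_to_colab_audio (a sketch; assumes a MIDI file at
# '/content/input.mid' and the default FluidR3_GM soundfont are installed):
#
#   with open('/content/input.mid', 'rb') as f:
#       opus = MIDI.midi2opus(f.read())
#
#   raw_audio = midi_opus_to_colab_audio(opus, sample_rate=16000)
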
#===============================================================================

def midi_to_colab_audio(midi_file,
                        soundfont_path='/usr/share/sounds/sf2/FluidR3_GM.sf2',
                        sample_rate=16000,
                        volume_level_db=-1,
                        trim_silence=True,
                        silence_threshold=0.1,
                        output_for_gradio=False,
                        write_audio_to_WAV=False
                       ):
    """
    Returns raw audio to pass to the IPython.display.Audio function.

    Example usage:

    from IPython.display import display, Audio

    display(Audio(raw_audio, rate=16000, normalize=False))
    """

    # Check if midi_file is a path (string) or file content (bytes)
    if isinstance(midi_file, str):
        # It's a file path, open and read it.
        try:
            with open(midi_file, 'rb') as f:
                midi_bytes = f.read()
        except FileNotFoundError:
            print(f"Error: Could not find or open the file at {midi_file}")
            return None # Or handle the error appropriately
    elif isinstance(midi_file, bytes):
        # It's already the file content.
        midi_bytes = midi_file
    else:
        raise TypeError("midi_file must be a file path (str) or file content (bytes)")

    # Read and decode MIDI → opus event list from bytes
    ticks_per_beat, *tracks = MIDI.midi2opus(midi_bytes)
    if not tracks:
        return None

    # Flatten & convert delta-times to absolute-time
    events = []
    for track in tracks:
        abs_t = 0
        for name, dt, *data in track:
            abs_t += dt
            events.append([name, abs_t, *data])
    events.sort(key=lambda e: e[1])

    # Setup FluidSynth
    fl = fluidsynth.Synth(samplerate=float(sample_rate))
    sfid = fl.sfload(soundfont_path)
    for chan in range(16):
        # channel 9 = percussion GM bank 128
        fl.program_select(chan, sfid, 128 if chan == 9 else 0, 0)

    # Playback vars
    tempo = int((60 / 120) * 1e6)  # default 120bpm
    last_t = 0
    
    # Initialize a Python list to store audio chunks
    audio_chunks = []

    for name, cur_t, *data in events:
        # compute how many samples have passed since the last event
        delta_ticks = cur_t - last_t
        last_t = cur_t
        dt_seconds = (delta_ticks / ticks_per_beat) * (tempo / 1e6)
        sample_len = int(dt_seconds * sample_rate)

        if sample_len > 0:
            buf = fl.get_samples(sample_len).reshape(-1, 2)
            # Append the audio chunk to the list
            audio_chunks.append(buf)

        # Dispatch every known event
        if name == "note_on" and data[2] > 0:
            chan, note, vel = data
            fl.noteon(chan, note, vel)

        elif name == "note_off" or (name == "note_on" and data[2] == 0):
            chan, note = data[:2]
            fl.noteoff(chan, note)

        elif name == "patch_change":
            chan, patch = data[:2]
            bank = 128 if chan == 9 else 0
            fl.program_select(chan, sfid, bank, patch)

        elif name == "control_change":
            chan, ctrl, val = data[:3]
            fl.cc(chan, ctrl, val)

        elif name == "key_after_touch":
            chan, note, vel = data
            fl.key_pressure(chan, note, vel)

        elif name == "channel_after_touch":
            chan, vel = data
            fl.channel_pressure(chan, vel)

        elif name == "pitch_wheel_change":
            chan, wheel = data
            fl.pitch_bend(chan, wheel)

        elif name == "song_position":
            # song_pos = data[0];  # often not needed for playback
            pass

        elif name == "song_select":
            # song_number = data[0]
            pass

        elif name == "tune_request":
            # typically resets tuning; FS handles internally
            pass

        elif name in ("sysex_f0", "sysex_f7"):
            raw_bytes = data[0]
            fl.sysex(raw_bytes)

        # Meta events & others—no direct audio effect, so we skip or log
        elif name in (
            "set_tempo",       # handled below
            "end_track",
            "text_event", "text_event_08", "text_event_09", "text_event_0a",
            "text_event_0b", "text_event_0c", "text_event_0d", "text_event_0e", "text_event_0f",
            "copyright_text_event", "track_name", "instrument_name",
            "lyric", "marker", "cue_point",
            "smpte_offset", "time_signature", "key_signature",
            "sequencer_specific", "raw_meta_event"
        ):
            if name == "set_tempo":
                tempo = data[0]
            # else: skip all other meta & text; you could hook in logging here
            continue

        else:
            # unknown event type
            continue

    # Render a short audio tail after the final event so the last notes can
    # decay naturally. Two seconds is a safe default; a shorter tail such as
    # one second may be sufficient.
    tail_len_seconds = 2
    tail_buf = fl.get_samples(int(sample_rate * tail_len_seconds)).reshape(-1, 2)
    audio_chunks.append(tail_buf)
    
    # Cleanup synth
    fl.delete()
    
    # After the loop finishes, concatenate all audio chunks in a single operation
    if not audio_chunks:
        return None # No audio was generated
    ss = np.concatenate(audio_chunks, axis=0)
    

    # Optimized silence trimming logic
    if trim_silence and ss.size:
        # Using a fixed amplitude threshold based on the data type's max value.
        # This is more robust than using standard deviation for trimming the tail.
        dtype_max = np.iinfo(ss.dtype).max
        fixed_threshold = int(dtype_max * 0.005) # 0.5% of max amplitude
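        # for int16 audio this is int(32767 * 0.005) = 163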
        
        # Find the first and last samples exceeding the threshold.
        indices = np.where(np.abs(ss) > fixed_threshold)[0]
        if indices.size > 0:
            # We trim from the start as well in case of leading silence
            first_idx = indices[0]
            last_idx = indices[-1]
            ss = ss[first_idx : last_idx + 1]
        else:
            # If it's all silence, return an empty array.
            ss = np.empty((0, 2), dtype=ss.dtype)

    if ss.size:
        maxv = np.abs(ss).max()
        if maxv:
            ss = (ss / maxv) * np.iinfo(np.int16).max
    ss = ss.astype(np.int16)

    # For Gradio you might want raw int16 PCM
    if output_for_gradio:
        return ss

    # Swap to (channels, samples) and normalize for playback
    ss = ss.T
    raw_audio = normalize_audio(ss, target_level_db=volume_level_db)

    # Optionally write WAV to disk
    if write_audio_to_WAV and isinstance(midi_file, str):
        wav_name = midi_file.rsplit('.', 1)[0] + '.wav'
        # Note: raw_audio is float, needs conversion back to int16 for WAV format.
        if np.max(np.abs(raw_audio)) > 0:
            pcm = np.int16(raw_audio.T / np.max(np.abs(raw_audio)) * 32767)
        else:
            pcm = np.int16(raw_audio.T * 32767)
            
        with wave.open(wav_name, 'wb') as wf:
            wf.setframerate(sample_rate)
            wf.setsampwidth(2)
            wf.setnchannels(pcm.shape[1])
            wf.writeframes(pcm.tobytes())

    return raw_audio

#===================================================================================================================
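
# Minimal smoke test (a sketch; 'input.mid' is a placeholder path, so adjust it
# to your setup and make sure fluidsynth and a GM soundfont are installed):

if __name__ == '__main__':

    import sys

    midi_path = sys.argv[1] if len(sys.argv) > 1 else 'input.mid'

    # Render the MIDI file and also write a .wav next to it
    audio = midi_to_colab_audio(midi_path, write_audio_to_WAV=True)

    if audio is not None:
        print('Rendered', audio.shape[1], 'samples per channel at 16000 Hz')
    else:
        print('Could not render', midi_path)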