# =================================================================
#
# Merged and Integrated Script for Audio/MIDI Processing and Rendering (Stereo Enhanced)
#
# This script combines two functionalities:
# 1. Transcribing audio to MIDI using two methods:
#    a) A general-purpose model (basic-pitch by Spotify).
#    b) A model specialized for solo piano (ByteDance).
#    - Includes stereo processing by splitting channels, transcribing independently, and merging MIDI.
# 2. Applying advanced transformations and re-rendering MIDI files using:
#    a) Standard SoundFonts via FluidSynth (produces stereo audio).
#    b) A custom 8-bit style synthesizer for a chiptune sound (updated for stereo output).
#
# The user can upload an audio file (e.g., WAV, MP3) or a MIDI file.
# - If an audio file is uploaded, it is first transcribed to MIDI using the selected method.
# - The resulting MIDI (or an uploaded MIDI) can then be processed
#   with various effects and rendered into audio.
#
# =================================================================
# Original sources:
# https://huggingface.co/spaces/asigalov61/ByteDance-Solo-Piano-Audio-to-MIDI-Transcription
# https://huggingface.co/spaces/asigalov61/Advanced-MIDI-Renderer
# =================================================================
# Packages:
#
#   sudo apt install fluidsynth
#
# =================================================================
# Requirements:
#
#   pip install gradio torch pytz numpy scipy matplotlib networkx scikit-learn
#   pip install piano_transcription_inference huggingface_hub
#   pip install basic-pitch pretty_midi librosa soundfile
#
# =================================================================
# Core modules:
#
#   git clone --depth 1 https://github.com/asigalov61/tegridy-tools
#
# =================================================================

import io
import os
import hashlib
import time as reqtime
import copy

import librosa
import pyloudnorm as pyln
import soundfile as sf
import torch
import ffmpeg

import gradio as gr

# --- Imports for Vocal Separation ---
import torchaudio
from demucs.apply import apply_model
from demucs.pretrained import get_model
from demucs.audio import convert_audio

from src.piano_transcription.utils import initialize_app
from piano_transcription_inference import PianoTranscription, utilities, sample_rate as transcription_sample_rate

# --- Import core transcription and MIDI processing libraries ---
from src import TMIDIX, TPLOTS
from src import MIDI
from src.midi_to_colab_audio import midi_to_colab_audio

# --- Imports for General Purpose Transcription (basic-pitch) ---
import basic_pitch
from basic_pitch.inference import predict
from basic_pitch import ICASSP_2022_MODEL_PATH

# --- Imports for 8-bit Synthesizer & MIDI Merging ---
import pretty_midi
import numpy as np
from scipy import signal

# =================================================================================================
# === Hugging Face SoundFont Downloader ===
# =================================================================================================
from huggingface_hub import hf_hub_download
import glob

# --- Define a constant for the 8-bit synthesizer option ---
SYNTH_8_BIT_LABEL = "None (8-bit Synthesizer)"

def prepare_soundfonts():
    """
    Ensures a default set of SoundFonts is downloaded, then scans the 'src/sf2'
    directory recursively for all .sf2 files.
    Returns a dictionary mapping a user-friendly name to its full file path,
    with default soundfonts listed first in their specified order.
Downloads soundfont files from the specified Hugging Face Space repository to a local 'src/sf2' directory if they don't already exist. Returns a list of local paths to the soundfont files. """ SF2_REPO_ID = "asigalov61/Advanced-MIDI-Renderer" SF2_DIR = "src/sf2" # This list is now just for ensuring default files exist # {"Super GM": 0, "Orpheus GM": 1, "Live HQ GM": 2, "Nice Strings + Orchestra": 3, "Real Choir": 4, "Super Game Boy": 5, "Proto Square": 6} DEFAULT_SF2_FILENAMES = [ "SGM-v2.01-YamahaGrand-Guit-Bass-v2.7.sf2", "Orpheus_18.06.2020.sf2", "Live HQ Natural SoundFont GM.sf2", "Nice-Strings-PlusOrchestra-v1.6.sf2", "KBH-Real-Choir-V2.5.sf2", "SuperGameBoy.sf2", "ProtoSquare.sf2" ] # Create the target directory if it doesn't exist os.makedirs(SF2_DIR, exist_ok=True) # --- Step 1: Ensure default SoundFonts are available --- print("Checking for SoundFont files...") for filename in DEFAULT_SF2_FILENAMES: local_path = os.path.join(SF2_DIR, filename) # Check if the file already exists locally to avoid re-downloading if not os.path.exists(local_path): print(f"Downloading '{filename}' from Hugging Face Hub...") try: # Use hf_hub_download to get the file # It will be downloaded to the specified local directory hf_hub_download( repo_id=SF2_REPO_ID, repo_type='space', # Specify that the repository is a Space filename=f"{filename}", # The path to the file within the repository local_dir=SF2_DIR, # local_dir_use_symlinks=False # Copy file to the dir for a clean folder structure ) print(f"'{filename}' downloaded successfully.") except Exception as e: print(f"Error downloading {filename}: {e}") # If download fails, we might not be able to use this soundfont # --- Step 2: Scan the entire directory for all .sf2 files --- print(f"Scanning '{SF2_DIR}' for all .sf2 files...") all_sfs_map = {} # Use glob with recursive=True to find all .sf2 files in subdirectories search_pattern = os.path.join(SF2_DIR, '**', '*.sf2') for full_path in glob.glob(search_pattern, recursive=True): # Create a user-friendly display name, including subfolder if it exists relative_path = os.path.relpath(full_path, SF2_DIR) display_name = os.path.splitext(relative_path)[0].replace("\\", "/") # Use forward slashes for consistency all_sfs_map[display_name] = full_path # --- Step 3: Create the final ordered dictionary based on priority --- ordered_soundfont_map = {} # Create display names for default files (filename without extension) default_display_names = [os.path.splitext(f)[0] for f in DEFAULT_SF2_FILENAMES] # Separate other files from the default ones other_display_names = [name for name in all_sfs_map.keys() if name not in default_display_names] other_display_names.sort() # Sort the rest alphabetically # Add default soundfonts first, maintaining the order from DEFAULT_SF2_FILENAMES for name in default_display_names: if name in all_sfs_map: # Check if the file was actually found by the scanner ordered_soundfont_map[name] = all_sfs_map[name] # Add all other soundfonts after the default ones for name in other_display_names: ordered_soundfont_map[name] = all_sfs_map[name] return ordered_soundfont_map # ================================================================================================= # === 8-bit Style Synthesizer (Stereo Enabled) === # ================================================================================================= def synthesize_8bit_style(midi_data, waveform_type, envelope_type, decay_time_s, pulse_width, vibrato_rate, vibrato_depth, bass_boost_level, fs=44100, smooth_notes_level=0.0, 
continuous_vibrato_level=0.0, noise_level=0.0, distortion_level=0.0, fm_modulation_depth=0.0, fm_modulation_rate=0.0): """ Synthesizes an 8-bit style audio waveform from a PrettyMIDI object. This function generates waveforms manually instead of using a synthesizer like FluidSynth. Includes an optional sub-octave bass booster with adjustable level. Instruments are panned based on their order in the MIDI file. Instrument 1 -> Left, Instrument 2 -> Right. Now supports graded levels for smoothing and vibrato continuity. """ total_duration = midi_data.get_end_time() # Initialize a stereo waveform buffer (2 channels: Left, Right) waveform = np.zeros((2, int(total_duration * fs) + fs)) num_instruments = len(midi_data.instruments) # Phase tracking: main oscillator phase for each instrument osc_phase = {} # Vibrato phase tracking vibrato_phase = 0.0 for i, instrument in enumerate(midi_data.instruments): # --- Panning Logic --- # Default to center-panned mono pan_l, pan_r = 0.707, 0.707 if num_instruments == 2: if i == 0: # First instrument panned left pan_l, pan_r = 1.0, 0.0 elif i == 1: # Second instrument panned right pan_l, pan_r = 0.0, 1.0 elif num_instruments > 2: if i == 0: # Left pan_l, pan_r = 1.0, 0.0 elif i == 1: # Right pan_l, pan_r = 0.0, 1.0 # Other instruments remain centered osc_phase[i] = 0.0 # Independent phase tracking for each instrument for note in instrument.notes: freq = pretty_midi.note_number_to_hz(note.pitch) note_duration = note.end - note.start num_samples = int(note_duration * fs) if num_samples <= 0: continue t = np.arange(num_samples) / fs # --- Graded Continuous Vibrato --- # This now interpolates between a fully reset vibrato and a fully continuous one. # Use accumulated phase to avoid vibrato reset per note vib_phase_inc = 2 * np.pi * vibrato_rate / fs per_note_vib_phase = 2 * np.pi * vibrato_rate * t continuous_vib_phase = vibrato_phase + np.arange(num_samples) * vib_phase_inc # Weighted average of the two phase types final_vib_phase = ( per_note_vib_phase * (1 - continuous_vibrato_level) + continuous_vib_phase * continuous_vibrato_level ) vibrato_lfo = vibrato_depth * np.sin(final_vib_phase) # Update the global vibrato phase for the next note if num_samples > 0: vibrato_phase = (continuous_vib_phase[-1] + vib_phase_inc) % (2 * np.pi) # --- Waveform Generation with FM --- fm_lfo = fm_modulation_depth * np.sin(2 * np.pi * fm_modulation_rate * t) modulated_freq = freq * (1 + fm_lfo) # --- Waveform Generation (Main Oscillator with phase continuity) --- phase_inc = 2 * np.pi * (modulated_freq + vibrato_lfo) / fs phase = osc_phase[i] + np.cumsum(phase_inc) if num_samples > 0: osc_phase[i] = phase[-1] % (2 * np.pi) # Store last phase if waveform_type == 'Square': note_waveform = signal.square(phase, duty=pulse_width) elif waveform_type == 'Sawtooth': note_waveform = signal.sawtooth(phase) else: # Triangle note_waveform = signal.sawtooth(phase, width=0.5) # --- Bass Boost (Sub-Octave Oscillator) --- if bass_boost_level > 0: bass_freq = freq / 2.0 # Only add bass if the frequency is reasonably audible if bass_freq > 20: # Bass uses a simple square wave, no vibrato, for stability bass_phase_inc = 2 * np.pi * bass_freq / fs bass_phase = np.cumsum(np.full(num_samples, bass_phase_inc)) bass_sub_waveform = signal.square(bass_phase, duty=0.5) # Mix the main and bass waveforms. # As bass level increases, slightly decrease main waveform volume to prevent clipping. 
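# --- Illustrative aside (hypothetical helper, not called anywhere in this script) ---
# The phase-accumulation trick used by the main oscillator above can be isolated into
# a tiny standalone sketch: carrying the running phase across note boundaries is what
# prevents the audible click that a hard phase reset would cause.
#
#   def continuous_square(freqs_hz, fs=44100, start_phase=0.0, duty=0.5):
#       """Click-free square wave for an array of per-sample frequencies."""
#       phase = start_phase + np.cumsum(2 * np.pi * np.asarray(freqs_hz) / fs)
#       return signal.square(phase, duty=duty), phase[-1] % (2 * np.pi)
#
# Feeding the returned phase back in as start_phase for the next note keeps the
# oscillator continuous, exactly as osc_phase[i] does in the loop above.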
main_level = 1.0 - (0.5 * bass_boost_level) note_waveform = (note_waveform * main_level) + (bass_sub_waveform * bass_boost_level) # --- Noise & Distortion Simulation (White Noise) --- if noise_level > 0: note_waveform += np.random.uniform(-1, 1, num_samples) * noise_level # --- Distortion (Wave Shaping) --- if distortion_level > 0: # Using a tanh function for a smoother, "warmer" distortion note_waveform = np.tanh(note_waveform * (1 + distortion_level * 5)) # --- ADSR Envelope --- start_amp = note.velocity / 127.0 envelope = np.zeros(num_samples) if envelope_type == 'Plucky (AD Envelope)': attack_samples = min(int(0.005 * fs), num_samples) decay_samples = min(int(decay_time_s * fs), num_samples - attack_samples) envelope[:attack_samples] = np.linspace(0, start_amp, attack_samples) if decay_samples > 0: envelope[attack_samples:attack_samples+decay_samples] = np.linspace(start_amp, 0, decay_samples) else: # Sustained envelope = np.linspace(start_amp, 0, num_samples) # --- Graded Note Smoothing --- # The level controls the length of the fade in/out. Max fade is 10ms. if smooth_notes_level > 0 and num_samples > 10: fade_length = int(fs * 0.01 * smooth_notes_level) fade_samples = min(fade_length, num_samples // 2) if fade_samples > 0: envelope[:fade_samples] *= np.linspace(0.5, 1.0, fade_samples) envelope[-fade_samples:] *= np.linspace(1.0, 0.0, fade_samples) # Apply envelope to the (potentially combined) waveform note_waveform *= envelope start_sample = int(note.start * fs) end_sample = start_sample + num_samples if end_sample > waveform.shape[1]: end_sample = waveform.shape[1] note_waveform = note_waveform[:end_sample-start_sample] # Add the mono note waveform to the stereo buffer with panning waveform[0, start_sample:end_sample] += note_waveform * pan_l waveform[1, start_sample:end_sample] += note_waveform * pan_r return waveform # Returns a (2, N) numpy array def analyze_midi_velocity(midi_path): midi = pretty_midi.PrettyMIDI(midi_path) all_velocities = [] print(f"Analyzing velocity for MIDI: {midi_path}") for i, instrument in enumerate(midi.instruments): velocities = [note.velocity for note in instrument.notes] all_velocities.extend(velocities) if velocities: print(f"Instrument {i} ({instrument.name}):") print(f" Notes count: {len(velocities)}") print(f" Velocity min: {min(velocities)}") print(f" Velocity max: {max(velocities)}") print(f" Velocity mean: {np.mean(velocities):.2f}") else: print(f"Instrument {i} ({instrument.name}): no notes found.") if all_velocities: print("\nOverall MIDI velocity stats:") print(f" Total notes: {len(all_velocities)}") print(f" Velocity min: {min(all_velocities)}") print(f" Velocity max: {max(all_velocities)}") print(f" Velocity mean: {np.mean(all_velocities):.2f}") else: print("No notes found in this MIDI.") def scale_instrument_velocity(instrument, scale=0.8): for note in instrument.notes: note.velocity = max(1, min(127, int(note.velocity * scale))) def normalize_loudness(audio_data, sample_rate, target_lufs=-23.0): """ Normalizes the audio data to a target integrated loudness (LUFS). This provides more consistent perceived volume than peak normalization. Args: audio_data (np.ndarray): The audio signal. sample_rate (int): The sample rate of the audio. target_lufs (float): The target loudness in LUFS. Defaults to -23.0, a common standard for broadcast. Returns: np.ndarray: The loudness-normalized audio data. """ try: # 1. 
Measure the integrated loudness of the input audio meter = pyln.Meter(sample_rate) # create meter loudness = meter.integrated_loudness(audio_data) # 2. Calculate the gain needed to reach the target loudness # The gain is applied in the linear domain, so we convert from dB loudness_gain_db = target_lufs - loudness loudness_gain_linear = 10.0 ** (loudness_gain_db / 20.0) # 3. Apply the gain normalized_audio = audio_data * loudness_gain_linear # 4. Final safety check: peak normalize to prevent clipping, just in case # the loudness normalization results in peaks > 1.0 peak_val = np.max(np.abs(normalized_audio)) if peak_val > 1.0: normalized_audio /= peak_val print(f"Warning: Loudness normalization resulted in clipping. Audio was peak-normalized as a safeguard.") print(f"Audio normalized from {loudness:.2f} LUFS to target {target_lufs} LUFS.") return normalized_audio except Exception as e: print(f"Loudness normalization failed: {e}. Falling back to original audio.") return audio_data # ================================================================================================= # === MIDI Merging Function === # ================================================================================================= def merge_midis(midi_path_left, midi_path_right, output_path): """ Merges two MIDI files into a single MIDI file. This robust version iterates through ALL instruments in both MIDI files, ensuring no data is lost if the source files are multi-instrumental. It applies hard-left panning (Pan=0) to every instrument from the left MIDI and hard-right panning (Pan=127) to every instrument from the right MIDI. """ try: analyze_midi_velocity(midi_path_left) analyze_midi_velocity(midi_path_right) midi_left = pretty_midi.PrettyMIDI(midi_path_left) midi_right = pretty_midi.PrettyMIDI(midi_path_right) merged_midi = pretty_midi.PrettyMIDI() # --- Process ALL instruments from the left channel MIDI --- if midi_left.instruments: print(f"Found {len(midi_left.instruments)} instrument(s) in the left channel MIDI.") # Use a loop to iterate through every instrument for instrument in midi_left.instruments: scale_instrument_velocity(instrument, scale=0.8) # To avoid confusion, we can prefix the instrument name instrument.name = f"Left - {instrument.name if instrument.name else 'Instrument'}" # Create and add the Pan Left control change # Create a Control Change event for Pan (controller number 10). # Set its value to 0 for hard left panning. # Add it at the very beginning of the track (time=0.0). pan_left = pretty_midi.ControlChange(number=10, value=0, time=0.0) # Use insert() to ensure the pan event is the very first one instrument.control_changes.insert(0, pan_left) # Append the fully processed instrument to the merged MIDI merged_midi.instruments.append(instrument) # --- Process ALL instruments from the right channel MIDI --- if midi_right.instruments: print(f"Found {len(midi_right.instruments)} instrument(s) in the right channel MIDI.") # Use a loop here as well for instrument in midi_right.instruments: scale_instrument_velocity(instrument, scale=0.8) instrument.name = f"Right - {instrument.name if instrument.name else 'Instrument'}" # Create and add the Pan Right control change # Create a Control Change event for Pan (controller number 10). # Set its value to 127 for hard right panning. # Add it at the very beginning of the track (time=0.0). 
pan_right = pretty_midi.ControlChange(number=10, value=127, time=0.0) instrument.control_changes.insert(0, pan_right) merged_midi.instruments.append(instrument) merged_midi.write(output_path) print(f"Successfully merged all instruments and panned into '{os.path.basename(output_path)}'") analyze_midi_velocity(output_path) return output_path except Exception as e: print(f"Error merging MIDI files: {e}") # Fallback logic remains the same if os.path.exists(midi_path_left): print("Fallback: Using only the left channel MIDI.") return midi_path_left return None # ================================================================================================= # === Stage 1: Audio to MIDI Transcription Functions === # ================================================================================================= def TranscribePianoAudio(input_file): """ Transcribes a WAV or MP3 audio file of a SOLO PIANO performance into a MIDI file. This uses the ByteDance model. Args: input_file_path (str): The path to the input audio file. Returns: str: The file path of the generated MIDI file. """ print('=' * 70) print('STAGE 1: Starting Piano-Specific Transcription') print('=' * 70) # Generate a unique output filename for the MIDI fn = os.path.basename(input_file) fn1 = fn.split('.')[0] # Use os.path.join to create a platform-independent directory path output_dir = os.path.join("output", "transcribed_piano_") out_mid_path = os.path.join(output_dir, fn1 + '.mid') # Check for the directory's existence and create it if necessary if not os.path.exists(output_dir): os.makedirs(output_dir) print('-' * 70) print(f'Input file name: {fn}') print(f'Output MIDI path: {out_mid_path}') print('-' * 70) # Load audio using the utility function print('Loading audio...') (audio, _) = utilities.load_audio(input_file, sr=transcription_sample_rate, mono=True) print('Audio loaded successfully.') print('-' * 70) # Initialize the transcription model # Use 'cuda' if a GPU is available and configured, otherwise 'cpu' device = 'cuda' if torch.cuda.is_available() else 'cpu' print(f'Loading transcriptor model... device= {device}') transcriptor = PianoTranscription(device=device, checkpoint_path="src/models/CRNN_note_F1=0.9677_pedal_F1=0.9186.pth") print('Transcriptor loaded.') print('-' * 70) # Perform transcription print('Transcribing audio to MIDI (Piano-Specific)...') # This function call saves the MIDI file to the specified path transcriptor.transcribe(audio, out_mid_path) print('Piano transcription complete.') print('=' * 70) # Return the path to the newly created MIDI file return out_mid_path def TranscribeGeneralAudio(input_file, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool): """ Transcribes a general audio file into a MIDI file using basic-pitch. This is suitable for various instruments and vocals. 
""" print('=' * 70) print('STAGE 1: Starting General Purpose Transcription') print('=' * 70) fn = os.path.basename(input_file) fn1 = fn.split('.')[0] output_dir = os.path.join("output", "transcribed_general_") out_mid_path = os.path.join(output_dir, fn1 + '.mid') os.makedirs(output_dir, exist_ok=True) print(f'Input file: {fn}\nOutput MIDI: {out_mid_path}') # --- Perform transcription using basic-pitch --- print('Transcribing audio to MIDI (General Purpose)...') # The predict function handles audio loading internally model_output, midi_data, note_events = basic_pitch.inference.predict( audio_path=input_file, model_or_model_path=ICASSP_2022_MODEL_PATH, onset_threshold=onset_thresh, frame_threshold=frame_thresh, minimum_note_length=min_note_len, minimum_frequency=min_freq, maximum_frequency=max_freq, infer_onsets=infer_onsets_bool, melodia_trick=melodia_trick_bool, multiple_pitch_bends=multiple_bends_bool ) # --- Save the MIDI file --- midi_data.write(out_mid_path) print('General transcription complete.') print('=' * 70) return out_mid_path # ================================================================================================= # === Stage 2: MIDI Transformation and Rendering Function === # ================================================================================================= def Render_MIDI(input_midi_path, render_type, soundfont_bank, render_sample_rate, render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align, render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums, # --- 8-bit synth params --- s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s, s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, s8bit_bass_boost_level, s8bit_smooth_notes_level, s8bit_continuous_vibrato_level, s8bit_noise_level, s8bit_distortion_level, s8bit_fm_modulation_depth, s8bit_fm_modulation_rate ): """ Processes and renders a MIDI file according to user-defined settings. Can render using SoundFonts or a custom 8-bit synthesizer. Args: input_midi_path (str): The path to the input MIDI file. All other arguments are rendering options from the Gradio UI. Returns: A tuple containing all the output elements for the Gradio UI. """ print('*' * 70) print('STAGE 2: Starting MIDI Rendering') print('*' * 70) # --- File and Settings Setup --- fn = os.path.basename(input_midi_path) fn1 = fn.split('.')[0] # Use os.path.join to create a platform-independent directory path output_dir = os.path.join("output", "rendered_midi") if not os.path.exists(output_dir): os.makedirs(output_dir) # Now, join the clean directory path with the filename new_fn_path = os.path.join(output_dir, fn1 + '_rendered.mid') try: with open(input_midi_path, 'rb') as f: fdata = f.read() input_midi_md5hash = hashlib.md5(fdata).hexdigest() except FileNotFoundError: # Handle cases where the input file might not exist print(f"Error: Input MIDI file not found at {input_midi_path}") return [None] * 7 # Return empty values for all outputs print('=' * 70) print('Requested settings:') print(f'Input MIDI file name: {fn}') print(f'Input MIDI md5 hash: {input_midi_md5hash}') print('-' * 70) print(f'Render type: {render_type}') print(f'Soundfont bank: {soundfont_bank}') print(f'Audio render sample rate: {render_sample_rate}') # ... (add other print statements for settings if needed) print('=' * 70) # --- MIDI Processing using TMIDIX --- print('Processing MIDI... 
Please wait...') raw_score = MIDI.midi2single_track_ms_score(fdata) # call the function and store the returned list in a variable. processed_scores = TMIDIX.advanced_score_processor(raw_score, return_enhanced_score_notes=True, apply_sustain=render_with_sustains) # check if the returned list is empty. This happens when transcription finds no notes. # This check prevents the 'IndexError: list index out of range'. if not processed_scores: # If it is empty, print a warning and return a user-friendly error message. print("Warning: MIDI file contains no processable notes.") # The number of returned values must match the function's expected output. # We return a tuple with empty or placeholder values for all 7 output components. return ("N/A", fn1, "MIDI file contains no notes.", None, None, None, "No notes found.") # If the list is not empty, it is now safe to get the first element. escore = processed_scores[0] # Handle cases where the MIDI might not contain any notes if not escore: print("Warning: MIDI file contains no processable notes.") return ("N/A", fn1, "MIDI file contains no notes.",None, None, None, "No notes found.") # This line will now work correctly because merge_misaligned_notes is guaranteed to be an integer. if merge_misaligned_notes > 0: escore = TMIDIX.merge_escore_notes(escore, merge_threshold=merge_misaligned_notes) escore = TMIDIX.augment_enhanced_score_notes(escore, timings_divider=1) first_note_index = [e[0] for e in raw_score[1]].index('note') cscore = TMIDIX.chordify_score([1000, escore]) meta_data = raw_score[1][:first_note_index] + [escore[0]] + [escore[-1]] + [raw_score[1][-1]] aux_escore_notes = TMIDIX.augment_enhanced_score_notes(escore, sort_drums_last=True) song_description = TMIDIX.escore_notes_to_text_description(aux_escore_notes) print('Done!') print('=' * 70) print('Input MIDI metadata:', meta_data[:5]) print('=' * 70) print('Input MIDI song description:', song_description) print('=' * 70) print('Processing...Please wait...') # A deep copy of the score to be modified output_score = copy.deepcopy(escore) # Apply transformations based on render_type if render_type == "Extract melody": output_score = TMIDIX.add_melody_to_enhanced_score_notes(escore, return_melody=True) output_score = TMIDIX.recalculate_score_timings(output_score) elif render_type == "Flip": output_score = TMIDIX.flip_enhanced_score_notes(escore) elif render_type == "Reverse": output_score = TMIDIX.reverse_enhanced_score_notes(escore) elif render_type == 'Repair Durations': output_score = TMIDIX.fix_escore_notes_durations(escore, min_notes_gap=0) elif render_type == 'Repair Chords': fixed_cscore = TMIDIX.advanced_check_and_fix_chords_in_chordified_score(cscore)[0] output_score = TMIDIX.flatten(fixed_cscore) elif render_type == 'Remove Duplicate Pitches': output_score = TMIDIX.remove_duplicate_pitches_from_escore_notes(escore) elif render_type == "Add Drum Track": nd_escore = [e for e in escore if e[3] != 9] nd_escore = TMIDIX.augment_enhanced_score_notes(nd_escore) output_score = TMIDIX.advanced_add_drums_to_escore_notes(nd_escore) for e in output_score: e[1] *= 16 e[2] *= 16 print('MIDI processing complete.') print('=' * 70) # --- Final Processing and Patching --- if render_type != "Render as-is": print('Applying final adjustments (transpose, align, patch)...') if custom_render_patch != -1: # -1 indicates no change for e in output_score: if e[3] != 9: # not a drum channel e[6] = custom_render_patch if render_transpose_value != 0: output_score = TMIDIX.transpose_escore_notes(output_score, 
render_transpose_value) if render_transpose_to_C4: output_score = TMIDIX.transpose_escore_notes_to_pitch(output_score, 60) # C4 is MIDI pitch 60 if render_align == "Start Times": output_score = TMIDIX.recalculate_score_timings(output_score) output_score = TMIDIX.align_escore_notes_to_bars(output_score) elif render_align == "Start Times and Durations": output_score = TMIDIX.recalculate_score_timings(output_score) output_score = TMIDIX.align_escore_notes_to_bars(output_score, trim_durations=True) elif render_align == "Start Times and Split Durations": output_score = TMIDIX.recalculate_score_timings(output_score) output_score = TMIDIX.align_escore_notes_to_bars(output_score, split_durations=True) if render_type == "Longest Repeating Phrase": zscore = TMIDIX.recalculate_score_timings(output_score) lrno_score = TMIDIX.escore_notes_lrno_pattern_fast(zscore) if lrno_score is not None: output_score = lrno_score else: output_score = TMIDIX.recalculate_score_timings(TMIDIX.escore_notes_middle(output_score, 50)) if render_type == "Multi-Instrumental Summary": zscore = TMIDIX.recalculate_score_timings(output_score) c_escore_notes = TMIDIX.compress_patches_in_escore_notes_chords(zscore) if len(c_escore_notes) > 128: cmatrix = TMIDIX.escore_notes_to_image_matrix(c_escore_notes, filter_out_zero_rows=True, filter_out_duplicate_rows=True) smatrix = TPLOTS.square_image_matrix(cmatrix, num_pca_components=max(1, min(5, len(c_escore_notes) // 128))) output_score = TMIDIX.image_matrix_to_original_escore_notes(smatrix) for o in output_score: o[1] *= 250 o[2] *= 250 if render_output_as_solo_piano: output_score = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=(not render_remove_drums)) if render_remove_drums and not render_output_as_solo_piano: output_score = TMIDIX.strip_drums_from_escore_notes(output_score) if render_type == "Solo Piano Summary": sp_escore_notes = TMIDIX.solo_piano_escore_notes(output_score, keep_drums=False) zscore = TMIDIX.recalculate_score_timings(sp_escore_notes) if len(zscore) > 128: bmatrix = TMIDIX.escore_notes_to_binary_matrix(zscore) cmatrix = TMIDIX.compress_binary_matrix(bmatrix, only_compress_zeros=True) smatrix = TPLOTS.square_binary_matrix(cmatrix, interpolation_order=max(1, min(5, len(zscore) // 128))) output_score = TMIDIX.binary_matrix_to_original_escore_notes(smatrix) for o in output_score: o[1] *= 200 o[2] *= 200 print('Final adjustments complete.') print('=' * 70) # --- Saving Processed MIDI File --- # Save the transformed MIDI data SONG, patches, _ = TMIDIX.patch_enhanced_score_notes(output_score) # The underlying function mistakenly adds a '.mid' extension. # We must pass the path without the extension to compensate. 
path_without_ext = new_fn_path.rsplit('.mid', 1)[0] TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(SONG, output_signature = 'Integrated-MIDI-Processor', output_file_name = path_without_ext, track_name='Processed Track', list_of_MIDI_patches=patches ) midi_to_render_path = new_fn_path else: # If "Render as-is", use the original MIDI data with open(new_fn_path, 'wb') as f: f.write(fdata) midi_to_render_path = new_fn_path # --- Audio Rendering --- print('Rendering final audio...') # Select sample rate srate = int(render_sample_rate) # --- Conditional Rendering Logic --- if soundfont_bank == SYNTH_8_BIT_LABEL: print("Using 8-bit style synthesizer...") try: # Load the MIDI file with pretty_midi for manual synthesis midi_data_for_synth = pretty_midi.PrettyMIDI(midi_to_render_path) # Synthesize the waveform # --- Passing new FX parameters to the synthesis function --- audio = synthesize_8bit_style( midi_data_for_synth, s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s, s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, s8bit_bass_boost_level, fs=srate, smooth_notes_level=s8bit_smooth_notes_level, continuous_vibrato_level=s8bit_continuous_vibrato_level, noise_level=s8bit_noise_level, distortion_level=s8bit_distortion_level, fm_modulation_depth=s8bit_fm_modulation_depth, fm_modulation_rate=s8bit_fm_modulation_rate ) # Normalize and prepare for Gradio peak_val = np.max(np.abs(audio)) if peak_val > 0: audio /= peak_val # Transpose from (2, N) to (N, 2) and convert to int16 for Gradio audio_out = (audio.T * 32767).astype(np.int16) except Exception as e: print(f"Error during 8-bit synthesis: {e}") return [None] * 7 else: print(f"Using SoundFont: {soundfont_bank}") # Get the full path from the global dictionary soundfont_path = soundfonts_dict.get(soundfont_bank) # Select soundfont if not soundfont_path or not os.path.exists(soundfont_path): # Error handling in case the selected file is not found error_msg = f"SoundFont '{soundfont_bank}' not found!" print(f"ERROR: {error_msg}") # Fallback to the first available soundfont if possible if soundfonts_dict: fallback_key = list(soundfonts_dict.keys())[0] soundfont_path = soundfonts_dict[fallback_key] print(f"Falling back to '{fallback_key}'.") else: # If no soundfonts are available at all, raise an error raise gr.Error("No SoundFonts are available for rendering!") with open(midi_to_render_path, 'rb') as f: midi_file_content = f.read() audio_out = midi_to_colab_audio(midi_file_content, soundfont_path=soundfont_path, # Use the dynamically found path sample_rate=srate, output_for_gradio=True ) print('Audio rendering complete.') print('=' * 70) # --- Preparing Outputs for Gradio --- with open(midi_to_render_path, 'rb') as f: new_md5_hash = hashlib.md5(f.read()).hexdigest() output_plot = TPLOTS.plot_ms_SONG(output_score, plot_title=f"Score of {fn1}", return_plt=True) output_midi_summary = str(meta_data) return new_md5_hash, fn1, output_midi_summary, midi_to_render_path, (srate, audio_out), output_plot, song_description def analyze_midi_features(midi_data): """ Analyzes a PrettyMIDI object to extract musical features for parameter recommendation. Args: midi_data (pretty_midi.PrettyMIDI): The MIDI data to analyze. Returns: dict or None: A dictionary containing features, or None if the MIDI is empty. Features: 'note_count', 'instruments_count', 'duration', 'note_density', 'avg_velocity', 'pitch_range'. 
""" all_notes = [note for instrument in midi_data.instruments for note in instrument.notes] note_count = len(all_notes) # Return None if the MIDI file has no notes to analyze. if note_count == 0: return None duration = midi_data.get_end_time() # Avoid division by zero for empty-duration MIDI files. if duration == 0: note_density = 0 else: note_density = note_count / duration # --- Calculate new required features --- avg_velocity = sum(note.velocity for note in all_notes) / note_count avg_pitch = sum(note.pitch for note in all_notes) / note_count avg_note_length = sum(note.end - note.start for note in all_notes) / note_count # Calculate pitch range if note_count > 1: min_pitch = min(note.pitch for note in all_notes) max_pitch = max(note.pitch for note in all_notes) pitch_range = max_pitch - min_pitch else: pitch_range = 0 return { 'note_count': note_count, 'instruments_count': len(midi_data.instruments), 'duration': duration, 'note_density': note_density, # Notes per second 'avg_velocity': avg_velocity, 'pitch_range': pitch_range, # In semitones 'avg_pitch': avg_pitch, 'avg_note_length': avg_note_length, } def determine_waveform_type(features): """ Determines the best waveform type based on analyzed MIDI features. - Square: Best for most general-purpose, bright melodies. - Sawtooth: Best for intense, heavy, or powerful leads and basses. - Triangle: Best for soft, gentle basses or flute-like sounds. Args: features (dict): The dictionary of features from analyze_midi_features. Returns: str: The recommended waveform type ('Square', 'Sawtooth', or 'Triangle'). """ # 1. Check for conditions that strongly suggest a Triangle wave (soft bassline) # MIDI Pitch 52 is ~G#3. If the average pitch is below this, it's likely a bass part. # If notes are long and the pitch range is narrow, it confirms a simple, melodic bassline. if features['avg_pitch'] <= 52 and features['avg_note_length'] >= 0.3 and features['pitch_range'] < 12: return "Triangle" # 2. Check for conditions that suggest a Sawtooth wave (intense/complex part) # High note density or a very wide pitch range often indicates an aggressive lead or a complex solo. # The sawtooth's rich harmonics are perfect for this. if features['note_density'] >= 6 or features['pitch_range'] >= 18: return "Sawtooth" # 3. Default to the most versatile waveform: Square return "Square" def recommend_8bit_params(midi_data, default_preset): """ Recommends 8-bit synthesizer parameters using a unified, factor-based model. This "AI" generates a sound profile based on normalized musical features. Args: midi_data (pretty_midi.PrettyMIDI): The MIDI data to analyze. default_preset (dict): A fallback preset if analysis fails. Returns: dict: A dictionary of recommended synthesizer parameters. """ features = analyze_midi_features(midi_data) if features is None: # Return a default preset if MIDI is empty or cannot be analyzed return default_preset # --- Rule-based Parameter Recommendation --- params = {} # --- 1. Core Timbre Selection --- # Intelligent Waveform Selection params['waveform_type'] = determine_waveform_type(features) # Determine pulse width *after* knowing the waveform. # This only applies if the waveform is Square. if params['waveform_type'] == 'Square': # For Square waves, use pitch complexity to decide pulse width. # Complex melodies get a thinner sound (0.3) for clarity. # Simpler melodies get a fuller sound (0.5). params['pulse_width'] = 0.3 if features['pitch_range'] > 30 else 0.5 else: # For Sawtooth or Triangle, pulse width is not applicable. Set a default. 
params['pulse_width'] = 0.5 # --- 2. Envelope and Rhythm --- # Determine envelope type based on note density is_plucky = features['note_density'] > 10 params['envelope_type'] = 'Plucky (AD Envelope)' if is_plucky else 'Sustained (Full Decay)' params['decay_time_s'] = 0.15 if is_plucky else 0.4 # --- 3. Modulation (Vibrato) --- # Vibrato depth and rate based on velocity and density params['vibrato_depth'] = min(max((features['avg_velocity'] - 60) / 20, 0), 10) # More velocity = more depth if features['note_density'] > 12: params['vibrato_rate'] = 7.0 # Very fast music -> frantic vibrato elif features['note_density'] > 6: params['vibrato_rate'] = 5.0 # Moderately fast music -> standard vibrato else: params['vibrato_rate'] = 3.0 # Slow music -> gentle vibrato # --- 4. Progressive/Graded Parameters using Normalization --- # Smooth notes level (0.0 to 1.0): More smoothing for denser passages. # Effective range: 3 to 8 notes/sec. params['smooth_notes_level'] = min(max((features['note_density'] - 3) / 5.0, 0.0), 1.0) # Smoothen notes in denser passages # Continuous vibrato level (0.0 to 1.0): Less dense passages get more lyrical, continuous vibrato. # Effective range: 5 to 10 notes/sec. (Inverted) params['continuous_vibrato_level'] = 1.0 - min(max((features['note_density'] - 5) / 5.0, 0.0), 1.0) # Lyrical (less dense) music gets connected vibrato # Noise level (0.0 to 0.1): Higher velocity passages get more "air" or "grit". # Effective range: velocity 50 to 90. params['noise_level'] = min(max((features['avg_velocity'] - 50) / 40.0, 0.0), 1.0) * 0.1 # Distortion level (0.0 to 0.1): Shorter notes get more distortion for punch. # Effective range: note length 0.5s down to 0.25s. (Inverted) if features['avg_note_length'] < 0.25: # Short, staccato notes params['distortion_level'] = 0.1 elif features['avg_note_length'] < 0.5: # Medium length notes params['distortion_level'] = 0.05 else: # Long, sustained notes params['distortion_level'] = 0.0 # Progressive FM modulation based on a combined complexity factor. # Normalizes note density and pitch range to a 0-1 scale. density_factor = min(max((features['note_density'] - 5) / 15, 0), 1) # Effective range 5-20 notes/sec range_factor = min(max((features['pitch_range'] - 15) / 30, 0), 1) # Effective range 15-45 semitones # The overall complexity is the average of these two factors. complexity_factor = (density_factor + range_factor) / 2 params['fm_modulation_depth'] = round(0.3 * complexity_factor, 3) params['fm_modulation_rate'] = round(200 * complexity_factor, 1) # Non-linear bass boost # REFINED LOGIC: Non-linear bass boost based on instrument count. # More instruments lead to less bass boost to avoid a muddy mix, # while solo or duo arrangements get a significant boost to sound fuller. # The boost level has a floor of 0.2 and a ceiling of 1.0. 
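# Worked example of the graded mappings above (hypothetical stem: 7 notes/s,
# average velocity 80, average note length 0.4 s, 3 instruments):
#   smooth_notes_level       = clamp((7 - 3) / 5.0)           = 0.8
#   continuous_vibrato_level = 1 - clamp((7 - 5) / 5.0)       = 0.6
#   noise_level              = clamp((80 - 50) / 40.0) * 0.1  = 0.075
#   distortion_level (0.25 s <= 0.4 s < 0.5 s)                 = 0.05
#   bass_boost_level         = max(0.2, 1 - (3 - 1) * 0.15)   = 0.7
# where clamp() limits a value to the 0..1 range.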
params['bass_boost_level'] = max(0.2, 1.0 - (features['instruments_count'] - 1) * 0.15) # Round all float values for cleaner output for key, value in params.items(): if isinstance(value, float): params[key] = round(value, 3) return params # ================================================================================================= # === Main Application Logic === # ================================================================================================= # --- Helper function to encapsulate the transcription pipeline for a single audio file --- def _transcribe_stem(audio_path, base_name, temp_dir, # Pass all transcription-related parameters enable_stereo, transcription_method, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool): """ Takes a single audio file path and runs the full transcription pipeline on it. This includes stereo/mono handling and normalization. Returns the file path of the resulting transcribed MIDI. """ print(f"\n--- Transcribing Stem: {os.path.basename(audio_path)} ---") # Load the audio stem to process it audio_data, native_sample_rate = librosa.load(audio_path, sr=None, mono=False) if enable_stereo and audio_data.ndim == 2 and audio_data.shape[0] == 2: print("Stereo processing enabled for stem.") left_channel_np = audio_data[0] right_channel_np = audio_data[1] normalized_left = normalize_loudness(left_channel_np, native_sample_rate) normalized_right = normalize_loudness(right_channel_np, native_sample_rate) temp_left_path = os.path.join(temp_dir, f"{base_name}_left.flac") temp_right_path = os.path.join(temp_dir, f"{base_name}_right.flac") sf.write(temp_left_path, normalized_left, native_sample_rate) sf.write(temp_right_path, normalized_right, native_sample_rate) print(f"Saved left channel to: {temp_left_path}") print(f"Saved right channel to: {temp_right_path}") print("Transcribing left and right channel...") if transcription_method == "General Purpose": midi_path_left = TranscribeGeneralAudio(temp_left_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool) midi_path_right = TranscribeGeneralAudio(temp_right_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool) else: # Piano-Specific midi_path_left = TranscribePianoAudio(temp_left_path) midi_path_right = TranscribePianoAudio(temp_right_path) if midi_path_left and midi_path_right: merged_midi_path = os.path.join(temp_dir, f"{base_name}_merged.mid") return merge_midis(midi_path_left, midi_path_right, merged_midi_path) elif midi_path_left: print("Warning: Right channel transcription failed. Using left channel only.") return midi_path_left elif midi_path_right: print("Warning: Left channel transcription failed. 
Using right channel only.") return midi_path_right else: print(f"Warning: Stereo transcription failed for stem {base_name}.") return None else: print("Mono processing for stem.") mono_signal_np = np.mean(audio_data, axis=0) if audio_data.ndim > 1 else audio_data normalized_mono = normalize_loudness(mono_signal_np, native_sample_rate) temp_mono_path = os.path.join(temp_dir, f"{base_name}_mono.flac") sf.write(temp_mono_path, normalized_mono, native_sample_rate) if transcription_method == "General Purpose": return TranscribeGeneralAudio(temp_mono_path, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool) else: return TranscribePianoAudio(temp_mono_path) # --- The main processing function is now significantly refactored --- def process_and_render_file(input_file, # --- Pass the preset selector value --- s8bit_preset_selector, separate_vocals, remerge_vocals, transcription_target, # --- ADDED: New parameter from UI --- transcribe_both_stems, # --- Transcription params --- enable_stereo_processing, transcription_method, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool, # --- MIDI rendering params --- render_type, soundfont_bank, render_sample_rate, render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align, render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums, # --- 8-bit synth params --- s8bit_waveform_type, s8bit_envelope_type, s8bit_decay_time_s, s8bit_pulse_width, s8bit_vibrato_rate, s8bit_vibrato_depth, s8bit_bass_boost_level, s8bit_smooth_notes_level, s8bit_continuous_vibrato_level, s8bit_noise_level, s8bit_distortion_level, s8bit_fm_modulation_depth, s8bit_fm_modulation_rate ): """ Main function to handle file processing. It determines the file type and calls the appropriate functions for transcription and/or rendering based on user selections. """ start_time = reqtime.time() if input_file is None: # Return a list of updates to clear all output fields and UI controls return [gr.update(value=None)] * (7 + 13) # 7 results + 13 synth controls # The input_file from gr.Audio(type="filepath") is now the direct path (a string), # not a temporary file object. We no longer need to access the .name attribute. input_file_path = input_file filename = os.path.basename(input_file_path) print(f"Processing new file: {filename}") # This will store the other part if separation is performed other_part_tensor = None other_part_sr = None # --- Step 1: Check file type and transcribe if necessary --- if filename.lower().endswith(('.mid', '.midi', '.kar')): print("MIDI file detected. Cannot perform vocal separation. Proceeding directly to rendering.") midi_path_for_rendering = input_file_path else: print("Audio file detected. Starting pre-processing...") # --- Robust audio loading with ffmpeg fallback --- try: # Try loading directly with torchaudio (efficient for supported formats). # This works for formats like WAV, MP3, FLAC, OGG, etc. print("Attempting to load audio with torchaudio...") audio_tensor, native_sample_rate = torchaudio.load(input_file_path) print("Torchaudio loading successful.") except Exception as e: print(f"Torchaudio failed: {e}. Attempting fallback with ffmpeg...") try: # Use ffmpeg to convert the audio to WAV in-memory, then load the bytes. 
out, err = ( ffmpeg .input(input_file_path) .output('pipe:', format='flac') .run(capture_stdout=True, capture_stderr=True) ) # Load the WAV data from the in-memory buffer audio_tensor, native_sample_rate = torchaudio.load(io.BytesIO(out)) print("FFmpeg fallback successful.") except Exception as ffmpeg_err: # If both direct loading and ffmpeg fallback fail, raise an error. raise gr.Error(f"Failed to load audio file with both torchaudio and ffmpeg.\n" f"Torchaudio error: {e}\n" f"FFmpeg error: {ffmpeg_err.decode() if isinstance(ffmpeg_err, bytes) else ffmpeg_err}") base_name = os.path.splitext(filename)[0] temp_dir = "output/temp_transcribe" os.makedirs(temp_dir, exist_ok=True) # --- Demucs Vocal Separation Logic, now decides which stem to process --- if not separate_vocals: # --- Standard Workflow: Transcribe the original full audio --- print("Standard workflow: No vocal separation.") audio_to_transcribe_path = os.path.join(temp_dir, f"{base_name}_original.flac") torchaudio.save(audio_to_transcribe_path, audio_tensor, native_sample_rate) midi_path_for_rendering = _transcribe_stem( audio_to_transcribe_path, f"{base_name}_original", temp_dir, enable_stereo_processing, transcription_method, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool ) else: # --- Vocal Separation Workflow --- if demucs_model is None: raise gr.Error("Demucs model is not loaded. Cannot separate vocals.") # Convert to a common format (stereo, float32) that demucs expects audio_tensor = convert_audio(audio_tensor, native_sample_rate, demucs_model.samplerate, demucs_model.audio_channels) if torch.cuda.is_available(): audio_tensor = audio_tensor.cuda() print("Separating audio with Demucs... This may take some time.") # --- Wrap the model call in a no_grad() context --- with torch.no_grad(): all_stems = apply_model( demucs_model, audio_tensor[None], # The input shape is [batch, channels, samples] device='cuda' if torch.cuda.is_available() else 'cpu', progress=True, )[0] # Remove the batch dimension from the output # --- Clear CUDA cache immediately after use --- if torch.cuda.is_available(): torch.cuda.empty_cache() print("CUDA cache cleared.") # --- Robust stem handling to prevent CUDA errors --- # Instead of complex GPU indexing, we create a dictionary of stems on the CPU. # This is safer and more robust across different hardware. sources = {} for i, source_name in enumerate(demucs_model.sources): sources[source_name] = all_stems[i] vocals_tensor = sources['vocals'] # Sum the other stems to create the accompaniment. # This loop is safer than a single complex indexing operation. 
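# An equivalent vectorised form would be (illustrative only; assumes every stem tensor
# shares the same shape, which holds for stems separated from a single input):
#   accompaniment_tensor = torch.stack(
#       [stem for name, stem in sources.items() if name != 'vocals']
#   ).sum(dim=0)
# The explicit accumulation loop below avoids materialising the stacked tensor.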
accompaniment_tensor = torch.zeros_like(vocals_tensor) for source_name, stem_tensor in sources.items(): if source_name != 'vocals': accompaniment_tensor += stem_tensor # --- Save both stems to temporary files --- vocals_path = os.path.join(temp_dir, f"{base_name}_vocals.flac") accompaniment_path = os.path.join(temp_dir, f"{base_name}_accompaniment.flac") torchaudio.save(vocals_path, vocals_tensor.cpu(), demucs_model.samplerate) torchaudio.save(accompaniment_path, accompaniment_tensor.cpu(), demucs_model.samplerate) # --- Determine which stem is the primary target and which is the "other part" --- primary_target_path = vocals_path if transcription_target == "Transcribe Vocals" else accompaniment_path other_part_path = accompaniment_path if transcription_target == "Transcribe Vocals" else vocals_path # Store the audio tensor of the "other part" for potential audio re-merging other_part_tensor = accompaniment_tensor if transcription_target == "Transcribe Vocals" else vocals_tensor other_part_sr = demucs_model.samplerate print("Separation complete.") # --- Main Branching Logic: Transcribe one or both stems --- if not transcribe_both_stems: print(f"Transcribing primary target only: {os.path.basename(primary_target_path)}") midi_path_for_rendering = _transcribe_stem( primary_target_path, os.path.splitext(os.path.basename(primary_target_path))[0], temp_dir, enable_stereo_processing, transcription_method, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool ) else: print("Transcribing BOTH stems and merging the MIDI results.") # Transcribe the primary target midi_path_primary = _transcribe_stem( primary_target_path, os.path.splitext(os.path.basename(primary_target_path))[0], temp_dir, enable_stereo_processing, transcription_method, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool ) # Transcribe the other part midi_path_other = _transcribe_stem( other_part_path, os.path.splitext(os.path.basename(other_part_path))[0], temp_dir, enable_stereo_processing, transcription_method, onset_thresh, frame_thresh, min_note_len, min_freq, max_freq, infer_onsets_bool, melodia_trick_bool, multiple_bends_bool ) # Merge the two resulting MIDI files if midi_path_primary and midi_path_other: final_merged_midi_path = os.path.join(temp_dir, f"{base_name}_full_transcription.mid") print(f"Merging transcribed MIDI files into {os.path.basename(final_merged_midi_path)}") # A more robust MIDI merge is needed here primary_midi = pretty_midi.PrettyMIDI(midi_path_primary) other_midi = pretty_midi.PrettyMIDI(midi_path_other) # Add all instruments from the other midi to the primary one for instrument in other_midi.instruments: instrument.name = f"Other - {instrument.name}" # Rename to avoid confusion primary_midi.instruments.append(instrument) primary_midi.write(final_merged_midi_path) midi_path_for_rendering = final_merged_midi_path elif midi_path_primary: print("Warning: Transcription of the 'other' part failed. Using primary transcription only.") midi_path_for_rendering = midi_path_primary else: raise gr.Error("Transcription of the primary target failed. Aborting.") # --- Step 2: Render the FINAL MIDI file with selected options --- # --- Auto-Recommendation Logic --- # Store the original parameters from the UI sliders into a dictionary. # The keys in this dictionary match the keys returned by recommend_8bit_params. 
synth_params = { 'waveform_type': s8bit_waveform_type, 'pulse_width': s8bit_pulse_width, 'envelope_type': s8bit_envelope_type, 'decay_time_s': s8bit_decay_time_s, 'vibrato_rate': s8bit_vibrato_rate, 'vibrato_depth': s8bit_vibrato_depth, 'bass_boost_level': s8bit_bass_boost_level, 'smooth_notes_level': s8bit_smooth_notes_level, 'continuous_vibrato_level': s8bit_continuous_vibrato_level, 'noise_level': s8bit_noise_level, 'distortion_level': s8bit_distortion_level, 'fm_modulation_depth': s8bit_fm_modulation_depth, 'fm_modulation_rate': s8bit_fm_modulation_rate, } # This variable will hold the values to update the UI sliders ui_updates = {} # If the user selected the auto-recommend option, override the parameters if s8bit_preset_selector == "Auto-Recommend (Analyze MIDI)": print("Auto-Recommendation is enabled. Analyzing MIDI features...") try: midi_to_analyze = pretty_midi.PrettyMIDI(midi_path_for_rendering) default_params = S8BIT_PRESETS[FALLBACK_PRESET_NAME] recommended_params = recommend_8bit_params(midi_to_analyze, default_params) print("Recommended parameters:", recommended_params) # Both the synthesis parameters and the UI update values are set to the recommendations synth_params.update(recommended_params) ui_updates = recommended_params.copy() # Use a copy for UI updates except Exception as e: print(f"Could not auto-recommend parameters: {e}. Using default values from UI.") print(f"Proceeding to render MIDI file: {os.path.basename(midi_path_for_rendering)}") # --- Correctly pass parameters to Render_MIDI --- # The Render_MIDI function expects positional arguments, not keyword arguments. # We must unpack the values from our synth_params dictionary in the correct order. results = Render_MIDI(midi_path_for_rendering, render_type, soundfont_bank, render_sample_rate, render_with_sustains, merge_misaligned_notes, custom_render_patch, render_align, render_transpose_value, render_transpose_to_C4, render_output_as_solo_piano, render_remove_drums, # Unpack the values from the dictionary as positional arguments synth_params['waveform_type'], synth_params['envelope_type'], synth_params['decay_time_s'], synth_params['pulse_width'], synth_params['vibrato_rate'], synth_params['vibrato_depth'], synth_params['bass_boost_level'], synth_params['smooth_notes_level'], synth_params['continuous_vibrato_level'], synth_params['noise_level'], synth_params['distortion_level'], synth_params['fm_modulation_depth'], synth_params['fm_modulation_rate'] ) # --- Vocal Re-merging Logic now uses the generic "other_part" --- # IMPORTANT: This only runs if we did NOT transcribe both stems. 
if separate_vocals and remerge_vocals and not transcribe_both_stems and other_part_tensor is not None: print(f"Re-merging the non-transcribed part with newly rendered music...") rendered_srate, rendered_music_int16 = results[4] rendered_music_float = rendered_music_int16.astype(np.float32) / 32767.0 rendered_music_tensor = torch.from_numpy(rendered_music_float).T if rendered_srate != other_part_sr: resampler = torchaudio.transforms.Resample(rendered_srate, other_part_sr) rendered_music_tensor = resampler(rendered_music_tensor) len_music = rendered_music_tensor.shape[1] len_other = other_part_tensor.shape[1] if len_music > len_other: padding = len_music - len_other other_part_tensor = torch.nn.functional.pad(other_part_tensor, (0, padding)) elif len_other > len_music: padding = len_other - len_music rendered_music_tensor = torch.nn.functional.pad(rendered_music_tensor, (0, padding)) merged_audio_tensor = rendered_music_tensor + other_part_tensor.cpu() max_abs = torch.max(torch.abs(merged_audio_tensor)) if max_abs > 1.0: merged_audio_tensor /= max_abs merged_audio_int16 = (merged_audio_tensor.T.numpy() * 32767).astype(np.int16) new_results = list(results) new_results[4] = (other_part_sr, merged_audio_int16) results = tuple(new_results) print("Re-merging complete.") print(f'Total processing time: {(reqtime.time() - start_time):.2f} sec') print('*' * 70) # --- Prepare the final return value for Gradio --- # This list defines the order of UI components to be updated. # IT MUST MATCH THE ORDER IN `s8bit_updater_outputs` IN THE MAIN BLOCK. param_order = [ 'waveform_type', 'pulse_width', 'envelope_type', 'decay_time_s', 'vibrato_rate', 'vibrato_depth', 'bass_boost_level', 'smooth_notes_level', 'continuous_vibrato_level', 'noise_level', 'distortion_level', 'fm_modulation_depth', 'fm_modulation_rate' ] final_ui_updates = [] if ui_updates: # If auto-recommendation was successful # We have new values, so we create a list of these values in the correct order. for param in param_order: final_ui_updates.append(ui_updates.get(param)) else: # No auto-recommendation, so we tell Gradio not to change the UI. # We send a gr.update() for each UI component. for _ in param_order: final_ui_updates.append(gr.update()) # The final return is a combination of the result values and the UI update values. return list(results) + final_ui_updates # ================================================================================================= # === Gradio UI Setup === # ================================================================================================= if __name__ == "__main__": # Initialize the app: download model (if needed) and apply patches # Set to False if you don't have 'requests' or 'tqdm' installed initialize_app() # --- Prepare soundfonts and make the map globally accessible --- global soundfonts_dict # On application start, download SoundFonts from Hugging Face Hub if they don't exist. soundfonts_dict = prepare_soundfonts() print(f"Found {len(soundfonts_dict)} local SoundFonts.") if not soundfonts_dict: print("\nWARNING: No SoundFonts were found or could be downloaded.") print("Rendering with SoundFonts will fail. 
Only the 8-bit synthesizer will be available.") # --- Pre-load the Demucs model on startup for efficiency --- print("Loading Demucs model (htdemucs_ft), this may take a moment on first run...") try: demucs_model = get_model(name='htdemucs_ft') if torch.cuda.is_available(): demucs_model = demucs_model.cuda() print("Demucs model loaded successfully.") except Exception as e: print(f"Warning: Could not load Demucs model. Vocal separation will not be available. Error: {e}") demucs_model = None # --- Define a constant for the fallback preset name --- # This prevents errors if the preset name is changed in the dictionary. FALLBACK_PRESET_NAME = "Generic Chiptune Loop" # --- Data structure for 8-bit synthesizer presets --- # Comprehensive preset dictionary with new FX parameters for all presets # Comprehensive preset dictionary including new JRPG and Handheld classics # Note: Vibrato depth is mapped to a representative value on the 0-50 Hz slider. S8BIT_PRESETS = { # --- Classic Chiptune --- "Mario (Super Mario Bros / スーパーマリオブラザーズ)": { # Description: A bright square wave with a per-note vibrato, producing the classic bouncy platformer sound. 'waveform_type': 'Square', 'pulse_width': 0.3, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25, 'vibrato_rate': 5.0, 'vibrato_depth': 5, 'smooth_notes_level': 0.8, 'continuous_vibrato_level': 0.25, 'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Mega Man (Rockman / ロックマン)": { # Description: A thin, sharp square wave lead with fast vibrato, iconic for its driving, heroic melodies. 'waveform_type': 'Square', 'pulse_width': 0.2, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, 'vibrato_rate': 6.0, 'vibrato_depth': 8, 'smooth_notes_level': 0.9, 'continuous_vibrato_level': 0.85, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.05, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Zelda (The Legend of Zelda / ゼルダの伝説)": { # Description: The classic pure triangle wave lead, perfect for heroic and adventurous overworld themes. 'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.3, 'vibrato_rate': 4.5, 'vibrato_depth': 4, 'smooth_notes_level': 0.9, 'continuous_vibrato_level': 0.9, 'bass_boost_level': 0.15, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Kirby's Bubbly Melody (Hoshi no Kirby / 星のカービィ)": { # Description: A soft, round square wave with a bouncy vibrato, creating a cheerful and adorable sound. 'waveform_type': 'Square', 'pulse_width': 0.4, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, 'vibrato_rate': 6.0, 'vibrato_depth': 4, 'smooth_notes_level': 0.85, 'continuous_vibrato_level': 0.3, # Formerly False (0.0); adds a hint of continuity for more liveliness. 'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Pokémon (Game Boy Classics / ポケットモンスター)": { # Description: A full, friendly square wave sound, capturing the cheerful and adventurous spirit of early handheld RPGs. 
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22, 'vibrato_rate': 5.0, 'vibrato_depth': 5, 'smooth_notes_level': 0.9, 'continuous_vibrato_level': 0.9, 'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Castlevania (Akumajō Dracula / 悪魔城ドラキュラ)": { # Description: A sharp square wave with dramatic vibrato, ideal for fast, gothic, and baroque-inspired melodies. 'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, 'vibrato_rate': 6.5, 'vibrato_depth': 6, 'smooth_notes_level': 0.85, 'continuous_vibrato_level': 0.85, 'bass_boost_level': 0.35, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Final Fantasy (Arpeggio / ファイナルファンタジー)": { # Description: A perfect, clean square wave with zero vibrato, creating the iconic, crystal-clear arpeggio sound. 'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.22, 'vibrato_rate': 5.0, 'vibrato_depth': 0, 'smooth_notes_level': 0.9, 'continuous_vibrato_level': 0.2, 'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "ONI V (Wafu Mystic / ONI V 隠忍を継ぐ者)": { # Description: A solemn triangle wave with a slow, expressive vibrato, evoking the mysterious atmosphere of Japanese folklore. 'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, 'vibrato_rate': 3.5, 'vibrato_depth': 3, 'smooth_notes_level': 0.9, 'continuous_vibrato_level': 0.85, 'bass_boost_level': 0.4, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, # --- Advanced System Impressions --- "Commodore 64 (SID Feel)": { # Description: (Impression) Uses high-speed, shallow vibrato to mimic the characteristic "buzzy" texture of the SID chip's PWM. 'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.25, 'vibrato_rate': 8.0, 'vibrato_depth': 4, 'smooth_notes_level': 0.9, 'continuous_vibrato_level': 0.3, 'bass_boost_level': 0.2, 'noise_level': 0.05, 'distortion_level': 0.1, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Megadrive/Genesis (FM Grit)": { # Description: (Impression) Uses FM, distortion, and noise to capture the gritty, metallic, and aggressive tone of the YM2612 chip. 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, 'vibrato_rate': 0.0, 'vibrato_depth': 0, 'smooth_notes_level': 0.0, 'continuous_vibrato_level': 0.9, 'bass_boost_level': 0.4, 'noise_level': 0.1, 'distortion_level': 0.2, 'fm_modulation_depth': 0.2, 'fm_modulation_rate': 150 }, "PC-98 (Touhou Feel / 東方Project)": { # Description: (Impression) A very sharp square wave with fast FM, emulating the bright, high-energy leads of Japanese PC games. 
'waveform_type': 'Square', 'pulse_width': 0.15, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.12, 'vibrato_rate': 7.5, 'vibrato_depth': 7, 'smooth_notes_level': 0.95, 'continuous_vibrato_level': 0.85, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.1, 'fm_modulation_rate': 200 }, "Roland SC-88 (GM Vibe)": { # Description: (Impression) A clean, stable triangle wave with no effects, mimicking the polished, sample-based sounds of General MIDI. 'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35, 'vibrato_rate': 0, 'vibrato_depth': 0, 'smooth_notes_level': 1.0, 'continuous_vibrato_level': 0.0, 'bass_boost_level': 0.1, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, # --- Action & Rock Leads --- "Falcom Ys (Rock Lead / イース)": { # Description: A powerful sawtooth with slight distortion, emulating the driving rock organ and guitar leads of action JRPGs. 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, 'vibrato_rate': 5.5, 'vibrato_depth': 6, 'smooth_notes_level': 0.85, 'continuous_vibrato_level': 0.8, 'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.15, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Arcade Brawler Lead (Street Fighter / ストリートファイター)": { # Description: A gritty sawtooth lead with a hard attack, capturing the high-energy feel of classic fighting games. 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.15, 'vibrato_rate': 5.0, 'vibrato_depth': 6, 'smooth_notes_level': 0.8, 'continuous_vibrato_level': 0.7, 'bass_boost_level': 0.4, 'noise_level': 0.05, 'distortion_level': 0.1, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Rhythm Pop Lead (Rhythm Tengoku / リズム天国)": { # Description: A clean, round square wave perfect for the snappy, catchy feel of rhythm games. 'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.18, 'vibrato_rate': 4.5, 'vibrato_depth': 4, 'smooth_notes_level': 0.9, # Formerly True -> 1.0; slightly reduced for a bit more attack. 'continuous_vibrato_level': 0.8, # Formerly True -> 1.0; slightly weakened for more defined note transitions. 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, # --- Epic & Orchestral Pads --- "Dragon Quest (Orchestral Feel / ドラゴンクエスト)": { # Description: A pure triangle wave with a long decay, mimicking the grand, orchestral feel of a classical flute or string section. 'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.6, 'vibrato_rate': 3.0, 'vibrato_depth': 4, 'smooth_notes_level': 0.9, 'continuous_vibrato_level': 0.9, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Mystic Mana Pad (Secret of Mana / 聖剣伝説2)": { # Description: A warm, ethereal square wave pad with slow vibrato, capturing a feeling of fantasy and wonder. 
'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5, 'vibrato_rate': 2.5, 'vibrato_depth': 4, 'smooth_notes_level': 1.0, 'continuous_vibrato_level': 0.95, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Modern JRPG Pad (Persona / ペルソナ)": { # Description: A warm, stylish square wave pad, capturing the modern, pop/jazz-infused feel of the Persona series. 'waveform_type': 'Square', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5, 'vibrato_rate': 2.5, 'vibrato_depth': 4, 'smooth_notes_level': 1.0, 'continuous_vibrato_level': 0.95, 'bass_boost_level': 0.3, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Tactical Brass (Fire Emblem / ファイアーエムブレム)": { # Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of Fire Emblem's tactical themes. 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, 'vibrato_rate': 3.5, 'vibrato_depth': 5, 'smooth_notes_level': 0.95, 'continuous_vibrato_level': 0.9, 'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Mecha & Tactics Brass (Super Robot Wars / スーパーロボット大戦)": { # Description: A powerful, sustained sawtooth emulating the bold, heroic synth-brass of strategy and mecha anime themes. 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, 'vibrato_rate': 3.5, 'vibrato_depth': 5, 'smooth_notes_level': 0.95, 'continuous_vibrato_level': 0.9, 'bass_boost_level': 0.5, 'noise_level': 0.1, 'distortion_level': 0.15, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Dark/Boss Atmosphere (Shin Megami Tensei / 真・女神転生)": { # Description: An aggressive sawtooth, inspired by the dark, rock-infused themes of SMT. 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.35, 'vibrato_rate': 7.0, 'vibrato_depth': 12, 'smooth_notes_level': 0.1, 'continuous_vibrato_level': 0.0, 'bass_boost_level': 0.4, 'noise_level': 0.15, 'distortion_level': 0.25, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, # --- Vocal Synthesis --- "8-Bit Vocal Lead": { # Description: A soft, sustained triangle wave with gentle vibrato to mimic a singing voice. 'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.8, 'vibrato_rate': 5.5, 'vibrato_depth': 4, # Mapped from the suggested 0.15 range 'bass_boost_level': 0.1, 'smooth_notes_level': 0.85, 'continuous_vibrato_level': 0.9, 'noise_level': 0.02, 'distortion_level': 0.0, 'fm_modulation_depth': 0.05, 'fm_modulation_rate': 20 }, "8-Bit Male Vocal": { # Description: A deeper, fuller triangle wave with more bass and slower vibrato for a masculine feel. 'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 1.0, 'vibrato_rate': 5.0, 'vibrato_depth': 3, # Mapped from the suggested 0.12 range 'bass_boost_level': 0.3, 'smooth_notes_level': 0.9, 'continuous_vibrato_level': 0.85, 'noise_level': 0.015, 'distortion_level': 0.0, 'fm_modulation_depth': 0.08, 'fm_modulation_rate': 25 }, "8-Bit Female Vocal": { # Description: A brighter, lighter triangle wave with faster vibrato and less bass for a feminine feel. 
'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.7, 'vibrato_rate': 6.0, 'vibrato_depth': 5, # Mapped from the suggested 0.18 range 'bass_boost_level': 0.05, 'smooth_notes_level': 0.85, 'continuous_vibrato_level': 0.92, 'noise_level': 0.025, 'distortion_level': 0.0, 'fm_modulation_depth': 0.04, 'fm_modulation_rate': 30 }, "Lo-Fi Vocal": { # Description: A gritty, noisy square wave with a short decay to simulate a low-resolution vocal sample. 'waveform_type': 'Square', 'pulse_width': 0.48, 'envelope_type': 'Plucky (AD Envelope)', # "Short" implies a plucky, not sustained, envelope 'decay_time_s': 0.4, 'vibrato_rate': 4.8, 'vibrato_depth': 2, # Mapped from the suggested 0.10 range 'bass_boost_level': 0.1, 'smooth_notes_level': 0.65, 'continuous_vibrato_level': 0.6, 'noise_level': 0.05, 'distortion_level': 0.05, 'fm_modulation_depth': 0.02, 'fm_modulation_rate': 20 }, # --- Sound FX & Experimental --- "Sci-Fi Energy Field": { # Description: (SFX) High-speed vibrato and noise create a constant, shimmering hum suitable for energy shields or force fields. 'waveform_type': 'Triangle', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.4, 'vibrato_rate': 10.0, 'vibrato_depth': 3, 'smooth_notes_level': 0.85, 'continuous_vibrato_level': 0.9, 'bass_boost_level': 0.1, 'noise_level': 0.1, 'distortion_level': 0.0, 'fm_modulation_depth': 0.05, 'fm_modulation_rate': 50 }, "Industrial Alarm": { # Description: (SFX) Extreme vibrato rate on a sawtooth wave produces a harsh, metallic, dissonant alarm sound. 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, 'vibrato_rate': 15.0, 'vibrato_depth': 8, 'smooth_notes_level': 0.0, 'continuous_vibrato_level': 0.0, 'bass_boost_level': 0.3, 'noise_level': 0.2, 'distortion_level': 0.3, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Laser Charge-Up": { # Description: (SFX) Extreme vibrato depth creates a dramatic, rising pitch effect, perfect for sci-fi weapon sounds. 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.3, 'vibrato_rate': 4.0, 'vibrato_depth': 25, 'smooth_notes_level': 0.9, 'continuous_vibrato_level': 0.95, 'bass_boost_level': 0.2, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, "Unstable Machine Core": { # Description: (SFX) Maximum depth and distortion create a chaotic, atonal noise, simulating a machine on the verge of exploding. 'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Sustained (Full Decay)', 'decay_time_s': 0.5, 'vibrato_rate': 1.0, 'vibrato_depth': 50, 'smooth_notes_level': 0.0, 'continuous_vibrato_level': 0.9, 'bass_boost_level': 0.5, 'noise_level': 0.3, 'distortion_level': 0.4, 'fm_modulation_depth': 0.5, 'fm_modulation_rate': 10 }, "Hardcore Gabber Kick": { # Description: (Experimental) Maximum bass boost and distortion create an overwhelmingly powerful, clipped kick drum sound. 
'waveform_type': 'Sawtooth', 'pulse_width': 0.5, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.1, 'vibrato_rate': 0, 'vibrato_depth': 0, 'smooth_notes_level': 0.0, 'continuous_vibrato_level': 0.0, 'bass_boost_level': 0.8, 'noise_level': 0.2, 'distortion_level': 0.5, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, # --- Utility & Starting Points --- "Generic Chiptune Loop": { # Description: A well-balanced, pleasant square wave lead that serves as a great starting point for custom sounds. 'waveform_type': 'Square', 'pulse_width': 0.25, 'envelope_type': 'Plucky (AD Envelope)', 'decay_time_s': 0.2, 'vibrato_rate': 5.5, 'vibrato_depth': 4, 'smooth_notes_level': 0.9, 'continuous_vibrato_level': 0.85, 'bass_boost_level': 0.25, 'noise_level': 0.0, 'distortion_level': 0.0, 'fm_modulation_depth': 0.0, 'fm_modulation_rate': 0.0 }, } # --- Data structure for basic_pitch transcription presets --- BASIC_PITCH_PRESETS = { # --- General & All-Purpose --- "Default (Balanced)": { 'description': "A good all-around starting point for most music types.", 'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 128, 'min_freq': 60, 'max_freq': 4000, 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False }, "Anime / J-Pop": { 'description': "For tracks with clear melodies and pop/rock arrangements.", 'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 150, 'min_freq': 40, 'max_freq': 2500, 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True }, # --- Specific Instruments --- "Solo Vocals": { 'description': "Optimized for a single singing voice. Sensitive to nuances.", 'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 100, 'min_freq': 80, 'max_freq': 1200, 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True }, "Solo Piano": { 'description': "For solo piano with a wide dynamic and frequency range.", 'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 120, 'min_freq': 27, 'max_freq': 4200, 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True }, "Acoustic Guitar": { 'description': "Balanced for picked or strummed acoustic guitar.", 'onset_thresh': 0.5, 'frame_thresh': 0.3, 'min_note_len': 90, 'min_freq': 80, 'max_freq': 2500, 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False }, "Bass Guitar": { 'description': "Isolates and transcribes only the low frequencies of a bassline.", 'onset_thresh': 0.4, 'frame_thresh': 0.3, 'min_note_len': 100, 'min_freq': 30, 'max_freq': 400, 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': False }, "Percussion / Drums": { 'description': "For drums and rhythmic elements. 
Catches fast, sharp hits.", 'onset_thresh': 0.7, 'frame_thresh': 0.6, 'min_note_len': 30, 'min_freq': 40, 'max_freq': 10000, 'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': False }, # --- Complex Genres --- "Rock / Metal": { 'description': "Higher thresholds for distorted guitars, bass, and drums in a dense mix.", 'onset_thresh': 0.6, 'frame_thresh': 0.4, 'min_note_len': 100, 'min_freq': 50, 'max_freq': 3000, 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True }, "Jazz (Multi-instrument)": { 'description': "High thresholds to separate notes in complex, improvisational passages.", 'onset_thresh': 0.7, 'frame_thresh': 0.5, 'min_note_len': 150, 'min_freq': 55, 'max_freq': 2000, 'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': True }, "Classical (Orchestral)": { 'description': "Longer note length to focus on sustained notes and filter out performance noise.", 'onset_thresh': 0.5, 'frame_thresh': 0.4, 'min_note_len': 200, 'min_freq': 32, 'max_freq': 4200, 'infer_onsets_bool': True, 'melodia_trick_bool': True, 'multiple_bends_bool': True }, "Electronic / Synth": { 'description': "Low thresholds and short note length for sharp, synthetic sounds.", 'onset_thresh': 0.3, 'frame_thresh': 0.2, 'min_note_len': 50, 'min_freq': 20, 'max_freq': 8000, 'infer_onsets_bool': True, 'melodia_trick_bool': False, 'multiple_bends_bool': False } } # --- UI visibility logic now controls three components --- def update_vocal_ui_visibility(separate_vocals, remerge_audio): """Shows or hides the separation-related UI controls based on selections.""" is_visible = gr.update(visible=separate_vocals) # The "Transcribe Both" checkbox is only visible if separation AND re-merging are active transcribe_both_visible = gr.update(visible=(separate_vocals and remerge_audio)) return is_visible, is_visible, transcribe_both_visible def update_ui_visibility(transcription_method, soundfont_choice): """ Dynamically updates the visibility of UI components based on user selections. """ is_general = (transcription_method == "General Purpose") is_8bit = (soundfont_choice == SYNTH_8_BIT_LABEL) return { general_transcription_settings: gr.update(visible=is_general), synth_8bit_settings: gr.update(visible=is_8bit), } # --- Controller function to apply basic_pitch presets to the UI --- def apply_basic_pitch_preset(preset_name): if preset_name not in BASIC_PITCH_PRESETS: # If "Custom" is selected or name is invalid, don't change anything return {comp: gr.update() for comp in basic_pitch_ui_components} settings = BASIC_PITCH_PRESETS[preset_name] # Return a dictionary that maps each UI component to its new value return { onset_threshold: gr.update(value=settings['onset_thresh']), frame_threshold: gr.update(value=settings['frame_thresh']), minimum_note_length: gr.update(value=settings['min_note_len']), minimum_frequency: gr.update(value=settings['min_freq']), maximum_frequency: gr.update(value=settings['max_freq']), infer_onsets: gr.update(value=settings['infer_onsets_bool']), melodia_trick: gr.update(value=settings['melodia_trick_bool']), multiple_pitch_bends: gr.update(value=settings['multiple_bends_bool']) } # --- Function to apply 8-bit synthesizer presets --- # --- This function must be defined before the UI components that use it --- def apply_8bit_preset(preset_name): """ Takes the name of a preset and returns a dictionary of gr.update objects to set the values of all 13 of the 8-bit synthesizer's UI components. 
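    A typical return value (the shown names are illustrative; the real keys are the
    Gradio component objects collected in `s8bit_ui_components`):

        apply_8bit_preset("Generic Chiptune Loop")
        # -> {s8bit_waveform_type: gr.update(value='Square'),
        #     s8bit_pulse_width:   gr.update(value=0.25),
        #     ... one gr.update per synthesizer control ...}

    Returning a dict keyed by component lets Gradio route each update to the matching
    widget without depending on positional output order.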
""" # --- Use a list of keys for consistent updates --- param_keys = [ 'waveform_type', 'pulse_width', 'envelope_type', 'decay_time_s', 'vibrato_rate', 'vibrato_depth', 'bass_boost_level', 'smooth_notes_level', 'continuous_vibrato_level', 'noise_level', 'distortion_level', 'fm_modulation_depth', 'fm_modulation_rate' ] # If the user selects "Custom" or the preset is not found, do not change the values. if preset_name == "Custom" or preset_name not in S8BIT_PRESETS: # When switching to custom, don't change any values, just return empty updates. return {comp: gr.update() for comp in s8bit_ui_components} # Get the settings dictionary for the chosen preset. settings = S8BIT_PRESETS[preset_name] # Create a dictionary mapping UI components to their new values from the preset. update_dict = {} for i, key in enumerate(param_keys): component = s8bit_ui_components[i] value = settings.get(key) if value is not None: update_dict[component] = gr.update(value=value) else: update_dict[component] = gr.update() return update_dict app = gr.Blocks(theme=gr.themes.Base()) with app: gr.Markdown("