import gradio as gr
from keras.models import load_model
from tensorflow.keras.utils import img_to_array
from tensorflow.keras.utils import load_img
from numpy import expand_dims
from PIL import Image
import librosa
import numpy as np
import soundfile as sf
import os
import random

# Load the pretrained Pix2Pix generator once at startup (path must exist).
# compile=False: the model is used for inference only, so no optimizer/loss needed.
model = load_model('./model_022600.h5', compile=False)


def shift_frequencies(spectrogram, shift):
    """Circularly shift the spectrogram along the frequency axis.

    Args:
        spectrogram: 2-D array, rows = frequency bins, columns = time frames.
        shift: number of bins to roll; may be negative. Bins rolled past one
            edge wrap around to the other (np.roll semantics).

    Returns:
        A new array of the same shape with rows rotated by ``shift``.
    """
    return np.roll(spectrogram, shift, axis=0)


def apply_filter(spectrogram, low_cut, high_cut):
    """Zero out frequency bins below ``low_cut`` and at/above ``high_cut``.

    Acts as a crude band-pass filter on the spectrogram rows. Out-of-range
    cut indices are harmless: NumPy slicing simply becomes a no-op.

    Returns:
        A filtered copy; the input array is not modified.
    """
    filtered = np.copy(spectrogram)
    filtered[:low_cut, :] = 0   # attenuate low frequencies
    filtered[high_cut:, :] = 0  # attenuate high frequencies
    return filtered


def add_harmonics(spectrogram, harmonic_shift):
    """Overlay a frequency-shifted, half-strength copy of the spectrogram.

    NOTE: clips the result to [0, 1] — callers must supply a spectrogram
    normalized to that range or the clipping destroys the dynamics.
    """
    # 0.5 factor weakens the added "harmonic" layer relative to the original.
    harmonics = np.roll(spectrogram, harmonic_shift, axis=0) * 0.5
    return np.clip(spectrogram + harmonics, 0, 1)


def modulate_amplitude(spectrogram, factor):
    """Scale all magnitudes by ``factor``, clipped back into [0, 1].

    As with add_harmonics, the input is assumed to be normalized to [0, 1].
    """
    return np.clip(spectrogram * factor, 0, 1)


def modify_spectrogram(spectrogram):
    """Randomly apply a subset of transformations to a [0, 1] spectrogram.

    Each of the four transforms (frequency shift, band-pass filter, harmonic
    overlay, amplitude modulation) is independently enabled with probability
    0.5, with parameters drawn from fixed ranges. Prints which transforms
    were applied (useful as a console log for the demo app).

    Returns:
        The transformed spectrogram (possibly unchanged if no transform won
        its coin flip).
    """
    apply_shift = random.choice([True, False])
    apply_filtering = random.choice([True, False])
    apply_harmonics = random.choice([True, False])
    apply_amplitude_modulation = random.choice([True, False])

    if apply_shift:
        shift_value = random.randint(-15, 15)
        print(f"Applying frequency shift: {shift_value}")
        spectrogram = shift_frequencies(spectrogram, shift=shift_value)

    if apply_filtering:
        low_cut = random.randint(10, 50)
        # NOTE(review): high_cut may exceed the 512 mel bins used below, in
        # which case the high-frequency attenuation is a no-op by design.
        high_cut = random.randint(300, 600)
        print(f"Applying filter: low_cut={low_cut}, high_cut={high_cut}")
        spectrogram = apply_filter(spectrogram, low_cut=low_cut, high_cut=high_cut)

    if apply_harmonics:
        harmonic_shift = random.randint(2, 10)
        print(f"Applying harmonic shift: {harmonic_shift}")
        spectrogram = add_harmonics(spectrogram, harmonic_shift=harmonic_shift)

    if apply_amplitude_modulation:
        factor = random.uniform(0.8, 2.0)
        print(f"Applying amplitude modulation: factor={factor}")
        spectrogram = modulate_amplitude(spectrogram, factor=factor)

    return spectrogram


def process_image(input_image):
    """Run the Pix2Pix model on ``input_image`` and synthesize audio from it.

    Pipeline: resize/normalize the image to the model's 256x256 input,
    generate a spectrogram-like image, resize it to the full spectrogram
    resolution, randomly mutate it, then invert it to a waveform with
    librosa's mel inversion.

    Args:
        input_image: a PIL.Image (Gradio supplies this for type="pil").

    Returns:
        Path to the written WAV file (str), as expected by gr.Audio(type="filepath").
    """

    def load_image(image, size=(256, 256)):
        # Scale pixels from [0, 255] to [-1, 1], the Pix2Pix input convention
        # implied by the inverse rescale applied to the model output below.
        image = image.resize(size)
        pixels = img_to_array(image)
        pixels = (pixels - 127.5) / 127.5
        pixels = expand_dims(pixels, 0)  # add batch dimension
        return pixels

    src_image = load_image(input_image)

    # Generate output and rescale from the generator's [-1, 1] to [0, 1].
    gen_image = model.predict(src_image)
    gen_image = (gen_image + 1) / 2.0

    # PIL size is (width, height), so the resulting array is 512 rows
    # (frequency bins) x 1293 columns (time frames); 'F' = 32-bit float mode.
    orig_size = (1293, 512)
    gen_image_resized = (
        Image.fromarray((gen_image[0] * 255).astype('uint8'))
        .resize(orig_size)
        .convert('F')
    )

    # BUG FIX: the 'F'-mode image holds values in [0, 255], but the
    # transform helpers (add_harmonics, modulate_amplitude) clip to [0, 1]
    # and would flatten the whole spectrogram to ~1.0. Normalize first.
    img = np.array(gen_image_resized) / 255.0

    img = modify_spectrogram(img)

    # Invert the (assumed mel-power) spectrogram back to a waveform.
    # NOTE(review): mel_to_audio expects a mel power spectrogram; the model
    # output is treated as one here — confirm against the training data.
    wav = librosa.feature.inverse.mel_to_audio(
        img, sr=44100, n_fft=2048, hop_length=512
    )

    # Fixed output path: fine for a single-user demo, but concurrent
    # requests would overwrite each other's file.
    audio_file = "generated_audio.wav"
    sf.write(audio_file, wav, samplerate=44100)

    return audio_file


# Gradio UI: image in, audio file out.
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=gr.Audio(type="filepath"),
    title="Image to Audio Generator",
    description=(
        "Upload an image (preferably a spectrogram), and get an audio file "
        "generated using Pix2Pix."
    ),
)

# Launch the interface
interface.launch()