# Source: Hugging Face Space "musdfakoc" — app.py (revision f412cfa, 4.4 kB)
import os
import random
import tempfile

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
from keras.models import load_model
from numpy import expand_dims
from PIL import Image
from tensorflow.keras.utils import img_to_array
from tensorflow.keras.utils import load_img
# Load the pretrained Pix2Pix generator weights (Keras HDF5 checkpoint).
# compile=False: inference only, so the training configuration (optimizer,
# losses) is not deserialized — faster load and avoids custom-object errors.
model = load_model('./model_022600.h5', compile=False)
def shift_frequencies(spectrogram, shift):
    """Circularly shift the spectrogram along the frequency axis (axis 0).

    Rows rolled off one end reappear at the other, so no energy is lost.
    """
    shifted = np.roll(spectrogram, shift, axis=0)
    return shifted
def apply_filter(spectrogram, low_cut, high_cut):
    """Band-pass the spectrogram: zero all frequency rows below ``low_cut``
    and from ``high_cut`` upward. Returns a new array; the input is untouched.
    """
    band_passed = spectrogram.copy()
    band_passed[:low_cut] = 0   # silence the low end
    band_passed[high_cut:] = 0  # silence the high end
    return band_passed
def add_harmonics(spectrogram, harmonic_shift):
    """Overlay a half-strength, frequency-shifted copy of the spectrogram
    onto itself and clamp the result to [0, 1].
    """
    overtone = 0.5 * np.roll(spectrogram, harmonic_shift, axis=0)
    return np.clip(spectrogram + overtone, 0, 1)
def modulate_amplitude(spectrogram, factor):
    """Scale the spectrogram by ``factor`` and clamp to [0, 1].

    factor > 1 brightens (saturating at 1), factor < 1 attenuates.
    """
    scaled = spectrogram * factor
    return np.clip(scaled, 0, 1)
def modify_spectrogram(spectrogram):
    """Apply a random subset of spectral transformations with random parameters.

    Each of the four effects (frequency shift, band-pass filter, harmonics,
    amplitude modulation) is independently enabled with probability 1/2;
    enabled effects draw their parameters at random and are applied in a
    fixed order. Returns the (possibly) transformed spectrogram.
    """
    # Coin-flip each effect first, then draw parameters only for enabled ones.
    do_shift = random.choice([True, False])
    do_filter = random.choice([True, False])
    do_harmonics = random.choice([True, False])
    do_modulate = random.choice([True, False])

    if do_shift:
        amount = random.randint(-15, 15)
        print(f"Applying frequency shift: {amount}")
        spectrogram = shift_frequencies(spectrogram, shift=amount)

    if do_filter:
        lo = random.randint(10, 50)
        hi = random.randint(300, 600)
        print(f"Applying filter: low_cut={lo}, high_cut={hi}")
        spectrogram = apply_filter(spectrogram, low_cut=lo, high_cut=hi)

    if do_harmonics:
        step = random.randint(2, 10)
        print(f"Applying harmonic shift: {step}")
        spectrogram = add_harmonics(spectrogram, harmonic_shift=step)

    if do_modulate:
        gain = random.uniform(0.8, 2.0)
        print(f"Applying amplitude modulation: factor={gain}")
        spectrogram = modulate_amplitude(spectrogram, factor=gain)

    return spectrogram
# Function to process the input image and convert to audio
def process_image(input_image):
    """Turn an uploaded image into audio via the Pix2Pix generator.

    Pipeline: preprocess to the model's 256x256 [-1, 1] input -> generate ->
    rescale/resize to the original spectrogram dimensions -> apply random
    spectral transformations -> invert the mel spectrogram to a waveform.

    Parameters
    ----------
    input_image : PIL.Image.Image
        Image supplied by the Gradio widget (any mode; converted to RGB).

    Returns
    -------
    str
        Path to a freshly written WAV file (44.1 kHz) for Gradio to serve.
    """
    def _preprocess(image, size=(256, 256)):
        # Uploads may be RGBA or grayscale; the generator expects 3 channels.
        image = image.convert("RGB").resize(size)
        pixels = img_to_array(image)
        pixels = (pixels - 127.5) / 127.5  # scale to [-1, 1] (pix2pix convention)
        return expand_dims(pixels, 0)      # add batch dimension

    src_image = _preprocess(input_image)

    # Generate output using the Pix2Pix model; tanh output is in [-1, 1].
    gen_image = model.predict(src_image)
    gen_image = (gen_image + 1) / 2.0  # scale to [0, 1]

    # Resize to the original spectrogram dimensions: (width, height) for PIL,
    # giving a (512, 1293) array (mel bins x time frames).
    orig_size = (1293, 512)
    gen_image_resized = (
        Image.fromarray((gen_image[0] * 255).astype('uint8'))
        .resize(orig_size)
        .convert('F')  # collapse to a single-channel float image
    )
    img = np.array(gen_image_resized)

    # NOTE(review): after convert('F') values are in [0, 255], but
    # modify_spectrogram's harmonic/amplitude branches clip to [0, 1] —
    # confirm the intended value scale here.
    img = modify_spectrogram(img)

    # Invert the mel spectrogram back to a time-domain waveform.
    wav = librosa.feature.inverse.mel_to_audio(
        img, sr=44100, n_fft=2048, hop_length=512
    )

    # Write to a unique temp file so concurrent requests don't clobber
    # each other (the previous fixed "generated_audio.wav" did).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_file = tmp.name
    sf.write(audio_file, wav, samplerate=44100)
    return audio_file
# Wire up the Gradio UI: an image upload in, a playable audio file out.
image_input = gr.Image(type="pil")
audio_output = gr.Audio(type="filepath")

interface = gr.Interface(
    fn=process_image,
    inputs=image_input,
    outputs=audio_output,
    title="Image to Audio Generator",
    description="Upload an image (preferably a spectrogram), and get an audio file generated using Pix2Pix.",
)

# Start the web server.
interface.launch()