File size: 4,398 Bytes
d076b8a
ebb57ae
 
 
 
d076b8a
ebb57ae
00093e0
ebb57ae
 
f412cfa
d076b8a
ebb57ae
 
 
f412cfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebb57ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f412cfa
 
 
ebb57ae
 
 
 
 
 
 
 
 
 
 
 
 
 
562e050
ebb57ae
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import random
import tempfile

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
from keras.models import load_model
from numpy import expand_dims
from PIL import Image
from tensorflow.keras.utils import img_to_array
from tensorflow.keras.utils import load_img

# Load your Pix2Pix model (make sure the path is correct)
# compile=False skips restoring the training configuration (optimizer, loss),
# which is unnecessary for inference-only use and avoids errors when custom
# training objects are not available.
model = load_model('./model_022600.h5', compile=False)


# Circularly shift the spectrogram along the frequency axis.
def shift_frequencies(spectrogram, shift):
    """Roll the spectrogram rows by `shift` positions.

    Rows pushed past one edge wrap around to the other (np.roll semantics),
    so no data is lost — frequencies are only relocated.
    """
    shifted = np.roll(spectrogram, shift, axis=0)
    return shifted

# Band-pass style filter applied directly on the spectrogram rows.
def apply_filter(spectrogram, low_cut, high_cut):
    """Zero every frequency row outside the [low_cut, high_cut) band.

    Operates on a copy; the input array is left unmodified.
    """
    band = np.copy(spectrogram)
    band[:low_cut, :] = 0   # silence frequencies below the band
    band[high_cut:, :] = 0  # silence frequencies above the band
    return band

# Overlay a frequency-shifted, half-strength copy of the spectrogram.
def add_harmonics(spectrogram, harmonic_shift):
    """Add a weakened (50%) copy of the spectrogram, rolled by
    `harmonic_shift` rows, then clip the sum into [0, 1]."""
    overtone = np.roll(spectrogram, harmonic_shift, axis=0) * 0.5
    combined = spectrogram + overtone
    return np.clip(combined, 0, 1)

# Uniform gain applied to every bin of the spectrogram.
def modulate_amplitude(spectrogram, factor):
    """Scale the spectrogram by `factor` and clip the result to [0, 1],
    so amplification saturates instead of overflowing the valid range."""
    scaled = spectrogram * factor
    return np.clip(scaled, 0, 1)

# Function to randomly decide which transformations to apply and with what parameters
def modify_spectrogram(spectrogram):
    """Apply a random subset of spectrogram transformations.

    Each of the four transformations (frequency shift, band filter,
    harmonics, amplitude modulation) is independently enabled with
    probability 1/2, and its parameters are drawn at random. The choice
    of applied steps is printed for traceability.
    """
    # Decide up front which steps run; draw order matters for seeded runs.
    do_shift = random.choice([True, False])
    do_filter = random.choice([True, False])
    do_harmonics = random.choice([True, False])
    do_modulate = random.choice([True, False])

    if do_shift:
        # Shift between -15 and 15 frequency rows.
        shift_amount = random.randint(-15, 15)
        print(f"Applying frequency shift: {shift_amount}")
        spectrogram = shift_frequencies(spectrogram, shift=shift_amount)

    if do_filter:
        # Random band edges: low in [10, 50], high in [300, 600].
        cut_low = random.randint(10, 50)
        cut_high = random.randint(300, 600)
        print(f"Applying filter: low_cut={cut_low}, high_cut={cut_high}")
        spectrogram = apply_filter(spectrogram, low_cut=cut_low, high_cut=cut_high)

    if do_harmonics:
        # Harmonic copy rolled by 2..10 rows.
        overtone_shift = random.randint(2, 10)
        print(f"Applying harmonic shift: {overtone_shift}")
        spectrogram = add_harmonics(spectrogram, harmonic_shift=overtone_shift)

    if do_modulate:
        # Gain between 0.8 (attenuate) and 2.0 (amplify, saturating at 1).
        gain = random.uniform(0.8, 2.0)
        print(f"Applying amplitude modulation: factor={gain}")
        spectrogram = modulate_amplitude(spectrogram, factor=gain)

    return spectrogram

# Function to process the input image and convert to audio
def process_image(input_image):
    """Turn an input PIL image into a generated audio file via Pix2Pix.

    Pipeline: resize to the model's 256x256 input, run the generator,
    upscale the output to full spectrogram size, randomly perturb it,
    invert the mel spectrogram to a waveform, and write a .wav file.

    Parameters:
        input_image: PIL.Image supplied by the Gradio image input.

    Returns:
        str: path of the written .wav file (consumed by gr.Audio).
    """

    def _preprocess(image, size=(256, 256)):
        # Resize, scale pixels from [0, 255] to [-1, 1] (the range the
        # Pix2Pix generator was trained on), and add a batch dimension.
        pixels = img_to_array(image.resize(size))
        pixels = (pixels - 127.5) / 127.5
        return expand_dims(pixels, 0)

    src_image = _preprocess(input_image)

    # Generate, then rescale the output from [-1, 1] back to [0, 1].
    gen_image = model.predict(src_image)
    gen_image = (gen_image + 1) / 2.0

    # Resize to the original spectrogram size. PIL sizes are (width, height),
    # so the resulting array is 512 frequency bins x 1293 time frames.
    orig_size = (1293, 512)
    gen_image_resized = (
        Image.fromarray((gen_image[0] * 255).astype('uint8'))
        .resize(orig_size)
        .convert('F')
    )

    # NOTE(review): after convert('F') the array holds values in the 0-255
    # range, while modify_spectrogram clips intermediate results to [0, 1]
    # — confirm the intended value range before mel inversion.
    img = np.array(gen_image_resized)
    img = modify_spectrogram(img)

    # Invert the mel spectrogram back into a time-domain waveform.
    wav = librosa.feature.inverse.mel_to_audio(img, sr=44100, n_fft=2048, hop_length=512)

    # Write to a unique temp file: the previous fixed filename
    # ("generated_audio.wav" in CWD) was shared by all sessions, so
    # concurrent Gradio requests overwrote each other's output.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_file = tmp.name
    sf.write(audio_file, wav, samplerate=44100)

    return audio_file

# Create a Gradio interface
# Single image-in / audio-out form. gr.Audio(type="filepath") expects
# process_image to return the path of an audio file on disk, which it does.
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),  # Input is an image
    outputs=gr.Audio(type="filepath"),  # Output is an audio file
    title="Image to Audio Generator",  # App title
    description="Upload an image (preferably a spectrogram), and get an audio file generated using Pix2Pix.",
)

# Launch the interface
# NOTE(review): launch() runs at import time; wrap in an
# `if __name__ == "__main__":` guard if this module is ever imported.
interface.launch()