File size: 5,688 Bytes
d076b8a
ebb57ae
 
 
d076b8a
ebb57ae
00093e0
ebb57ae
 
f412cfa
9faab60
 
adf8868
d076b8a
ebb57ae
 
 
f412cfa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73ba865
f412cfa
 
 
 
 
 
 
73ba865
f412cfa
 
 
73ba865
 
f412cfa
 
 
73ba865
f412cfa
 
 
73ba865
f412cfa
 
 
 
73ba865
8cdbc50
9faab60
 
 
 
adf8868
8cdbc50
 
9faab60
 
 
 
7ce9286
 
 
 
 
 
 
73ba865
ebb57ae
adf8868
 
8cdbc50
ebb57ae
 
 
 
 
 
 
7ce9286
 
 
ebb57ae
 
 
 
 
 
 
 
 
 
 
 
 
f412cfa
 
 
ebb57ae
adf8868
8cdbc50
9faab60
ebb57ae
 
 
adf8868
8cdbc50
 
ebb57ae
7ce9286
ebb57ae
73ba865
 
7ce9286
73ba865
 
 
7ce9286
73ba865
 
7ce9286
73ba865
7ce9286
73ba865
 
ebb57ae
73ba865
ebb57ae
7ce9286
73ba865
7ce9286
ebb57ae
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import gradio as gr
from keras.models import load_model
from tensorflow.keras.utils import img_to_array
from numpy import expand_dims
from PIL import Image
import librosa
import numpy as np
import soundfile as sf
import os
import random
import tempfile
import matplotlib.pyplot as plt
import time  # To generate unique filenames

# Load the Pix2Pix model once, at import time, so every call to process_image
# reuses the same instance. The path is relative to the current working
# directory (make sure it is correct for your deployment). compile=False is
# passed; in this file the model is only used through model.predict().
model = load_model('./model_022600.h5', compile=False)

# Function to shift frequencies
def shift_frequencies(spectrogram, shift):
    """Circularly shift the rows of ``spectrogram`` by ``shift`` positions.

    The shift is applied along axis 0 with ``np.roll``, so rows pushed past
    one end re-enter at the other end; nothing is zero-filled.
    """
    rolled = np.roll(spectrogram, shift, axis=0)
    return rolled

# Function to apply a frequency filter
def apply_filter(spectrogram, low_cut, high_cut):
    """Return a copy of ``spectrogram`` with the rows outside a band set to 0.

    Rows ``[:low_cut]`` and ``[high_cut:]`` along axis 0 are zeroed (ordinary
    slice semantics); the rows in between are kept. The input array itself
    is left untouched.
    """
    band_limited = np.array(spectrogram, copy=True)
    # The two assignments are independent, so their order does not matter.
    band_limited[high_cut:, :] = 0  # Silence everything from high_cut upward
    band_limited[:low_cut, :] = 0  # Silence everything below low_cut
    return band_limited

# Function to add harmonics
def add_harmonics(spectrogram, harmonic_shift):
    """Mix a half-strength, row-shifted copy of ``spectrogram`` into itself.

    The copy is rolled by ``harmonic_shift`` rows along axis 0 (wrapping
    around), scaled by 0.5, added to the original, and the sum is clipped
    to the range [0, 1].
    """
    overtone = 0.5 * np.roll(spectrogram, harmonic_shift, axis=0)
    mixed = spectrogram + overtone
    return np.clip(mixed, 0, 1)

# Function to modulate the amplitude
def modulate_amplitude(spectrogram, factor):
    """Scale ``spectrogram`` by ``factor`` and clip the result to [0, 1].

    A factor above 1 brightens (amplifies) the values, a factor below 1
    dims (attenuates) them.
    """
    scaled = spectrogram * factor
    return np.clip(scaled, 0, 1)

# Function to randomly apply transformations
def modify_spectrogram(spectrogram):
    """Apply a random subset of the spectrogram transformations.

    Four independent coin flips decide whether to apply, in this order:
    a frequency shift, a band filter, added harmonics, and amplitude
    modulation. The parameters of each selected step are drawn from the
    ``random`` module as well. Returns the (possibly) transformed
    spectrogram.
    """
    coin = (True, False)
    # All four decisions are drawn up front, before any parameter is drawn.
    do_shift, do_filter, do_harmonics, do_gain = (
        random.choice(coin) for _ in range(4)
    )

    if do_shift:
        offset = random.randint(-15, 15)
        spectrogram = shift_frequencies(spectrogram, shift=offset)

    if do_filter:
        lower = random.randint(10, 50)
        upper = random.randint(300, 600)
        spectrogram = apply_filter(spectrogram, low_cut=lower, high_cut=upper)

    if do_harmonics:
        rows = random.randint(2, 10)
        spectrogram = add_harmonics(spectrogram, harmonic_shift=rows)

    if do_gain:
        gain = random.uniform(0.8, 2.0)
        spectrogram = modulate_amplitude(spectrogram, factor=gain)

    return spectrogram

# Save the modified spectrogram image for display
def save_spectrogram_image(spectrogram, name):
    """Render ``spectrogram`` as a grayscale PNG and return the file path.

    Args:
        spectrogram: 2-D array-like accepted by ``imshow``; row 0 is drawn
            at the bottom (``origin='lower'``).
        name: Unique prefix for the output file.

    Returns:
        The path ``"<name>_spectrogram.png"`` that was written.
    """
    temp_image_path = f"{name}_spectrogram.png"

    # Work on an explicit figure handle rather than pyplot's implicit
    # "current figure": the figure is then always the one this call created,
    # and it is closed even when drawing or saving raises.
    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        ax.imshow(spectrogram, aspect='auto', origin='lower', cmap='gray')
        ax.axis('off')
        fig.savefig(temp_image_path, bbox_inches='tight', pad_inches=0)
    finally:
        plt.close(fig)

    return temp_image_path

# Save the uploaded image with the same timestamp
def save_uploaded_image(input_image, name):
    """Save ``input_image`` as ``<name>_uploaded_image.png`` and return that path.

    ``input_image`` only needs a ``save(path)`` method; ``name`` is the
    unique prefix shared with the other files produced for the same request.
    """
    destination = "{}_uploaded_image.png".format(name)
    input_image.save(destination)
    return destination

# Process the input image and convert to audio
def process_image(input_image):
    """Turn an uploaded image into a spectrogram image and an audio file.

    The image is saved, resized to 256x256, run through the Pix2Pix model,
    resized to 1293x512, randomly modified, rendered to a PNG, and inverted
    to audio with librosa. All three files share one unique name prefix and
    are written to the current working directory.

    Args:
        input_image: PIL image to process.

    Returns:
        Tuple ``(uploaded_image_path, spectrogram_image_path, audio_file_path)``.

    Raises:
        ValueError: If ``input_image`` is ``None`` (nothing was uploaded).
    """
    import uuid  # local import: only needed to build the per-call file prefix

    if input_image is None:
        raise ValueError("No image was provided; please upload an image first.")

    # A timestamp with one-second resolution is not unique when two requests
    # arrive in the same second, so a random suffix is appended to keep
    # concurrent calls from overwriting each other's files.
    image_name = f"image_{int(time.time())}_{uuid.uuid4().hex}"

    def load_image(image, size=(256, 256)):
        # Resize, scale pixel values from [0, 255] to [-1, 1], and add a
        # leading batch dimension for model.predict().
        image = image.resize(size)
        pixels = img_to_array(image)
        pixels = (pixels - 127.5) / 127.5
        pixels = expand_dims(pixels, 0)
        return pixels

    # Save the uploaded image under the unique name
    uploaded_image_path = save_uploaded_image(input_image, image_name)

    # Preprocess the input
    src_image = load_image(input_image)

    # Generate output using the Pix2Pix model
    gen_image = model.predict(src_image)
    gen_image = (gen_image + 1) / 2.0  # scale to [0, 1]

    # Resize the generated image to the spectrogram size. PIL sizes are
    # (width, height), so the resulting array has 512 rows and 1293 columns.
    orig_size = (1293, 512)
    gen_image_resized = Image.fromarray((gen_image[0] * 255).astype('uint8')).resize(orig_size).convert('F')

    # Convert the image to a numpy array (spectrogram)
    # NOTE(review): the image was built from uint8 data scaled by 255 and
    # convert('F') does not rescale, while the helpers used by
    # modify_spectrogram clip to [0, 1] - confirm whether the array should be
    # divided by 255 here.
    img = np.array(gen_image_resized)

    # Modify the spectrogram randomly
    img = modify_spectrogram(img)

    # Save the modified spectrogram as an image, using the unique name
    spectrogram_image_path = save_spectrogram_image(img, image_name)

    # Convert the spectrogram back to audio using librosa (the array is
    # passed as a mel spectrogram)
    wav = librosa.feature.inverse.mel_to_audio(img, sr=44100, n_fft=2048, hop_length=512)

    # Save the audio file, using the unique name
    audio_file_path = f"{image_name}_generated_audio.wav"
    sf.write(audio_file_path, wav, samplerate=44100)

    # Return paths for uploaded image, spectrogram, and audio
    return uploaded_image_path, spectrogram_image_path, audio_file_path

# Gradio Interface
def gradio_process_image(input_image):
    """Gradio callback: run ``process_image`` and return its three file paths.

    Exactly three values are returned - uploaded image path, spectrogram
    image path, audio file path - matching the three components listed in
    the interface's ``outputs``. A fourth value (such as a cleanup callable)
    has no output component to go to and would never be invoked.

    Args:
        input_image: PIL image supplied by the Gradio input component.

    Returns:
        Tuple ``(uploaded_image_path, spectrogram_image_path, audio_file_path)``.
    """
    uploaded_image_path, spectrogram_image_path, audio_file_path = process_image(input_image)

    # The files cannot be deleted here: whoever receives these paths still
    # has to read them after this function returns.
    # NOTE(review): the generated files therefore stay on disk; if that is a
    # concern, remove them from a separate, later step.
    return uploaded_image_path, spectrogram_image_path, audio_file_path

# Create the Gradio interface
interface = gr.Interface(
    fn=gradio_process_image,
    # Input: the uploaded picture, handed to the callback as a PIL image
    inputs=gr.Image(type="pil"),
    # Outputs, in order: uploaded image file, spectrogram image, audio file
    outputs=[
        gr.File(label="Uploaded Image"),
        gr.Image(type="filepath"),
        gr.Audio(type="filepath"),
    ],
    title="Image to Audio Generator with Spectrogram Display",
    description="Upload an image, and get an audio file generated using Pix2Pix.",
)

# Start serving the interface
interface.launch()