musdfakoc committed
Commit ebb57ae · verified · 1 Parent(s): 5a6b109

Update app.py

Files changed (1)
  1. app.py +54 -208
app.py CHANGED
@@ -1,212 +1,58 @@
- import torch
- import torchaudio
  import gradio as gr
  from PIL import Image
- import torchvision.transforms as transforms
- import torchaudio.transforms as T
- from torch import nn, optim
- import torchvision.transforms as transforms
- from torch.utils.data import Dataset, DataLoader
- from PIL import Image
- import os
  import numpy as np

- # Set device to 'cpu' or 'cuda' if available
- device = torch.device('cpu')
-
- # Parameters
- sample_rate = 44100 # 44.1kHz stereo sounds
- n_fft = 4096 # FFT size
- hop_length = 2048 # Hop length for STFT
- duration = 5 # Duration of the sound files (5 seconds)
- n_channels = 2 # Stereo sound
- output_time_frames = duration * sample_rate // hop_length # Number of time frames in the spectrogram
-
- stft_transform = T.Spectrogram(n_fft=n_fft, hop_length=hop_length, win_length=n_fft)
-
- image_transform = transforms.Compose([
-     transforms.Resize((256, 256)),
-     transforms.ToTensor(),
-     transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) # Normalize to [-1, 1]
- ])
-
- # Image Encoder (for the Generator)
- class ImageEncoder(nn.Module):
-     def __init__(self):
-         super(ImageEncoder, self).__init__()
-         self.encoder = nn.Sequential(
-             nn.Conv2d(3, 64, kernel_size=4, stride=2, padding=1),
-             nn.BatchNorm2d(64),
-             nn.ReLU(),
-             nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=1),
-             nn.BatchNorm2d(128),
-             nn.ReLU(),
-             nn.Conv2d(128, 256, kernel_size=4, stride=2, padding=1),
-             nn.BatchNorm2d(256),
-             nn.ReLU(),
-             nn.Conv2d(256, 512, kernel_size=4, stride=2, padding=1),
-             nn.BatchNorm2d(512),
-             nn.ReLU()
-         )
-         self.fc = nn.Linear(512 * 16 * 16, 512)
-
-     def forward(self, x):
-         x = self.encoder(x)
-         x = x.view(x.size(0), -1)
-         return self.fc(x)
-
-
- # Sound Decoder (for the Generator)
- class SoundDecoder(nn.Module):
-     def __init__(self, output_time_frames):
-         super(SoundDecoder, self).__init__()
-         self.fc = nn.Linear(512, 512 * 8 * 8)
-
-         self.decoder = nn.Sequential(
-             nn.ConvTranspose2d(512, 256, kernel_size=4, stride=2, padding=1),
-             nn.BatchNorm2d(256),
-             nn.ReLU(),
-             nn.ConvTranspose2d(256, 128, kernel_size=4, stride=2, padding=1),
-             nn.BatchNorm2d(128),
-             nn.ReLU(),
-             nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1),
-             nn.BatchNorm2d(64),
-             nn.ReLU(),
-             nn.ConvTranspose2d(64, n_channels, kernel_size=4, stride=2, padding=1),
-         )
-
-         # Modify the upsample to exactly match the real spectrogram size (108 time frames)
-         self.upsample = nn.Upsample(size=(n_fft // 2 + 1, 108), mode='bilinear', align_corners=True)
-
-     def forward(self, x):
-         x = self.fc(x)
-         x = x.view(x.size(0), 512, 8, 8)
-         x = self.decoder(x)
-         x = self.upsample(x)
-         # Debugging shape
-         print(f'Generated spectrogram shape: {x.shape}')
-         return x
-
- # Generator model
- class Generator(nn.Module):
-     def __init__(self, output_time_frames):
-         super(Generator, self).__init__()
-         self.encoder = ImageEncoder()
-         self.decoder = SoundDecoder(output_time_frames)
-
-     def forward(self, img):
-         # Debugging: Image encoder
-         encoded_features = self.encoder(img)
-         print(f"Encoded features shape (from Image Encoder): {encoded_features.shape}")
-
-         # Debugging: Sound decoder
-         generated_spectrogram = self.decoder(encoded_features)
-         print(f"Generated spectrogram shape (from Sound Decoder): {generated_spectrogram.shape}")
-
-         return generated_spectrogram
-
-
- # Function to generate and save audio from a test image using the pre-trained GAN model
- def test_model(generator, test_img_path, output_audio_path, device):
-     # Load and preprocess test image
-     test_img = Image.open(test_img_path).convert('RGB')
-     test_img = image_transform(test_img).unsqueeze(0).to(device) # Add batch dimension
-
-     # Generate sound spectrogram from the image
-     with torch.no_grad(): # Disable gradient calculation for inference
-         generated_spectrogram = generator(test_img)
-
-     # Debugging: Check generated spectrogram shape
-     print(f"Generated spectrogram shape: {generated_spectrogram.shape}")
-
-     # Convert the generated spectrogram to audio
-     generated_audio = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu()) # Remove batch dimension
-
- # Load the pre-trained GAN model
- def load_gan_model(generator, model_path, device):
-     generator.load_state_dict(torch.load(model_path, map_location=device))
-     generator.eval() # Set the model to evaluation mode
-     return generator
-
-
- def magnitude_to_complex_spectrogram(magnitude_spectrogram):
-     # Clip values to avoid extreme values or potential invalid inputs
-     magnitude_spectrogram = torch.clamp(magnitude_spectrogram, min=1e-10, max=1e5)
-
-     zero_phase = torch.zeros_like(magnitude_spectrogram)
-     complex_spectrogram = torch.stack([magnitude_spectrogram, zero_phase], dim=-1)
-
-     # Check for NaNs in the complex spectrogram
-     if torch.isnan(complex_spectrogram).any():
-         raise ValueError("Complex spectrogram contains NaN values.")
-
-     return complex_spectrogram
-
-
- def spectrogram_to_audio(magnitude_spectrogram):
-     # Perform inverse log scaling to undo any log scaling
-     magnitude_spectrogram = torch.expm1(magnitude_spectrogram)
-
-     # Convert magnitude-only spectrogram to complex format (real part and zero imaginary)
-     zero_phase = torch.zeros_like(magnitude_spectrogram)
-     complex_spectrogram = torch.stack([magnitude_spectrogram, zero_phase], dim=-1)
-
-     # Inverse STFT to convert the spectrogram back to time-domain audio
-     audio = torch.istft(complex_spectrogram, n_fft=n_fft, hop_length=hop_length)
-
-     # Handle NaNs or Infs in the audio and replace them with zeros
-     audio = torch.nan_to_num(audio, nan=0.0, posinf=0.0, neginf=0.0)
-
-     # Normalize the audio to the range [-1, 1]
-     if torch.max(torch.abs(audio)) != 0:
-         audio = audio / torch.max(torch.abs(audio))
-
-     # Clip the audio to ensure no values fall outside the range [-1, 1]
-     audio = torch.clamp(audio, min=-1, max=1)
-
-     # Convert to 16-bit PCM format by scaling and casting to int16
-     audio = (audio * 32767).short()
-
-     # Ensure the audio is in the valid range for int16 [-32768, 32767]
-     audio = torch.clamp(audio, min=-32768, max=32767)
-
-     # Convert the audio to a NumPy array of int16
-     audio_numpy = audio.cpu().numpy().astype(np.int16)
-
-     return audio_numpy
-
-
-
-
- def generate_audio_from_image(image):
-     test_img = image_transform(image).unsqueeze(0).to(device) # Preprocess the image
-
-     # Generate a sound spectrogram from the image using the loaded generator
-     with torch.no_grad():
-         generated_spectrogram = generator(test_img)
-
-     # Convert the generated spectrogram to time-domain audio
-     generated_audio_numpy = spectrogram_to_audio(generated_spectrogram.squeeze(0).cpu())
-
-     # Return the sample rate and the audio in the correct format for Gradio
-     return (sample_rate, generated_audio_numpy)
-
-
-
- # Gradio Interface
- def main():
-     global generator # Declare the generator object globally
-     # Instantiate your Generator model
-     generator = Generator(output_time_frames).to(device)
-
-     # Load the pre-trained model
-     model_path = './gan_model.pth' # Change this path
-     generator = load_gan_model(generator, model_path, device)
-
-     # Gradio interface: allow users to upload an image and generate audio
-     iface = gr.Interface(fn=generate_audio_from_image, inputs=gr.Image(type="pil"), outputs=gr.Audio(type="numpy", label="Generated Audio"))
-     iface.launch()
-
- if __name__ == "__main__":
-     main()
-
  import gradio as gr
+ from keras.models import load_model
+ from tensorflow.keras.utils import img_to_array
+ from tensorflow.keras.utils import load_img
+ from numpy import expand_dims
  from PIL import Image
+ import librosa
  import numpy as np
+ import soundfile as sf
+ import os

+ # Load your Pix2Pix model (make sure the path is correct)
+ model = load_model('./model_022600.h5', compile=False)
+
+ # Function to process the input image and convert to audio
+ def process_image(input_image):
+     # Load and preprocess the input image
+     def load_image(image, size=(256, 256)):
+         image = image.resize(size)
+         pixels = img_to_array(image)
+         pixels = (pixels - 127.5) / 127.5
+         pixels = expand_dims(pixels, 0)
+         return pixels
+
+     # Preprocess the input
+     src_image = load_image(input_image)
+
+     # Generate output using the Pix2Pix model
+     gen_image = model.predict(src_image)
+     gen_image = (gen_image + 1) / 2.0 # scale to [0, 1]
+
+     # Resize the generated image to original spectrogram size
+     orig_size = (1293, 512)
+     gen_image_resized = Image.fromarray((gen_image[0] * 255).astype('uint8')).resize(orig_size).convert('F')
+
+     # Convert the image to a numpy array (spectrogram)
+     img = np.array(gen_image_resized)
+
+     # Convert the spectrogram back to audio using librosa
+     wav = librosa.feature.inverse.mel_to_audio(img, sr=44100, n_fft=2048, hop_length=512)
+
+     # Save the audio file to a temporary location
+     audio_file = "generated_audio.wav"
+     sf.write(audio_file, wav, samplerate=44100)
+
+     return audio_file
+
+ # Create a Gradio interface
+ interface = gr.Interface(
+     fn=process_image,
+     inputs=gr.Image(type="pil"), # Input is an image
+     outputs=gr.Audio(type="file"), # Output is an audio file
+     title="Image to Audio Generator", # App title
+     description="Upload an image (preferably a spectrogram), and get an audio file generated using Pix2Pix.",
+ )
+
+ # Launch the interface
+ interface.launch()
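
For context on the librosa.feature.inverse.mel_to_audio call introduced above: it treats the resized generator output (a 512 x 1293 array) as a mel power spectrogram and reconstructs a waveform with Griffin-Lim phase estimation. The following is a minimal standalone sketch using the same parameters as the commit; the random array is only a stand-in for the model output and is not part of app.py.

import numpy as np
import librosa

# Stand-in for the resized Pix2Pix output: 512 mel bands x 1293 time frames (illustrative only).
mel = np.random.rand(512, 1293).astype(np.float32)

# Same call and parameters as in app.py; Griffin-Lim runs internally to estimate phase.
wav = librosa.feature.inverse.mel_to_audio(mel, sr=44100, n_fft=2048, hop_length=512)
print(wav.shape)  # roughly 1293 * 512 mono samples, i.e. about 15 s of audio at 44.1 kHz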