File size: 7,419 Bytes
6b7f89f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import gradio as gr
import matplotlib.pyplot as plt
import librosa
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from moviepy.editor import *
from moviepy.video.io.VideoFileClip import VideoFileClip

def make_bars_image(height_values, index, new_height):
    """Render one frame of audio bars to a transparent PNG and return its path.

    One white rectangle is drawn per entry of ``height_values`` on a
    1024 x ``new_height`` RGBA canvas. The row of rectangles is centred
    horizontally (one rectangle every 8 pixels) and starts 160 pixels from
    the canvas edge. The canvas is then rotated by 180 degrees and mirrored
    left-to-right, so the bars end up growing upwards from a baseline near
    the bottom of the picture.

    Args:
        height_values: Bar heights in pixels, one per bar, left to right.
        index: Frame number; only used to build the output file name.
        new_height: Height of the canvas in pixels.

    Returns:
        The file name ``audio_bars_<index>.png`` the image was saved under,
        relative to the current working directory.
    """
    canvas_width = 1024
    bar_width = 4
    gap = 4
    stride = bar_width + gap
    # Distance between the canvas edge and the start of every bar
    baseline = int(80 * 2)

    # Fully transparent canvas so the frame can be pasted over a background
    canvas = Image.new('RGBA', (canvas_width, new_height), color=(0, 0, 0, 0))
    pen = ImageDraw.Draw(canvas)

    # Horizontal extent of the whole row of bars, used to centre it
    bar_count = len(height_values)
    bars_span = bar_count * bar_width + (bar_count - 1) * gap
    left_edge = int((canvas_width - bars_span) / 2)

    for position, bar_height in enumerate(height_values):
        left = left_edge + position * stride
        pen.rectangle(
            [left, baseline, left + bar_width, bar_height + baseline],
            fill='white',
        )

    # Rotate by 180 degrees, then mirror horizontally
    canvas = canvas.rotate(180)
    canvas = canvas.transpose(Image.FLIP_LEFT_RIGHT)

    out_name = 'audio_bars_' + str(index) + '.png'
    canvas.save(out_name)
    return out_name

def db_to_height(db_value):
    """Convert a decibel value into a bar height in pixels.

    The mapping is linear: -80 dB gives 0 and 0 dB gives 50.

    Args:
        db_value: Level in dB.

    Returns:
        The corresponding bar height.
    """
    # Shift and scale so that -80 dB -> 0.0 and 0 dB -> 1.0
    fraction = (db_value + 80) / 80
    # Stretch the unit fraction to the maximum bar height of 50
    return fraction * 50

def _compute_bar_heights(audio_data, sr, hop_length, n_fft=2048, n_bars=114):
    """Compute the dB spectrogram and the bar heights of every STFT frame.

    Args:
        audio_data: Audio samples as returned by ``librosa.load``.
        sr: Sampling rate of ``audio_data``.
        hop_length: Hop between successive STFT frames, in samples.
        n_fft: FFT window size used for the STFT.
        n_bars: Number of frequency bins (bars) sampled per frame.

    Returns:
        A ``(spectrogram, heights)`` tuple: the dB-scaled magnitude
        spectrogram, and a list holding one list of ``n_bars`` bar heights
        per time frame.
    """
    stft = librosa.stft(audio_data, n_fft=n_fft, hop_length=hop_length)
    spectrogram = librosa.amplitude_to_db(np.abs(stft), ref=np.max)

    # fft_frequencies expects the FFT window size, not the number of STFT
    # rows (1 + n_fft // 2). Passing the row count produced a frequency
    # table only half as long as the spectrogram, so the labels were wrong
    # and only the lower half of the bins was ever sampled.
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)

    # Pick n_bars bins spread evenly over the available frequencies
    freq_indices = np.linspace(0, len(freqs) - 1, n_bars, dtype=int)
    selected_db = spectrogram[freq_indices, :]

    # Print the (frequency, dB) pairs for the first time frame
    print(list(zip(freqs[freq_indices], selected_db[:, 0])))

    # One row of selected_db.T == one time frame
    heights = [[db_to_height(db) for db in frame] for frame in selected_db.T]
    return spectrogram, heights


def _prepare_background(image_in, title, width=1024):
    """Build the static background: resized image, dark overlay and title.

    Returns:
        A ``(background, new_height)`` tuple with the RGB background image
        and its height in pixels.
    """
    with Image.open(image_in) as source_image:
        # Resize to the target width while keeping the aspect ratio
        bg_width, bg_height = source_image.size
        aspect_ratio = bg_width / bg_height
        new_height = int(width / aspect_ratio)
        # JPEG cannot store alpha or palette data, so normalise to RGB
        # before the image is ever saved as .jpg.
        background = source_image.convert('RGB').resize((width, new_height))

    # Dark overlay at the bottom for better visibility of the white text.
    # It doubles as its own paste mask, hence the explicit alpha channel.
    with Image.open('black_cache.png') as overlay_file:
        overlay = overlay_file.convert('RGBA').resize(
            (width, overlay_file.height * 2), Image.LANCZOS
        )
    background.paste(
        overlay, (0, background.height - overlay.height), mask=overlay
    )

    # Title text, white, anchored relative to the bottom-left corner
    draw = ImageDraw.Draw(background)
    font = ImageFont.truetype("Lato-Regular.ttf", 16)
    x = int(30 * 2)
    y = new_height - (70 * 2)
    draw.text((x, y), title, fill=(255, 255, 255), font=font)

    return background, new_height


def _render_frames(background, heights_per_frame, new_height):
    """Composite the bars of each frame over the background.

    Returns:
        The list of JPEG file names written, in frame order.
    """
    frame_paths = []
    for i, heights in enumerate(heights_per_frame):
        bars_path = make_bars_image(heights, i, new_height)
        # Copy the in-memory background instead of re-decoding the JPEG
        # from disk for every single frame.
        frame = background.copy()
        with Image.open(bars_path) as bars_img:
            frame.paste(bars_img, (0, 0), mask=bars_img)
        frame_path = 'audio_bars_with_bg' + str(i) + '.jpg'
        frame.save(frame_path)
        frame_paths.append(frame_path)
    return frame_paths


def _write_video(frame_paths, duration, audio_path, output_video_path, fps=25):
    """Encode the frames plus the audio track into ``output_video_path``."""
    # The image sequence is timed so that it spans the whole audio track;
    # write_videofile then resamples it to the requested output frame rate
    # in a single encode (no intermediate lossy file).
    clip = ImageSequenceClip(frame_paths, fps=len(frame_paths) / duration)
    audio_clip = AudioFileClip(audio_path)
    try:
        clip = clip.set_audio(audio_clip)
        clip.write_videofile(
            output_video_path, fps=fps, codec='libx264', audio_codec='aac'
        )
    finally:
        # Release the audio reader even if encoding fails
        audio_clip.close()


def _save_spectrogram_image(spectrogram, sr, hop_length, output_path):
    """Plot the dB spectrogram and save it to ``output_path``."""
    # Some librosa releases do not expose the display submodule after a
    # bare `import librosa`, so import it explicitly.
    import librosa.display

    fig = plt.figure(figsize=(10, 4))
    try:
        # hop_length must match the STFT, otherwise the time axis is wrong
        librosa.display.specshow(
            spectrogram, sr=sr, hop_length=hop_length,
            x_axis='time', y_axis='log',
        )
        plt.colorbar(format='%+2.0f dB')
        plt.title('Audio Bars Visualization')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
    finally:
        # Figures stay alive until closed; without this every call leaks one
        plt.close(fig)


def infer(title, audio_in, image_in, output_video_path):
    """Create an animated audio-bars video and a spectrogram image.

    The audio is analysed with an STFT, each time frame is rendered as a
    row of bars over the (resized, titled) background image, and the frames
    are encoded together with the audio into a 25 fps H.264/AAC video.

    Side effects: writes ``resized_background.jpg``, one
    ``audio_bars_<i>.png`` and one ``audio_bars_with_bg<i>.jpg`` per frame,
    the video at ``output_video_path`` and ``image_out.jpg`` in the current
    working directory.

    Args:
        title: Text drawn on the background.
        audio_in: Path of the audio file.
        image_in: Path of the background image.
        output_video_path: Path the final video is written to.

    Returns:
        A ``(output_video_path, spectrogram_image_path)`` tuple.

    Raises:
        ValueError: If the audio file contains no samples.
    """
    hop_length = 1024

    # Load the audio file and get its duration in seconds
    audio_data, sr = librosa.load(audio_in)
    duration = librosa.get_duration(y=audio_data, sr=sr)
    if duration <= 0:
        # Avoids a division by zero when computing the frame rate below
        raise ValueError("The audio file contains no samples.")

    spectrogram, proportional_values = _compute_bar_heights(
        audio_data, sr, hop_length
    )
    print(proportional_values[0])
    print("AUDIO CHUNK: " + str(len(proportional_values)))

    background, new_height = _prepare_background(image_in, title)
    # Save the prepared background under its usual file name
    background.save('resized_background.jpg')

    generated_frames = _render_frames(
        background, proportional_values, new_height
    )
    print(generated_frames)

    _write_video(generated_frames, duration, audio_in, output_video_path)

    # Save the spectrogram as a JPG file
    output_path = 'image_out.jpg'
    _save_spectrogram_image(spectrogram, sr, hop_length, output_path)

    return output_video_path, output_path

# Input widgets, in the order infer() expects its arguments
title_box = gr.Textbox(placeholder='FIND A GOOD TITLE')
audio_upload = gr.Audio(source='upload', type='filepath')
image_upload = gr.Image(source='upload', type='filepath')
# Hidden field that supplies infer() with the output video file name
video_path_box = gr.Textbox(label="Output video path", value="my_final_video.mp4", visible=False)

# Output widgets, matching the two values returned by infer()
video_result = gr.Video(label='video result')
spectrogram_result = gr.Image(label='spectrogram image')

# Assemble the interface and start the web app
demo = gr.Interface(
    fn=infer,
    inputs=[title_box, audio_upload, image_upload, video_path_box],
    outputs=[video_result, spectrogram_result],
    title='Animated Audio Visualizer',
    description='<p style="text-align: center;">Upload an audio file, upload a background image, choose a good title, click submit.</p>',
)
demo.launch()