Kokoro-API-3

Running

App Files Files Community

yaron123 committed on Jan 19

Commit

446a991

1 Parent(s): 8af55c9

commit

Browse files

Files changed (1) hide show

app.py +77 -60

app.py CHANGED Viewed

@@ -435,8 +435,8 @@ pegasus_name = "google/pegasus-xsum"
 # precision data
 seq=512
-width=1024
-height=1024
 image_steps=8
 img_accu=0
@@ -511,8 +511,8 @@ pegasus_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-
 @spaces.GPU(duration=180)
 def upscaler(
     input_image: Image.Image,
-    prompt: str = "masterpiece, best quality, highres",
-    negative_prompt: str = "worst quality, low quality, normal quality",
     seed: int = int(str(random.random()).split(".")[1]),
     upscale_factor: int = 4,
     controlnet_scale: float = 0.6,
@@ -521,7 +521,7 @@ def upscaler(
     tile_width: int = 112,
     tile_height: int = 144,
     denoise_strength: float = 0.35,
-    num_inference_steps: int = 18,
     solver: str = "DDIM",
 ) -> Image.Image:
@@ -554,7 +554,7 @@ def upscaler(
 @spaces.GPU(duration=180)
 def summarize_text(
-    text, max_length=30, num_beams=16, early_stopping=True
 ):
     log(f'CALL summarize_text')
     summary = pegasus_tokenizer.decode( pegasus_model.generate(
@@ -573,18 +573,43 @@ def generate_random_string(length):
 @spaces.GPU(duration=180)
 def pipe_generate(p1,p2):
     log(f'CALL pipe_generate')
-    img = image_pipe(
             prompt=p1,
             negative_prompt=p2,
             height=height,
             width=width,
             guidance_scale=img_accu,
-            num_images_per_prompt=1,
             num_inference_steps=image_steps,
             max_sequence_length=seq,
             generator=torch.Generator(device).manual_seed(int(str(random.random()).split(".")[1]))
-    ).images[0]
     log(f'RET pipe_generate')
     return img
 def handle_generate(artist,song,genre,lyrics):
@@ -596,9 +621,9 @@ def handle_generate(artist,song,genre,lyrics):
     pos_song = ' '.join(word[0].upper() + word[1:] for word in pos_song.split())
     pos_genre = re.sub(f'[{punctuation}]', '', re.sub("([ \t\n]){1,}", " ", genre)).upper().strip()
     pos_lyrics = re.sub(f'[{punctuation}]', '', re.sub("([ \t\n]){1,}", " ", lyrics)).lower().strip()
-    pos_lyrics_sum = summarize_text(pos_lyrics)
     neg = f"Textual Labeled Distorted Discontinuous Ugly Blurry Low-Quality Worst-Quality Low-Resolution Painted"
-    pos = f'Realistic Vivid Genuine Reasonable Detailed 4K { pos_genre } GENRE { pos_song }: "{ pos_lyrics_sum }"'
     print(f"""
         Positive: {pos}
@@ -606,34 +631,17 @@ def handle_generate(artist,song,genre,lyrics):
         Negative: {neg}
     """)
-    img = pipe_generate(pos,neg)
-    draw = ImageDraw.Draw(img)
-    rows = 1
-    labels_distance = math.ceil(1 / 3)
-    textheight=min(math.ceil( width / 10 ), math.ceil( height / 5 ))
-    font = ImageFont.truetype(r"Alef-Bold.ttf", textheight)
-    textwidth = draw.textlength(pos_song,font)
-    x = math.ceil((width - textwidth) / 2)
-    y = height - math.ceil(textheight * rows / 2)
-    y = y - math.ceil(y / labels_distance)
-    draw.text((x, y), pos_song, (255,255,255), font=font, spacing=2, stroke_width=4, stroke_fill=(0,0,0))
-    textheight=min(math.ceil( width / 12 ), math.ceil( height / 6 ))
-    font = ImageFont.truetype(r"Alef-Bold.ttf", textheight)
-    textwidth = draw.textlength(pos_artist,font)
-    x = math.ceil((width - textwidth) / 2)
-    y = height - math.ceil(textheight * rows / 2)
-    y = y + math.ceil(y / labels_distance)
-    draw.text((x, y), pos_artist, (0,0,0), font=font, spacing=6, stroke_width=8, stroke_fill=(255,255,255))
-    enhanced_img = upscaler(img)
-    name = generate_random_string(12) + ".png"
-    enhanced_img.save(name)
-    return name
 # entry
@@ -643,36 +651,45 @@ if __name__ == "__main__":
             # Song Cover Image Generator
         """)
         with gr.Column():
-            with gr.Row():
-                artist = gr.Textbox(
-                    placeholder="Artist name",
-                    container=False,
-                    max_lines=1
                 )
-                song = gr.Textbox(
-                    placeholder="Song name",
                     container=False,
                     max_lines=1
                 )
-            genre = gr.Textbox(
-                    placeholder="Genre",
-                    container=False,
-                    max_lines=1
-            )
-            lyrics = gr.Textbox(
-                placeholder="Lyrics (English)",
-                container=False,
-                max_lines=1
-            )
-        with gr.Column():
-            cover = gr.Image(interactive=False,container=False,elem_classes="image-container", label="Result", show_label=True, type='filepath', show_share_button=False)
-        run = gr.Button("Generate",elem_classes="btn")
         run.click(
             fn=handle_generate,
             inputs=[artist,song,genre,lyrics],
-            outputs=[cover]
         )
     demo.queue().launch()

 # precision data
 # Module-level generation settings; pipe_generate forwards them to image_pipe.
 seq=512  # forwarded as max_sequence_length
+width=768  # forwarded as width; also read by the cover-text layout code
+height=768  # forwarded as height; also read by the cover-text layout code
 image_steps=8  # forwarded as num_inference_steps
 img_accu=0  # forwarded as guidance_scale
 @spaces.GPU(duration=180)
 def upscaler(
     input_image: Image.Image,
+    prompt: str = "Best-Quality Realistic Genuine Reasonable Highly-Detailed",
+    negative_prompt: str = "Distorted Discontinuous Ugly Blurry Low-Quality Worst-Quality Normal-Quality Low-Resolution Painted",
     seed: int = int(str(random.random()).split(".")[1]),
     upscale_factor: int = 4,
     controlnet_scale: float = 0.6,
     tile_width: int = 112,
     tile_height: int = 144,
     denoise_strength: float = 0.35,
+    num_inference_steps: int = 30,
     solver: str = "DDIM",
 ) -> Image.Image:
 @spaces.GPU(duration=180)
 def summarize_text(
+    text, max_length=30, num_beams=4, early_stopping=True
 ):
     log(f'CALL summarize_text')
     summary = pegasus_tokenizer.decode( pegasus_model.generate(
 @spaces.GPU(duration=180)
 def pipe_generate(p1,p2):
     """Run the image pipeline once and return its batch of images.

     Args:
         p1: Positive prompt, forwarded as ``prompt``.
         p2: Negative prompt, forwarded as ``negative_prompt``.

     Returns:
         The ``.images`` attribute of the ``image_pipe`` result. Six images
         are requested per prompt via ``num_images_per_prompt``.
     """
     log(f'CALL pipe_generate')
     # Draw the seed directly as an integer. The previous approach parsed the
     # digits after the "." in str(random.random()), which raises ValueError
     # when the float is small enough to print in scientific notation
     # (e.g. '1.234e-05').
     seed = random.getrandbits(63)
     imgs = image_pipe(
             prompt=p1,
             negative_prompt=p2,
             height=height,
             width=width,
             guidance_scale=img_accu,
             num_images_per_prompt=6,
             num_inference_steps=image_steps,
             max_sequence_length=seq,
             generator=torch.Generator(device).manual_seed(seed)
     ).images
     log(f'RET pipe_generate')
     return imgs
def add_song_cover_text(img,artist,song):
    """Draw the song title and the artist name onto ``img`` and return it.

    Both labels are centred horizontally using the module-level ``width`` and
    ``height`` settings (not the size of ``img`` itself) and rendered with the
    ``Alef-Bold.ttf`` font. The drawing is done on ``img`` directly; the same
    object is returned.

    Args:
        img: Image to draw on.
        artist: Artist label text.
        song: Song title text.

    Returns:
        ``img``, after both labels have been drawn.
    """
    canvas = ImageDraw.Draw(img)
    line_count = 1
    offset_ratio = 1 / 2.5

    # Song title: white fill with a black outline, shifted up from the anchor.
    title_size = min(math.ceil(width / 10), math.ceil(height / 5))
    title_font = ImageFont.truetype(r"Alef-Bold.ttf", title_size)
    title_length = canvas.textlength(song, title_font)
    title_x = math.ceil((width - title_length) / 2)
    title_anchor = height - math.ceil(title_size * line_count / 2)
    title_y = title_anchor - math.ceil(title_anchor * offset_ratio)
    canvas.text(
        (title_x, title_y),
        song,
        (255, 255, 255),
        font=title_font,
        spacing=2,
        stroke_width=4,
        stroke_fill=(0, 0, 0),
    )

    # Artist name: black fill with a white outline, shifted down from the anchor.
    # NOTE(review): the anchor already sits near the bottom edge, so adding the
    # offset gives a y beyond `height` for the 768x768 setting - confirm the
    # artist label is meant to be placed there.
    artist_size = min(math.ceil(width / 12), math.ceil(height / 6))
    artist_font = ImageFont.truetype(r"Alef-Bold.ttf", artist_size)
    artist_length = canvas.textlength(artist, artist_font)
    artist_x = math.ceil((width - artist_length) / 2)
    artist_anchor = height - math.ceil(artist_size * line_count / 2)
    artist_y = artist_anchor + math.ceil(artist_anchor * offset_ratio)
    canvas.text(
        (artist_x, artist_y),
        artist,
        (0, 0, 0),
        font=artist_font,
        spacing=4,
        stroke_width=2,
        stroke_fill=(255, 255, 255),
    )
    return img
 def handle_generate(artist,song,genre,lyrics):
     """Build the prompts, generate the cover candidates and save them as PNGs.

     Args:
         artist: Raw artist name from the UI.
         song: Raw song name from the UI.
         genre: Raw genre text; punctuation is stripped and it is upper-cased.
         lyrics: Raw lyrics text; punctuation is stripped and it is lower-cased.
             May be empty, in which case the prompt says INSTRUMENTAL.

     Returns:
         A tuple with one saved file name per generated image, in generation
         order, matching the list of output components wired to this handler.
     """
     # NOTE(review): the statements that first derive pos_artist and pos_song
     # from the raw inputs sit between the signature and this point but are
     # elided from this diff view; this edit does not touch them.
     pos_song = ' '.join(word[0].upper() + word[1:] for word in pos_song.split())
     pos_genre = re.sub(f'[{punctuation}]', '', re.sub("([ \t\n]){1,}", " ", genre)).upper().strip()
     pos_lyrics = re.sub(f'[{punctuation}]', '', re.sub("([ \t\n]){1,}", " ", lyrics)).lower().strip()
     # Only summarise when there are lyrics; the previous conditional had its
     # operands swapped and assigned a boolean / summarised empty text.
     pos_lyrics_sum = summarize_text(pos_lyrics) if pos_lyrics else ""
     neg = "Textual Labeled Distorted Discontinuous Ugly Blurry Low-Quality Worst-Quality Low-Resolution Painted"
     # Built outside the f-string: a backslash inside an f-string expression is
     # a syntax error before Python 3.12, and the old conditional was inverted.
     lyrics_part = f'"{pos_lyrics_sum}"' if pos_lyrics_sum else "INSTRUMENTAL"
     pos = f'Realistic Vivid Genuine Reasonable Highly-Detailed 4K { pos_genre } SONG { pos_song } { lyrics_part }'
     print(f"""
         Positive: {pos}
         Negative: {neg}
     """)
     imgs = pipe_generate(pos,neg)
     names = []
     # enumerate() supplies the running number; the old counter was never
     # incremented, so every image was saved under the same file name.
     for index, img in enumerate(imgs, start=1):
         labeled_img = add_song_cover_text(img, pos_artist, pos_song)
         enhanced_img = upscaler(labeled_img)
         name = f'{pos_artist} - {pos_song} ({index}).png'
         enhanced_img.save(name)
         names.append(name)
     # `return *names` is not valid syntax; a tuple gives one value per output.
     return tuple(names)
 # entry
             # Song Cover Image Generator
         """)
         with gr.Column():
+            with gr.Column():
+                with gr.Row():
+                    artist = gr.Textbox(
+                        placeholder="Artist name",
+                        value="",
+                        container=False,
+                        max_lines=1
+                    )
+                    song = gr.Textbox(
+                        placeholder="Song name",
+                        value="",
+                        container=False,
+                        max_lines=1
+                    )
+                genre = gr.Textbox(
+                        placeholder="Genre",
+                        value="",
+                        container=False,
+                        max_lines=1
                 )
+                lyrics = gr.Textbox(
+                    placeholder="Lyrics (English)",
+                    value="",
                     container=False,
                     max_lines=1
                 )
+            run = gr.Button("Generate",elem_classes="btn")
+            cover1 = gr.Image(interactive=False,container=False,elem_classes="image-container", label="Result", show_label=True, type='filepath', show_share_button=False)
+            cover2 = gr.Image(interactive=False,container=False,elem_classes="image-container", label="Result", show_label=True, type='filepath', show_share_button=False)
+            cover3 = gr.Image(interactive=False,container=False,elem_classes="image-container", label="Result", show_label=True, type='filepath', show_share_button=False)
+            cover4 = gr.Image(interactive=False,container=False,elem_classes="image-container", label="Result", show_label=True, type='filepath', show_share_button=False)
+            cover5 = gr.Image(interactive=False,container=False,elem_classes="image-container", label="Result", show_label=True, type='filepath', show_share_button=False)
+            cover6 = gr.Image(interactive=False,container=False,elem_classes="image-container", label="Result", show_label=True, type='filepath', show_share_button=False)
         run.click(
             fn=handle_generate,
             inputs=[artist,song,genre,lyrics],
+            outputs=[cover1,cover2,cover3,cover4,cover5,cover6]
         )
     demo.queue().launch()