Kokoro-API-2

Sleeping

App Files Community

yaron123 committed on Jan 20

Commit

bf621da

1 Parent(s): d4b23f6

commit

Browse files

Files changed (1) hide show

app.py +36 -24

app.py CHANGED Viewed

@@ -31,7 +31,7 @@ from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file, save_file
 from diffusers import FluxPipeline
 from PIL import Image, ImageDraw, ImageFont
-from transformers import PegasusForConditionalGeneration, PegasusTokenizerFast
 from refiners.fluxion.utils import manual_seed
 from refiners.foundationals.latent_diffusion import Solver, solvers
 from refiners.foundationals.latent_diffusion.stable_diffusion_1.multi_upscaler import (
@@ -434,8 +434,8 @@ pegasus_name = "google/pegasus-xsum"
 # precision data
 seq=512
-width=720
-height=720
 image_steps=8
 img_accu=0
@@ -502,8 +502,8 @@ image_pipe.enable_model_cpu_offload()
 image_pipe.enable_vae_slicing()
 image_pipe.enable_vae_tiling()
-pegasus_tokenizer = PegasusTokenizerFast.from_pretrained("google/pegasus-xsum")
-pegasus_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
 # functionality
@@ -512,14 +512,14 @@ def upscaler(
     prompt: str = "Photorealistic, Hyperrealistic, Realistic Photography, High-Quality Photography, Natural.",
     negative_prompt: str = "Distorted, Discontinuous, Blurry, Doll-Like, Overly-Plastic, Low-Quality, Painted, Smoothed, Artificial, Phony, Gaudy, Digital Effects.",
     seed: int = int(str(random.random()).split(".")[1]),
-    upscale_factor: int = 4,
     controlnet_scale: float = 0.6,
     controlnet_decay: float = 1.0,
     condition_scale: int = 6,
     tile_width: int = 112,
     tile_height: int = 144,
     denoise_strength: float = 0.35,
-    num_inference_steps: int = 50,
     solver: str = "DDIM",
 ) -> Image.Image:
@@ -542,7 +542,7 @@ def upscaler(
         tile_size=(tile_height, tile_width),
         denoise_strength=denoise_strength,
         num_inference_steps=num_inference_steps,
-        loras_scale={"more_details": 0.5, "sdxl_render": 1.0},
         solver_type=solver_type,
     )
@@ -557,6 +557,9 @@ def summarize_text(
     summary = pegasus_tokenizer.decode( pegasus_model.generate(
         pegasus_tokenizer(text,return_tensors="pt").input_ids,
         max_length=max_length,
         early_stopping=True
     )[0], skip_special_tokens=True)
     log(f'RET summarize_text with summary as {summary}')
@@ -607,33 +610,33 @@ def add_song_cover_text(img,artist,song,height,width):
     return img
-@spaces.GPU(duration=180)
 def all_pipes(pos,neg,artist,song):
     imgs = pipe_generate_image(pos,neg)
-    names = []
     index = 1
-    for img in imgs:
-        enhanced_img = upscaler(img)
-        labeled_img = add_song_cover_text(enhanced_img,artist,song,height*4,width*4)
-        name = f'{artist} - {song} ({index}).png'
-        labeled_img.save(name)
-        names.append(name)
-    return names
-def handle_generate(artist,song,genre,lyrics):
     log(f'CALL handle_generate')
-    pos_artist = re.sub("([ \t\n]){1,}", " ", artist).strip()
-    pos_song = re.sub("([ \t\n]){1,}", " ", song).strip()
     pos_song = ' '.join(word[0].upper() + word[1:] for word in pos_song.split())
-    pos_genre = re.sub(f'[{punctuation}]', '', re.sub("([ \t\n]){1,}", " ", genre)).upper().strip()
     pos_lyrics = re.sub(f'[{punctuation}]', '', re.sub("([ \t\n]){1,}", " ", lyrics)).lower().strip()
     pos_lyrics_sum = pos_lyrics if pos_lyrics == "" else summarize_text(pos_lyrics)
     neg = f"Sexual, Textual, Labeled, Distorted, Discontinuous, Blurry, Doll-Like, Overly-Plastic, Low-Quality, Painted, Smoothed, Artificial, Phony, Gaudy, Digital Effects."
     q = "\""
-    pos = f'Photorealistic, Hyperrealistic, Realistic Photography, High-Quality Photography, Natural, made for the { pos_genre } SONG "{ pos_song }"{ pos_lyrics_sum if pos_lyrics_sum == "" else ": " + q + pos_lyrics_sum + q }.'
     print(f"""
         Positive: {pos}
@@ -641,7 +644,16 @@ def handle_generate(artist,song,genre,lyrics):
         Negative: {neg}
     """)
-    return all_pipes(pos,neg,pos_artist,pos_song)[0]
 # entry
@@ -683,7 +695,7 @@ if __name__ == "__main__":
         run = gr.Button("Generate",elem_classes="btn")
         run.click(
-            fn=handle_generate,
             inputs=[artist,song,genre,lyrics],
             outputs=[cover]
         )

 from safetensors.torch import load_file, save_file
 from diffusers import FluxPipeline
 from PIL import Image, ImageDraw, ImageFont
+from transformers import PegasusForConditionalGeneration, PegasusTokenizer
 from refiners.fluxion.utils import manual_seed
 from refiners.foundationals.latent_diffusion import Solver, solvers
 from refiners.foundationals.latent_diffusion.stable_diffusion_1.multi_upscaler import (
 # precision data
 seq=512
+width=1024
+height=1024
 image_steps=8
 img_accu=0
 image_pipe.enable_vae_slicing()
 image_pipe.enable_vae_tiling()
+pegasus_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
+pegasus_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum").to(device)
 # functionality
     prompt: str = "Photorealistic, Hyperrealistic, Realistic Photography, High-Quality Photography, Natural.",
     negative_prompt: str = "Distorted, Discontinuous, Blurry, Doll-Like, Overly-Plastic, Low-Quality, Painted, Smoothed, Artificial, Phony, Gaudy, Digital Effects.",
     seed: int = int(str(random.random()).split(".")[1]),
+    upscale_factor: int = 2,
     controlnet_scale: float = 0.6,
     controlnet_decay: float = 1.0,
     condition_scale: int = 6,
     tile_width: int = 112,
     tile_height: int = 144,
     denoise_strength: float = 0.35,
+    num_inference_steps: int = 30,
     solver: str = "DDIM",
 ) -> Image.Image:
         tile_size=(tile_height, tile_width),
         denoise_strength=denoise_strength,
         num_inference_steps=num_inference_steps,
+        loras_scale={"more_details": 1.0, "sdxl_render": 1.0},
         solver_type=solver_type,
     )
     summary = pegasus_tokenizer.decode( pegasus_model.generate(
         pegasus_tokenizer(text,return_tensors="pt").input_ids,
         max_length=max_length,
+        num_beams = 2,
+        truncation=True,
+        padding='longest',
         early_stopping=True
     )[0], skip_special_tokens=True)
     log(f'RET summarize_text with summary as {summary}')
     return img
+@spaces.GPU(duration=300)
 def all_pipes(pos,neg,artist,song):
     imgs = pipe_generate_image(pos,neg)
     index = 1
+    for i in range(len(imgs)):
+        imgs[i] = upscaler(imgs[i])
+    return imgs
+def handle_generation(artist,song,genre,lyrics):
     log(f'CALL handle_generate')
+    pos_artist = re.sub("([ \t\n]){1,}", " ", artist).upper().strip()
+    pos_song = re.sub("([ \t\n]){1,}", " ", song).lower().strip()
     pos_song = ' '.join(word[0].upper() + word[1:] for word in pos_song.split())
+    pos_genre = re.sub(f'[{punctuation}]', '', re.sub("([ \t\n]){1,}", " ", genre)).lower().strip()
+    pos_genre = ' '.join(word[0].upper() + word[1:] for word in pos_genre.split())
     pos_lyrics = re.sub(f'[{punctuation}]', '', re.sub("([ \t\n]){1,}", " ", lyrics)).lower().strip()
     pos_lyrics_sum = pos_lyrics if pos_lyrics == "" else summarize_text(pos_lyrics)
     neg = f"Sexual, Textual, Labeled, Distorted, Discontinuous, Blurry, Doll-Like, Overly-Plastic, Low-Quality, Painted, Smoothed, Artificial, Phony, Gaudy, Digital Effects."
     q = "\""
+    pos = f'Photorealistic, Hyperrealistic, Realistic Photography, High-Quality Photography, Natural, made for the { pos_genre } song "{ pos_song }"{ pos_lyrics_sum if pos_lyrics_sum == "" else ": " + q + pos_lyrics_sum + q }.'
     print(f"""
         Positive: {pos}
         Negative: {neg}
     """)
+    imgs = all_pipes(pos,neg,pos_artist,pos_song)
+    names = []
+    for img in imgs:
+        labeled_img = add_song_cover_text(img,artist,song,height*4,width*4)
+        name = f'{artist} - {song} ({index}).png'
+        labeled_img.save(name)
+        names.append(name)
+    return names
 # entry
         run = gr.Button("Generate",elem_classes="btn")
         run.click(
+            fn=handle_generation,
             inputs=[artist,song,genre,lyrics],
             outputs=[cover]
         )