Kokoro-API-1

Sleeping

App Files Files Community

Yaron Koresh committed on Jan 21

Commit

8aa0947

verified ·

1 Parent(s): 8172c90

Update app.py

Browse files

Files changed (1) hide show

app.py +189 -33

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """
 Modified parts included from these sources:
 - https://github.com/nidhaloff/deep-translator
 """
 import urllib
@@ -9,7 +10,7 @@ from bs4 import BeautifulSoup
 from abc import ABC, abstractmethod
 from pathlib import Path
 from langdetect import detect as get_language
-from typing import List, Optional, Union
 from collections import namedtuple
 from inspect import signature
 import os
@@ -38,9 +39,9 @@ import gradio as gr
 from lxml.html import fromstring
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file, save_file
-from diffusers import DiffusionPipeline, AutoencoderTiny
 from PIL import Image, ImageDraw, ImageFont
-from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer
 from refiners.fluxion.utils import manual_seed
 from refiners.foundationals.latent_diffusion import Solver, solvers
 from refiners.foundationals.latent_diffusion.stable_diffusion_1.multi_upscaler import (
@@ -51,8 +52,169 @@ from datetime import datetime
 working = False
-model = T5ForConditionalGeneration.from_pretrained("t5-large")
-tokenizer = T5Tokenizer.from_pretrained("t5-large")
 def log(msg):
     # Print `msg` to stdout, prefixed with the current local time of day.
     print(f'{datetime.now().time()} {msg}')
@@ -446,8 +608,8 @@ MAX_SEED = np.iinfo(np.int32).max
 # precision data
 seq=512
-image_steps=50
-img_accu=7.0
 # ui data
@@ -508,10 +670,13 @@ function custom(){
 # torch pipes
 taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=dtype).to(device)
 image_pipe = DiffusionPipeline.from_pretrained("ostris/Flex.1-alpha", torch_dtype=dtype, vae=taef1).to(device)
 image_pipe.enable_model_cpu_offload()
-image_pipe.enable_vae_slicing()
-image_pipe.enable_vae_tiling()
 # functionality
@@ -519,7 +684,7 @@ def upscaler(
     input_image: Image.Image,
     prompt: str = "Hyper realistic photography, Natural visual content.",
     negative_prompt: str = "Distorted, Discontinuous, Blurry, Doll-Like, Overly-Plastic, Low-Quality, Painted, Smoothed, Artificial, Phony, Gaudy, Digital Effects.",
-    seed: int = int(str(random.random()).split(".")[1]),
     upscale_factor: int = 2,
     controlnet_scale: float = 0.6,
     controlnet_decay: float = 1.0,
@@ -527,7 +692,7 @@ def upscaler(
     tile_width: int = 112,
     tile_height: int = 144,
     denoise_strength: float = 0.35,
-    num_inference_steps: int = 30,
     solver: str = "DDIM",
 ) -> Image.Image:
@@ -571,8 +736,8 @@ def _summarize(text):
     toks = tokenizer.encode( prefix + text, return_tensors="pt", truncation=False)
     gen = model.generate(
         toks,
-        length_penalty=0.01,
-        num_beams=8,
         early_stopping=True,
         max_length=512
     )
@@ -580,20 +745,11 @@ def _summarize(text):
     log(f'RET _summarize with ret as {ret}')
     return ret
-def summarize(text, max_words=10):
     log(f'CALL summarize')
     words = text.split()
-    if len(words) < 5:
-        print("Summarization Error: Text is too short, 5 words minimum.")
-        return text
-    if max_words < 5 or max_words > 500:
-        print("Summarization Error: max_words value must be between 5 and 500 words.")
-        return text
-    words_length = len(text.split())
     if words_length >= 510:
         while words_length >= 510:
@@ -606,12 +762,11 @@ def summarize(text, max_words=10):
             text = summ
             words_length = len(text.split())
-    while words_length > max_words:
         summ = _summarize(text)
         if summ == text:
             return text
         text = summ
-        words_length = len(text.split())
     log(f'RET summarize with text as {text}')
     return text
@@ -621,8 +776,7 @@ def generate_random_string(length):
     return ''.join(random.choice(characters) for _ in range(length))
 def pipe_generate_image(p1,p2,h,w):
-    log(f'CALL pipe_generate')
-    imgs = image_pipe(
             prompt=p1,
             negative_prompt=p2,
             height=h,
@@ -632,9 +786,8 @@ def pipe_generate_image(p1,p2,h,w):
             num_inference_steps=image_steps,
             max_sequence_length=seq,
             generator=torch.Generator(device).manual_seed(random.randint(0, MAX_SEED))
-    ).images
-    log(f'RET pipe_generate')
-    return imgs
 def add_song_cover_text(img,artist,song,h,w):
@@ -1273,6 +1426,9 @@ class GoogleTranslator(BaseTranslator):
 def translate(txt,to_lang="en",from_lang="auto"):
     log(f'CALL translate')
     translator = GoogleTranslator(from_lang=from_lang,to_lang=to_lang)
     translation = ""
     if len(txt) > 1000:
@@ -1323,7 +1479,7 @@ def handle_generation(artist,song,lyrics,h,w):
     pos_lyrics = pos_lyrics if pos_lyrics == "" else summarize(translate(pos_lyrics))
     pos_lyrics = re.sub(r"([ \t]){1,}", " ", pos_lyrics).lower().strip()
-    neg = f"Sexuality, Nudity, Human body, Human, Textual, Text, Distorted, Fake, Discontinuous, Blurry, Doll-Like, Overly Plastic, Low Quality, Paint, Smoothed, Artificial, Phony, Gaudy, Digital Effects."
     q = "\""
     pos = f'HQ Hyper-realistic professional photograph{ pos_lyrics if pos_lyrics == "" else ": " + q + pos_lyrics + q }.'
@@ -1336,7 +1492,7 @@ def handle_generation(artist,song,lyrics,h,w):
     img = all_pipes(pos,neg,h,w)
     labeled_img = add_song_cover_text(img,pos_artist,pos_song,h,w)
-    name = f'{generate_random_string(8)}.png'
     labeled_img.save(name)
     working = False

 """
 Modified parts included from these sources:
 - https://github.com/nidhaloff/deep-translator
+- https://huggingface.co/spaces/ostris/Flex.1-alpha
 """
 import urllib
 from abc import ABC, abstractmethod
 from pathlib import Path
 from langdetect import detect as get_language
+from typing import Any, Dict, List, Optional, Union
 from collections import namedtuple
 from inspect import signature
 import os
 from lxml.html import fromstring
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file, save_file
+from diffusers import DiffusionPipeline, AutoencoderTiny, FluxPipeline, FlowMatchEulerDiscreteScheduler
 from PIL import Image, ImageDraw, ImageFont
+from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer, CLIPTextModel, CLIPTokenizer, T5EncoderModel
 from refiners.fluxion.utils import manual_seed
 from refiners.foundationals.latent_diffusion import Solver, solvers
 from refiners.foundationals.latent_diffusion.stable_diffusion_1.multi_upscaler import (
+# Module-level flag; the visible tail of handle_generation resets it to False.
+# NOTE(review): presumably a "generation in progress" guard — confirm against its other uses.
 working = False
+# T5 checkpoint and its tokenizer, loaded once when the module is imported.
+# `_summarize` calls `tokenizer.encode` and `model.generate` on these.
+model = T5ForConditionalGeneration.from_pretrained("t5-base")
+tokenizer = T5Tokenizer.from_pretrained("t5-base")
+def calculate_shift(
+    image_seq_len,
+    base_seq_len: int = 256,
+    max_seq_len: int = 4096,
+    base_shift: float = 0.5,
+    max_shift: float = 1.16,
+):
+    """Linearly map an image sequence length to a shift value.
+
+    The result lies on the straight line through
+    ``(base_seq_len, base_shift)`` and ``(max_seq_len, max_shift)``.
+    Nothing is clamped, so lengths outside that range extrapolate.
+
+    Args:
+        image_seq_len: Sequence length to evaluate the line at.
+        base_seq_len: Sequence length of the lower anchor point.
+        max_seq_len: Sequence length of the upper anchor point.
+        base_shift: Shift value at ``base_seq_len``.
+        max_shift: Shift value at ``max_seq_len``.
+
+    Returns:
+        The interpolated (or extrapolated) shift for ``image_seq_len``.
+    """
+    # Rise over run between the two anchor points.
+    slope = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+    # Offset that makes the line pass through (base_seq_len, base_shift).
+    intercept = base_shift - slope * base_seq_len
+    return image_seq_len * slope + intercept
+def retrieve_timesteps(
+    scheduler,
+    num_inference_steps: Optional[int] = None,
+    device: Optional[Union[str, torch.device]] = None,
+    timesteps: Optional[List[int]] = None,
+    sigmas: Optional[List[float]] = None,
+    **kwargs,
+):
+    """Configure ``scheduler`` and return its timestep schedule.
+
+    Exactly one of three modes is used, in this order of precedence:
+    custom ``timesteps``, custom ``sigmas``, or a plain step count.
+
+    Args:
+        scheduler: Object exposing ``set_timesteps(...)`` and a
+            ``timesteps`` attribute that is read after the call.
+        num_inference_steps: Step count, used only when neither
+            ``timesteps`` nor ``sigmas`` is given.
+        device: Forwarded to ``scheduler.set_timesteps``.
+        timesteps: Custom timesteps, forwarded by keyword.
+        sigmas: Custom sigmas, forwarded by keyword.
+        **kwargs: Extra keyword arguments forwarded unchanged.
+
+    Returns:
+        ``(timesteps, num_inference_steps)``. With a custom schedule the
+        count is the length of the resulting ``scheduler.timesteps``;
+        otherwise it is the value that was passed in.
+
+    Raises:
+        ValueError: If both ``timesteps`` and ``sigmas`` are given.
+    """
+    has_timesteps = timesteps is not None
+    has_sigmas = sigmas is not None
+    if has_timesteps and has_sigmas:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
+
+    # No custom schedule: let the scheduler derive one from the step count.
+    if not (has_timesteps or has_sigmas):
+        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+        return scheduler.timesteps, num_inference_steps
+
+    if has_timesteps:
+        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+    else:
+        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+
+    # With a custom schedule the step count follows from what was produced.
+    schedule = scheduler.timesteps
+    return schedule, len(schedule)
+# FLUX pipeline function
+@torch.inference_mode()
+def flux_pipe_call_that_returns_an_iterable_of_images(
+    self,
+    prompt: Union[str, List[str]] = None,
+    prompt_2: Optional[Union[str, List[str]]] = None,
+    height: Optional[int] = None,
+    width: Optional[int] = None,
+    num_inference_steps: int = 28,
+    timesteps: List[int] = None,
+    guidance_scale: float = 3.5,
+    num_images_per_prompt: Optional[int] = 1,
+    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+    latents: Optional[torch.FloatTensor] = None,
+    prompt_embeds: Optional[torch.FloatTensor] = None,
+    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+    output_type: Optional[str] = "pil",
+    return_dict: bool = True,
+    joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+    max_sequence_length: int = 512,
+    good_vae: Optional[Any] = None,
+):
+    """Run the denoising loop as a generator that yields images.
+
+    Meant to be bound to a pipeline instance (``self``). On every
+    denoising step the current latents are decoded with ``self.vae`` and
+    yielded as a preview *before* the scheduler step is applied. After the
+    loop, the final latents are decoded once more and yielded as the last
+    item.
+
+    Args:
+        prompt, prompt_2: Text prompt(s) passed to ``self.encode_prompt``.
+        height, width: Output size; each defaults to
+            ``self.default_sample_size * self.vae_scale_factor``.
+        num_inference_steps: Number of sigmas generated when no custom
+            ``timesteps`` are given.
+        timesteps: Optional custom timesteps. When given, no sigmas are
+            generated, so the two never conflict in ``retrieve_timesteps``.
+        guidance_scale: Used only if
+            ``self.transformer.config.guidance_embeds`` is truthy.
+        num_images_per_prompt: Multiplier for the latent batch size.
+        generator, latents: Forwarded to ``self.prepare_latents``.
+        prompt_embeds, pooled_prompt_embeds: Precomputed embeddings; may
+            replace ``prompt``.
+        output_type: Forwarded to ``self.image_processor.postprocess``.
+        return_dict: Accepted for signature compatibility; not read here.
+        joint_attention_kwargs: Stored on the pipeline; its ``"scale"``
+            entry, if any, is passed as ``lora_scale``.
+        max_sequence_length: Forwarded to input checking and prompt
+            encoding.
+        good_vae: VAE used for the final decode. Falls back to
+            ``self.vae`` when omitted.
+
+    Yields:
+        The first item of each ``postprocess`` result: one preview per
+        executed step, then the final image.
+    """
+    height = height or self.default_sample_size * self.vae_scale_factor
+    width = width or self.default_sample_size * self.vae_scale_factor
+    # 1. Check inputs
+    self.check_inputs(
+        prompt,
+        prompt_2,
+        height,
+        width,
+        prompt_embeds=prompt_embeds,
+        pooled_prompt_embeds=pooled_prompt_embeds,
+        max_sequence_length=max_sequence_length,
+    )
+    self._guidance_scale = guidance_scale
+    self._joint_attention_kwargs = joint_attention_kwargs
+    self._interrupt = False
+    # 2. Define call parameters
+    # Fix: `len(prompt)` used to raise TypeError when the caller supplied only
+    # `prompt_embeds`. Take the batch size from the embeddings in that case.
+    # NOTE(review): assumes check_inputs rejects the case where both are
+    # missing — confirm.
+    if isinstance(prompt, str):
+        batch_size = 1
+    elif prompt is not None:
+        batch_size = len(prompt)
+    else:
+        batch_size = prompt_embeds.shape[0]
+    device = self._execution_device
+    # 3. Encode prompt
+    lora_scale = joint_attention_kwargs.get("scale", None) if joint_attention_kwargs is not None else None
+    prompt_embeds, pooled_prompt_embeds, text_ids = self.encode_prompt(
+        prompt=prompt,
+        prompt_2=prompt_2,
+        prompt_embeds=prompt_embeds,
+        pooled_prompt_embeds=pooled_prompt_embeds,
+        device=device,
+        num_images_per_prompt=num_images_per_prompt,
+        max_sequence_length=max_sequence_length,
+        lora_scale=lora_scale,
+    )
+    # 4. Prepare latent variables
+    num_channels_latents = self.transformer.config.in_channels // 4
+    latents, latent_image_ids = self.prepare_latents(
+        batch_size * num_images_per_prompt,
+        num_channels_latents,
+        height,
+        width,
+        prompt_embeds.dtype,
+        device,
+        generator,
+        latents,
+    )
+    # 5. Prepare timesteps
+    # Fix: sigmas used to be built unconditionally, so any caller-supplied
+    # `timesteps` hit the "only one of timesteps or sigmas" ValueError in
+    # retrieve_timesteps. Build sigmas only when no timesteps were given.
+    # NOTE(review): whether the scheduler accepts `timesteps=` depends on the
+    # installed scheduler version — confirm.
+    sigmas = None if timesteps is not None else np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+    image_seq_len = latents.shape[1]
+    mu = calculate_shift(
+        image_seq_len,
+        self.scheduler.config.base_image_seq_len,
+        self.scheduler.config.max_image_seq_len,
+        self.scheduler.config.base_shift,
+        self.scheduler.config.max_shift,
+    )
+    timesteps, num_inference_steps = retrieve_timesteps(
+        self.scheduler,
+        num_inference_steps,
+        device,
+        timesteps,
+        sigmas,
+        mu=mu,
+    )
+    self._num_timesteps = len(timesteps)
+    # Handle guidance
+    guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32).expand(latents.shape[0]) if self.transformer.config.guidance_embeds else None
+    # 6. Denoising loop
+    for i, t in enumerate(timesteps):
+        # When interrupted, the remaining steps are skipped without yielding.
+        if self.interrupt:
+            continue
+        timestep = t.expand(latents.shape[0]).to(latents.dtype)
+        noise_pred = self.transformer(
+            hidden_states=latents,
+            timestep=timestep / 1000,
+            guidance=guidance,
+            pooled_projections=pooled_prompt_embeds,
+            encoder_hidden_states=prompt_embeds,
+            txt_ids=text_ids,
+            img_ids=latent_image_ids,
+            joint_attention_kwargs=self.joint_attention_kwargs,
+            return_dict=False,
+        )[0]
+        # Yield intermediate result (the latents as they are before this
+        # step's scheduler update).
+        latents_for_image = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+        latents_for_image = (latents_for_image / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+        image = self.vae.decode(latents_for_image, return_dict=False)[0]
+        yield self.image_processor.postprocess(image, output_type=output_type)[0]
+        latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+        torch.cuda.empty_cache()
+    # Final image using good_vae
+    # Fix: the default `good_vae=None` used to crash on `.config`. Fall back
+    # to the pipeline's own VAE when no separate one is supplied.
+    final_vae = good_vae if good_vae is not None else self.vae
+    latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
+    latents = (latents / final_vae.config.scaling_factor) + final_vae.config.shift_factor
+    image = final_vae.decode(latents, return_dict=False)[0]
+    self.maybe_free_model_hooks()
+    torch.cuda.empty_cache()
+    yield self.image_processor.postprocess(image, output_type=output_type)[0]
 def log(msg):
     # Print `msg` to stdout, prefixed with the current local time of day.
     print(f'{datetime.now().time()} {msg}')
 # precision data
 seq=512
+image_steps=25
+img_accu=3.5
 # ui data
 # torch pipes
+# Tiny autoencoder; handed to the pipeline below as its `vae`.
 taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=dtype).to(device)
+# Second VAE, loaded from the "vae" subfolder of the same model repo as the pipeline.
+# NOTE(review): `AutoencoderKL` is not among the names in the visible
+# `from diffusers import ...` line — confirm it is imported elsewhere,
+# otherwise this statement raises NameError at import time.
+good_vae = AutoencoderKL.from_pretrained("ostris/Flex.1-alpha", subfolder="vae", torch_dtype=dtype).to(device)
 image_pipe = DiffusionPipeline.from_pretrained("ostris/Flex.1-alpha", torch_dtype=dtype, vae=taef1).to(device)
 image_pipe.enable_model_cpu_offload()
+torch.cuda.empty_cache()
+# Bind the module-level generator function to this pipeline instance so it
+# can be called as a method and receives `image_pipe` as `self`.
+image_pipe.flux_pipe_call_that_returns_an_iterable_of_images = flux_pipe_call_that_returns_an_iterable_of_images.__get__(image_pipe)
 # functionality
     input_image: Image.Image,
     prompt: str = "Hyper realistic photography, Natural visual content.",
     negative_prompt: str = "Distorted, Discontinuous, Blurry, Doll-Like, Overly-Plastic, Low-Quality, Painted, Smoothed, Artificial, Phony, Gaudy, Digital Effects.",
+    seed: int = random.randint(0, MAX_SEED),
     upscale_factor: int = 2,
     controlnet_scale: float = 0.6,
     controlnet_decay: float = 1.0,
     tile_width: int = 112,
     tile_height: int = 144,
     denoise_strength: float = 0.35,
+    num_inference_steps: int = 15,
     solver: str = "DDIM",
 ) -> Image.Image:
     toks = tokenizer.encode( prefix + text, return_tensors="pt", truncation=False)
     gen = model.generate(
         toks,
+        length_penalty=0.5,
+        num_beams=4,
         early_stopping=True,
         max_length=512
     )
     log(f'RET _summarize with ret as {ret}')
     return ret
+def summarize(text, max_len=500):
     log(f'CALL summarize')
     words = text.split()
+    words_length = len(words)
     if words_length >= 510:
         while words_length >= 510:
             text = summ
             words_length = len(text.split())
+    while len(text) > max_len:
         summ = _summarize(text)
         if summ == text:
             return text
         text = summ
     log(f'RET summarize with text as {text}')
     return text
     return ''.join(random.choice(characters) for _ in range(length))
 def pipe_generate_image(p1,p2,h,w):
+    for img in pipe.flux_pipe_call_that_returns_an_iterable_of_images(
             prompt=p1,
             negative_prompt=p2,
             height=h,
             num_inference_steps=image_steps,
             max_sequence_length=seq,
             generator=torch.Generator(device).manual_seed(random.randint(0, MAX_SEED))
+    ):
+        yield img
 def add_song_cover_text(img,artist,song,h,w):
 def translate(txt,to_lang="en",from_lang="auto"):
     log(f'CALL translate')
+    if from_lang == to_lang or get_language(txt) == to_lang:
+        print("Translation failed!")
+        return txt.strip().lower()
     translator = GoogleTranslator(from_lang=from_lang,to_lang=to_lang)
     translation = ""
     if len(txt) > 1000:
     pos_lyrics = pos_lyrics if pos_lyrics == "" else summarize(translate(pos_lyrics))
     pos_lyrics = re.sub(r"([ \t]){1,}", " ", pos_lyrics).lower().strip()
+    neg = f"Textual, Text, Distorted, Fake, Discontinuous, Blurry, Doll-Like, Overly Plastic, Low Quality, Paint, Smoothed, Artificial, Phony, Gaudy, Digital Effects."
     q = "\""
     pos = f'HQ Hyper-realistic professional photograph{ pos_lyrics if pos_lyrics == "" else ": " + q + pos_lyrics + q }.'
     img = all_pipes(pos,neg,h,w)
     labeled_img = add_song_cover_text(img,pos_artist,pos_song,h,w)
+    name = f'{generate_random_string(16)}.png'
     labeled_img.save(name)
     working = False