Spaces:
Running
Running
Yaron Koresh
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -24,8 +24,7 @@ from lxml.html import fromstring
|
|
24 |
from diffusers.utils import export_to_video, load_image
|
25 |
from huggingface_hub import hf_hub_download
|
26 |
from safetensors.torch import load_file, save_file
|
27 |
-
from diffusers import StableDiffusionPipeline, CogVideoXImageToVideoPipeline
|
28 |
-
from diffusers.models import AutoencoderKL #, MotionAdapter
|
29 |
from PIL import Image, ImageDraw, ImageFont
|
30 |
|
31 |
# logging
|
@@ -48,23 +47,20 @@ else:
|
|
48 |
device = "cpu"
|
49 |
dtype = torch.float16
|
50 |
|
51 |
-
|
52 |
-
base = "SG161222/Realistic_Vision_V5.1_noVAE"
|
53 |
-
vae_id = "stabilityai/sd-vae-ft-mse"
|
54 |
-
#motion_adapter = "guoyww/animatediff-motion-adapter-v1-5-3"
|
55 |
|
56 |
# variable data
|
57 |
|
58 |
-
|
59 |
|
60 |
# precision data
|
61 |
|
62 |
seq=512
|
63 |
-
fast=False
|
64 |
fps=20
|
65 |
width=768
|
66 |
height=768
|
67 |
-
|
|
|
68 |
accu=7
|
69 |
|
70 |
# ui data
|
@@ -112,30 +108,14 @@ function custom(){
|
|
112 |
# torch pipes
|
113 |
|
114 |
image_pipe = StableDiffusionPipeline.from_pretrained(base, torch_dtype=dtype, safety_checker=None).to(device)
|
115 |
-
#adapter = MotionAdapter.from_pretrained(motion_adapter, torch_dtype=dtype, safety_checker=None).to(device)
|
116 |
-
vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device)
|
117 |
-
image_pipe.vae = vae
|
118 |
-
|
119 |
-
scheduler = DDIMScheduler.from_pretrained(
|
120 |
-
base,
|
121 |
-
subfolder="scheduler",
|
122 |
-
clip_sample=False,
|
123 |
-
timestep_spacing="linspace",
|
124 |
-
beta_schedule="linear",
|
125 |
-
steps_offset=1,
|
126 |
-
)
|
127 |
-
|
128 |
video_pipe = CogVideoXImageToVideoPipeline.from_pretrained(
|
129 |
"THUDM/CogVideoX-5b-I2V",
|
130 |
torch_dtype=torch.bfloat16
|
131 |
).to(device)
|
132 |
-
video_pipe.scheduler = scheduler
|
133 |
|
134 |
video_pipe.vae.enable_tiling()
|
135 |
video_pipe.vae.enable_slicing()
|
136 |
-
#pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
137 |
video_pipe.enable_model_cpu_offload()
|
138 |
-
#pipe.enable_free_init(method="butterworth", use_fast_sampling=fast)
|
139 |
|
140 |
# functionality
|
141 |
|
@@ -199,7 +179,7 @@ def pipe_generate(img,p1,p2,motion,time,title):
|
|
199 |
width=width,
|
200 |
guidance_scale=accu,
|
201 |
num_images_per_prompt=1,
|
202 |
-
num_inference_steps=
|
203 |
max_sequence_length=seq,
|
204 |
need_safetycheck=False,
|
205 |
generator=torch.Generator(device).manual_seed(int(str(random.random()).split(".")[1]))
|
@@ -217,20 +197,11 @@ def pipe_generate(img,p1,p2,motion,time,title):
|
|
217 |
if time == 0.0:
|
218 |
return img
|
219 |
|
220 |
-
if last_motion != motion:
|
221 |
-
if last_motion != "":
|
222 |
-
pipe.unload_lora_weights()
|
223 |
-
if motion != "":
|
224 |
-
pipe.load_lora_weights(motion, adapter_name="motion")
|
225 |
-
pipe.fuse_lora()
|
226 |
-
pipe.set_adapters("motion", [0.7])
|
227 |
-
last_motion = motion
|
228 |
-
|
229 |
return video_pipe(
|
230 |
prompt=p1,
|
231 |
negative_prompt=p2,
|
232 |
image=img,
|
233 |
-
num_inference_steps=
|
234 |
guidance_scale=accu,
|
235 |
num_videos_per_prompt=1,
|
236 |
num_frames=(fps*time),
|
@@ -245,14 +216,14 @@ def handle_generate(*_inp):
|
|
245 |
inp[2] = translate(inp[2],"english")
|
246 |
|
247 |
if inp[2] != "":
|
248 |
-
inp[2] = ", related to: " + inp[2]
|
249 |
|
250 |
-
inp[2] = f"
|
251 |
|
252 |
if inp[1] != "":
|
253 |
-
inp[1] = ", related to: " + inp[1]
|
254 |
|
255 |
-
inp[1] = f'
|
256 |
|
257 |
print(f"""
|
258 |
|
@@ -305,27 +276,9 @@ def ui():
|
|
305 |
maximum=600.0,
|
306 |
value=0.0,
|
307 |
step=5.0,
|
308 |
-
label="
|
309 |
)
|
310 |
-
|
311 |
-
label='GIF camera movement',
|
312 |
-
show_label=True,
|
313 |
-
container=False,
|
314 |
-
choices=[
|
315 |
-
("(No Effect)", ""),
|
316 |
-
("Zoom in", "guoyww/animatediff-motion-lora-zoom-in"),
|
317 |
-
("Zoom out", "guoyww/animatediff-motion-lora-zoom-out"),
|
318 |
-
("Tilt up", "guoyww/animatediff-motion-lora-tilt-up"),
|
319 |
-
("Tilt down", "guoyww/animatediff-motion-lora-tilt-down"),
|
320 |
-
("Pan left", "guoyww/animatediff-motion-lora-pan-left"),
|
321 |
-
("Pan right", "guoyww/animatediff-motion-lora-pan-right"),
|
322 |
-
("Roll left", "guoyww/animatediff-motion-lora-rolling-anticlockwise"),
|
323 |
-
("Roll right", "guoyww/animatediff-motion-lora-rolling-clockwise"),
|
324 |
-
],
|
325 |
-
value="",
|
326 |
-
interactive=True
|
327 |
-
)
|
328 |
-
with gr.Row(elem_id="col-container"):
|
329 |
with gr.Column():
|
330 |
img = gr.Image(label="Upload photo",show_label=True,container=False,type="pil")
|
331 |
with gr.Column():
|
|
|
24 |
from diffusers.utils import export_to_video, load_image
|
25 |
from huggingface_hub import hf_hub_download
|
26 |
from safetensors.torch import load_file, save_file
|
27 |
+
from diffusers import StableDiffusionPipeline, CogVideoXImageToVideoPipeline
|
|
|
28 |
from PIL import Image, ImageDraw, ImageFont
|
29 |
|
30 |
# logging
|
|
|
47 |
device = "cpu"
|
48 |
dtype = torch.float16
|
49 |
|
50 |
+
base = "emilianJR/epiCRealism"
|
|
|
|
|
|
|
51 |
|
52 |
# variable data
|
53 |
|
54 |
+
|
55 |
|
56 |
# precision data
|
57 |
|
58 |
seq=512
|
|
|
59 |
fps=20
|
60 |
width=768
|
61 |
height=768
|
62 |
+
image_steps=40
|
63 |
+
video_steps=20
|
64 |
accu=7
|
65 |
|
66 |
# ui data
|
|
|
108 |
# torch pipes
|
109 |
|
110 |
image_pipe = StableDiffusionPipeline.from_pretrained(base, torch_dtype=dtype, safety_checker=None).to(device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
video_pipe = CogVideoXImageToVideoPipeline.from_pretrained(
|
112 |
"THUDM/CogVideoX-5b-I2V",
|
113 |
torch_dtype=torch.bfloat16
|
114 |
).to(device)
|
|
|
115 |
|
116 |
video_pipe.vae.enable_tiling()
|
117 |
video_pipe.vae.enable_slicing()
|
|
|
118 |
video_pipe.enable_model_cpu_offload()
|
|
|
119 |
|
120 |
# functionality
|
121 |
|
|
|
179 |
width=width,
|
180 |
guidance_scale=accu,
|
181 |
num_images_per_prompt=1,
|
182 |
+
num_inference_steps=image_steps,
|
183 |
max_sequence_length=seq,
|
184 |
need_safetycheck=False,
|
185 |
generator=torch.Generator(device).manual_seed(int(str(random.random()).split(".")[1]))
|
|
|
197 |
if time == 0.0:
|
198 |
return img
|
199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
return video_pipe(
|
201 |
prompt=p1,
|
202 |
negative_prompt=p2,
|
203 |
image=img,
|
204 |
+
num_inference_steps=video_steps,
|
205 |
guidance_scale=accu,
|
206 |
num_videos_per_prompt=1,
|
207 |
num_frames=(fps*time),
|
|
|
216 |
inp[2] = translate(inp[2],"english")
|
217 |
|
218 |
if inp[2] != "":
|
219 |
+
inp[2] = ", related to: " + inp[2]
|
220 |
|
221 |
+
inp[2] = f"faked, errored, unreal, off topic, pixelated, deformed, and semi-realistic, cgi, 3d, sketch, cartoon, drawing, anime, cropped, out of frame, low quality, textual, jpeg artifacts, ugly, duplicated, weird, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutations, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck{inp[2]}"
|
222 |
|
223 |
if inp[1] != "":
|
224 |
+
inp[1] = ", related to: " + inp[1]
|
225 |
|
226 |
+
inp[1] = f'photographed, realistic, true, genuine, dynamic poze, authentic, deep field, reasonable, natural, best quality, focused, highly detailed{inp[1]}'
|
227 |
|
228 |
print(f"""
|
229 |
|
|
|
276 |
maximum=600.0,
|
277 |
value=0.0,
|
278 |
step=5.0,
|
279 |
+
label="Duration (0s = PNG)"
|
280 |
)
|
281 |
+
with gr.Row(elem_id="col-container"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
282 |
with gr.Column():
|
283 |
img = gr.Image(label="Upload photo",show_label=True,container=False,type="pil")
|
284 |
with gr.Column():
|