kofaceid

Running on Zero

App Files Community

aiqtech committed 6 days ago

Commit

c1d518d

verified ·

1 Parent(s): fe7b6d8

Update app.py

Browse files

Files changed (1) hide show

app.py +138 -103

app.py CHANGED Viewed

@@ -17,66 +17,70 @@ from diffusers import EulerDiscreteScheduler
 from PIL import Image
 from insightface.app import FaceAnalysis
-# Login with HF token
 HF_TOKEN = os.getenv("HF_TOKEN")
 if HF_TOKEN:
     login(token=HF_TOKEN)
     print("Successfully logged in to Hugging Face Hub")
-# Download models
 print("Downloading models...")
 ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors", token=HF_TOKEN)
 ckpt_dir_faceid = snapshot_download(repo_id="Kwai-Kolors/Kolors-IP-Adapter-FaceID-Plus", token=HF_TOKEN)
 print("Loading models on CPU first...")
-# Fix for ChatGLMTokenizer - monkey patch the _pad method
 original_chatglm_pad = ChatGLMTokenizer._pad if hasattr(ChatGLMTokenizer, '_pad') else None
 def fixed_pad(self, *args, **kwargs):
-    # Remove the unexpected 'padding_side' argument if present
     kwargs.pop('padding_side', None)
     if original_chatglm_pad:
         return original_chatglm_pad(self, *args, **kwargs)
     else:
         return super(ChatGLMTokenizer, self)._pad(*args, **kwargs)
 ChatGLMTokenizer._pad = fixed_pad
-# Load models
 text_encoder = ChatGLMModel.from_pretrained(
-    f'{ckpt_dir}/text_encoder',
-    torch_dtype=torch.float16,
     trust_remote_code=True
 )
 tokenizer = ChatGLMTokenizer.from_pretrained(
-    f'{ckpt_dir}/text_encoder',
     trust_remote_code=True
 )
 vae = AutoencoderKL.from_pretrained(
     f"{ckpt_dir}/vae",
-    torch_dtype=torch.float16
 )
 scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
 unet = UNet2DConditionModel.from_pretrained(
     f"{ckpt_dir}/unet",
-    torch_dtype=torch.float16
 )
-# Load CLIP
 clip_image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-    'openai/clip-vit-large-patch14-336',
-    torch_dtype=torch.float16,
     use_safetensors=True
 )
-clip_image_processor = CLIPImageProcessor(size=336, crop_size=336)
-# Create pipeline
 pipe = StableDiffusionXLPipeline(
     vae=vae,
     text_encoder=text_encoder,
@@ -90,25 +94,39 @@ pipe = StableDiffusionXLPipeline(
 print("Models loaded successfully!")
-class FaceInfoGenerator():
-    def __init__(self, root_dir="./.insightface/"):
         self.app = FaceAnalysis(
-            name='antelopev2',
             root=root_dir,
-            providers=['CPUExecutionProvider']
         )
         self.app.prepare(ctx_id=0, det_size=(640, 640))
-    def get_faceinfo_one_img(self, face_image):
         if face_image is None:
             return None
         face_info = self.app.get(cv2.cvtColor(np.array(face_image), cv2.COLOR_RGB2BGR))
         if len(face_info) == 0:
             return None
-        else:
-            face_info = sorted(face_info, key=lambda x:(x['bbox'][2]-x['bbox'][0])*(x['bbox'][3]-x['bbox'][1]))[-1]
         return face_info
 def face_bbox_to_square(bbox):
@@ -116,102 +134,116 @@ def face_bbox_to_square(bbox):
     cent_x = (l + r) / 2
     cent_y = (t + b) / 2
     w, h = r - l, b - t
-    r = max(w, h) / 2
-    l0 = cent_x - r
-    r0 = cent_x + r
-    t0 = cent_y - r
-    b0 = cent_y + r
-    return [l0, t0, r0, b0]
 MAX_SEED = np.iinfo(np.int32).max
 face_info_generator = FaceInfoGenerator()
 @spaces.GPU(duration=120)
-def infer(prompt,
-          image=None,
-          negative_prompt="low quality, blurry, distorted",
-          seed=66,
-          randomize_seed=False,
-          guidance_scale=5.0,
-          num_inference_steps=50
-        ):
     if image is None:
         gr.Warning("Please upload an image with a face.")
         return None, 0
-    # Face detection on CPU
     face_info = face_info_generator.get_faceinfo_one_img(image)
     if face_info is None:
         raise gr.Error("No face detected. Please upload an image with a clear face.")
     face_bbox_square = face_bbox_to_square(face_info["bbox"])
-    crop_image = image.crop(face_bbox_square)
-    crop_image = crop_image.resize((336, 336))
-    crop_image = [crop_image]
     face_embeds = torch.from_numpy(np.array([face_info["embedding"]]))
-    # Move to GPU
-    device = torch.device("cuda")
     global pipe
-    # Move models to GPU
-    pipe.vae = pipe.vae.to(device)
-    pipe.text_encoder = pipe.text_encoder.to(device)
-    pipe.unet = pipe.unet.to(device)
-    pipe.face_clip_encoder = pipe.face_clip_encoder.to(device)
-    face_embeds = face_embeds.to(device, dtype=torch.float16)
-    # Load IP adapter
-    pipe.load_ip_adapter_faceid_plus(f'{ckpt_dir_faceid}/ipa-faceid-plus.bin', device=device)
     pipe.set_face_fidelity_scale(0.8)
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     generator = torch.Generator(device=device).manual_seed(seed)
-    # Generate image
     with torch.no_grad():
-        with torch.autocast(device_type="cuda", dtype=torch.float16):
-            result = pipe(
                 prompt=prompt,
                 negative_prompt=negative_prompt,
                 height=1024,
                 width=1024,
-                num_inference_steps=num_inference_steps,
-                guidance_scale=guidance_scale,
                 num_images_per_prompt=1,
                 generator=generator,
                 face_crop_image=crop_image,
                 face_insightface_embeds=face_embeds
-            ).images[0]
-    # Move models back to CPU to free GPU memory
-    pipe.vae = pipe.vae.to("cpu")
-    pipe.text_encoder = pipe.text_encoder.to("cpu")
-    pipe.unet = pipe.unet.to("cpu")
-    pipe.face_clip_encoder = pipe.face_clip_encoder.to("cpu")
-    torch.cuda.empty_cache()
     return result, seed
 css = """
-footer {
-    visibility: hidden;
-}
-#col-left, #col-right {
-    max-width: 640px;
-    margin: 0 auto;
-}
-.gr-button {
-    max-width: 100%;
-}
 """
-# Gradio interface
 with gr.Blocks(theme="soft", css=css) as Kolors:
     gr.HTML(
         """
@@ -226,10 +258,13 @@ with gr.Blocks(theme="soft", css=css) as Kolors:
                     <img src="https://img.shields.io/badge/Discord-OpenFree%20AI-purple?style=for-the-badge&logo=discord" alt="Discord">
                 </a>
             </div>
         </div>
-        """
     )
     with gr.Row():
         with gr.Column(elem_id="col-left"):
             prompt = gr.Textbox(
@@ -239,7 +274,7 @@ with gr.Blocks(theme="soft", css=css) as Kolors:
                 value="A professional portrait photo, high quality"
             )
             image = gr.Image(label="Upload Face Image", type="pil", height=300)
             with gr.Accordion("Advanced Settings", open=False):
                 negative_prompt = gr.Textbox(
                     label="Negative prompt",
@@ -247,15 +282,15 @@ with gr.Blocks(theme="soft", css=css) as Kolors:
                 )
                 seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=66)
                 randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-                guidance_scale = gr.Slider(label="Guidance", minimum=1, maximum=10, step=0.5, value=5)
                 num_inference_steps = gr.Slider(label="Steps", minimum=10, maximum=50, step=5, value=25)
             button = gr.Button("🎨 Generate Portrait", variant="primary")
         with gr.Column(elem_id="col-right"):
             result = gr.Image(label="Generated Portrait")
             seed_used = gr.Number(label="Seed Used", precision=0)
     button.click(
         fn=infer,
         inputs=[prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps],
@@ -263,4 +298,4 @@ with gr.Blocks(theme="soft", css=css) as Kolors:
     )
 if __name__ == "__main__":
-    Kolors.queue(max_size=20).launch(debug=True)

 from PIL import Image
 from insightface.app import FaceAnalysis
+# ---------------------------
+# Runtime / device settings
+# ---------------------------
 HF_TOKEN = os.getenv("HF_TOKEN")
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
 if HF_TOKEN:
     login(token=HF_TOKEN)
     print("Successfully logged in to Hugging Face Hub")
 print("Downloading models...")
 ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors", token=HF_TOKEN)
 ckpt_dir_faceid = snapshot_download(repo_id="Kwai-Kolors/Kolors-IP-Adapter-FaceID-Plus", token=HF_TOKEN)
 print("Loading models on CPU first...")
+# ---------------------------
+# ChatGLM tokenizer pad fix
+# ---------------------------
 original_chatglm_pad = ChatGLMTokenizer._pad if hasattr(ChatGLMTokenizer, '_pad') else None
 def fixed_pad(self, *args, **kwargs):
     kwargs.pop('padding_side', None)
     if original_chatglm_pad:
         return original_chatglm_pad(self, *args, **kwargs)
     else:
         return super(ChatGLMTokenizer, self)._pad(*args, **kwargs)
 ChatGLMTokenizer._pad = fixed_pad
+# ---------------------------
+# Load Kolors components
+# NOTE: dtype is fp16 on CUDA, fp32 on CPU to avoid NaNs on CPU
+# ---------------------------
 text_encoder = ChatGLMModel.from_pretrained(
+    f"{ckpt_dir}/text_encoder",
+    torch_dtype=DTYPE,
     trust_remote_code=True
 )
 tokenizer = ChatGLMTokenizer.from_pretrained(
+    f"{ckpt_dir}/text_encoder",
     trust_remote_code=True
 )
 vae = AutoencoderKL.from_pretrained(
     f"{ckpt_dir}/vae",
+    torch_dtype=DTYPE
 )
 scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
 unet = UNet2DConditionModel.from_pretrained(
     f"{ckpt_dir}/unet",
+    torch_dtype=DTYPE
 )
+# CLIP image encoder + processor
 clip_image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+    "openai/clip-vit-large-patch14-336",
+    torch_dtype=DTYPE,
     use_safetensors=True
 )
+# Prefer from_pretrained for config parity
+clip_image_processor = CLIPImageProcessor.from_pretrained(
+    "openai/clip-vit-large-patch14-336"
+)
+# Create pipeline (initially on CPU to be safe with memory)
 pipe = StableDiffusionXLPipeline(
     vae=vae,
     text_encoder=text_encoder,
 print("Models loaded successfully!")
+# ---------------------------
+# InsightFace helper (CPU by default; GPU if available)
+# ---------------------------
+class FaceInfoGenerator:
+    def __init__(self, root_dir: str = "./.insightface/"):
+        providers = ["CPUExecutionProvider"]
+        # Try to prefer CUDA provider if available in runtime
+        try:
+            import onnxruntime as ort
+            if "CUDAExecutionProvider" in ort.get_available_providers():
+                providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+        except Exception:
+            pass
         self.app = FaceAnalysis(
+            name="antelopev2",
             root=root_dir,
+            providers=providers
         )
         self.app.prepare(ctx_id=0, det_size=(640, 640))
+    def get_faceinfo_one_img(self, face_image: Image.Image):
         if face_image is None:
             return None
+        # PIL RGB -> OpenCV BGR
         face_info = self.app.get(cv2.cvtColor(np.array(face_image), cv2.COLOR_RGB2BGR))
         if len(face_info) == 0:
             return None
+        # Largest face
+        face_info = sorted(
+            face_info,
+            key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1])
+        )[-1]
         return face_info
 def face_bbox_to_square(bbox):
     cent_x = (l + r) / 2
     cent_y = (t + b) / 2
     w, h = r - l, b - t
+    rad = max(w, h) / 2
+    return [cent_x - rad, cent_y - rad, cent_x + rad, cent_y + rad]
 MAX_SEED = np.iinfo(np.int32).max
 face_info_generator = FaceInfoGenerator()
+# ---------------------------
+# Inference function
+# - Uses fp16 autocast only on CUDA
+# - Ensures dtype/device consistency to avoid NaNs
+# ---------------------------
 @spaces.GPU(duration=120)
+def infer(
+    prompt,
+    image=None,
+    negative_prompt="low quality, blurry, distorted",
+    seed=66,
+    randomize_seed=False,
+    guidance_scale=5.0,
+    num_inference_steps=25
+):
     if image is None:
         gr.Warning("Please upload an image with a face.")
         return None, 0
+    # Detect face (InsightFace)
     face_info = face_info_generator.get_faceinfo_one_img(image)
     if face_info is None:
         raise gr.Error("No face detected. Please upload an image with a clear face.")
+    # Prepare crop for IP-Adapter FaceID
     face_bbox_square = face_bbox_to_square(face_info["bbox"])
+    crop_image = image.crop(face_bbox_square).resize((336, 336))
+    crop_image = [crop_image]  # pipeline expects list
     face_embeds = torch.from_numpy(np.array([face_info["embedding"]]))
+    # Device move
+    device = torch.device(DEVICE)
     global pipe
+    # Move modules to device with proper dtype
+    pipe.vae = pipe.vae.to(device, dtype=DTYPE)
+    pipe.text_encoder = pipe.text_encoder.to(device, dtype=DTYPE)
+    pipe.unet = pipe.unet.to(device, dtype=DTYPE)
+    pipe.face_clip_encoder = pipe.face_clip_encoder.to(device, dtype=DTYPE)
+    face_embeds = face_embeds.to(device, dtype=DTYPE)
+    # Load IP-Adapter weights (FaceID Plus)
+    pipe.load_ip_adapter_faceid_plus(f"{ckpt_dir_faceid}/ipa-faceid-plus.bin", device=device)
     pipe.set_face_fidelity_scale(0.8)
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     generator = torch.Generator(device=device).manual_seed(seed)
+    # Inference: autocast only on CUDA
     with torch.no_grad():
+        if DEVICE == "cuda":
+            with torch.autocast(device_type="cuda", dtype=torch.float16):
+                images = pipe(
+                    prompt=prompt,
+                    negative_prompt=negative_prompt,
+                    height=1024,
+                    width=1024,
+                    num_inference_steps=int(num_inference_steps),
+                    guidance_scale=float(guidance_scale),
+                    num_images_per_prompt=1,
+                    generator=generator,
+                    face_crop_image=crop_image,
+                    face_insightface_embeds=face_embeds
+                ).images
+        else:
+            images = pipe(
                 prompt=prompt,
                 negative_prompt=negative_prompt,
                 height=1024,
                 width=1024,
+                num_inference_steps=int(num_inference_steps),
+                guidance_scale=float(guidance_scale),
                 num_images_per_prompt=1,
                 generator=generator,
                 face_crop_image=crop_image,
                 face_insightface_embeds=face_embeds
+            ).images
+    result = images[0]
+    # Offload back to CPU to free GPU memory
+    try:
+        pipe.vae = pipe.vae.to("cpu")
+        pipe.text_encoder = pipe.text_encoder.to("cpu")
+        pipe.unet = pipe.unet.to("cpu")
+        pipe.face_clip_encoder = pipe.face_clip_encoder.to("cpu")
+        if DEVICE == "cuda":
+            torch.cuda.empty_cache()
+    except Exception:
+        pass
     return result, seed
+# ---------------------------
+# Gradio UI
+# ---------------------------
 css = """
+footer { visibility: hidden; }
+#col-left, #col-right { max-width: 640px; margin: 0 auto; }
+.gr-button { max-width: 100%; }
 """
 with gr.Blocks(theme="soft", css=css) as Kolors:
     gr.HTML(
         """
                     <img src="https://img.shields.io/badge/Discord-OpenFree%20AI-purple?style=for-the-badge&logo=discord" alt="Discord">
                 </a>
             </div>
+            <div style='margin-top:8px;font-size:12px;opacity:.7;'>
+                Device: {device}, DType: {dtype}
+            </div>
         </div>
+        """.format(device=DEVICE.upper(), dtype=str(DTYPE).replace("torch.", ""))
     )
     with gr.Row():
         with gr.Column(elem_id="col-left"):
             prompt = gr.Textbox(
                 value="A professional portrait photo, high quality"
             )
             image = gr.Image(label="Upload Face Image", type="pil", height=300)
             with gr.Accordion("Advanced Settings", open=False):
                 negative_prompt = gr.Textbox(
                     label="Negative prompt",
                 )
                 seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=66)
                 randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
+                guidance_scale = gr.Slider(label="Guidance", minimum=1, maximum=10, step=0.5, value=5.0)
                 num_inference_steps = gr.Slider(label="Steps", minimum=10, maximum=50, step=5, value=25)
             button = gr.Button("🎨 Generate Portrait", variant="primary")
         with gr.Column(elem_id="col-right"):
             result = gr.Image(label="Generated Portrait")
             seed_used = gr.Number(label="Seed Used", precision=0)
     button.click(
         fn=infer,
         inputs=[prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps],
     )
 if __name__ == "__main__":
+    Kolors.queue(max_size=20).launch(debug=True)