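"""Kolors Face ID: AI portrait generator (Hugging Face Space, ZeroGPU).

Downloads the Kwai-Kolors/Kolors weights and the Kolors-IP-Adapter-FaceID-Plus
adapter, detects a face in the uploaded photo with InsightFace, and generates a
1024x1024 portrait conditioned on that identity via a Gradio UI.
"""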
import os
import random
from contextlib import nullcontext  # lets the CUDA and CPU inference paths share one `with` block

import cv2
import gradio as gr
import insightface
import numpy as np
import spaces
import torch
from diffusers import AutoencoderKL, EulerDiscreteScheduler
from huggingface_hub import snapshot_download, login
from insightface.app import FaceAnalysis
from kolors.models.modeling_chatglm import ChatGLMModel
from kolors.models.tokenization_chatglm import ChatGLMTokenizer
from kolors.models.unet_2d_condition import UNet2DConditionModel
from kolors.pipelines.pipeline_stable_diffusion_xl_chatglm_256_ipadapter_FaceID import StableDiffusionXLPipeline
from PIL import Image
from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor
# ---------------------------
# Runtime / device settings
# ---------------------------
HF_TOKEN = os.getenv("HF_TOKEN")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

if HF_TOKEN:
    login(token=HF_TOKEN)
    print("Successfully logged in to Hugging Face Hub")

print("Downloading models...")
ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors", token=HF_TOKEN)
ckpt_dir_faceid = snapshot_download(repo_id="Kwai-Kolors/Kolors-IP-Adapter-FaceID-Plus", token=HF_TOKEN)
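# snapshot_download resolves into the local Hugging Face cache, so restarts
# reuse previously downloaded files instead of fetching them again.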
print("Loading models on CPU first...") | |
# --------------------------- | |
# ChatGLM tokenizer pad fix | |
# --------------------------- | |
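# Recent transformers releases pass a `padding_side` kwarg into the tokenizer's
# internal _pad(); ChatGLMTokenizer's older signature does not accept it, so we
# strip the kwarg before delegating to the original implementation.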
original_chatglm_pad = ChatGLMTokenizer._pad if hasattr(ChatGLMTokenizer, "_pad") else None

def fixed_pad(self, *args, **kwargs):
    kwargs.pop("padding_side", None)
    if original_chatglm_pad:
        return original_chatglm_pad(self, *args, **kwargs)
    return super(ChatGLMTokenizer, self)._pad(*args, **kwargs)

ChatGLMTokenizer._pad = fixed_pad
# ---------------------------
# Load Kolors components (dtype fp16 on CUDA, fp32 on CPU)
# ---------------------------
text_encoder = ChatGLMModel.from_pretrained(
    f"{ckpt_dir}/text_encoder",
    torch_dtype=DTYPE,
    trust_remote_code=True,
)
tokenizer = ChatGLMTokenizer.from_pretrained(
    f"{ckpt_dir}/text_encoder",
    trust_remote_code=True,
)
vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", torch_dtype=DTYPE)
scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
unet = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", torch_dtype=DTYPE)

# CLIP image encoder + processor
clip_image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    "openai/clip-vit-large-patch14-336",
    torch_dtype=DTYPE,
    use_safetensors=True,
)
clip_image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")

# Create pipeline (initially on CPU to be safe with memory)
pipe = StableDiffusionXLPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=scheduler,
    face_clip_encoder=clip_image_encoder,
    face_clip_processor=clip_image_processor,
    force_zeros_for_empty_prompt=False,
)
print("Models loaded successfully!")
# ---------------------------
# InsightFace helper (force CPU provider to avoid CUDA init errors)
# ---------------------------
class FaceInfoGenerator:
    def __init__(self, root_dir: str = "./.insightface/"):
        providers = ["CPUExecutionProvider"]  # safe in environments without a GPU
        self.app = FaceAnalysis(
            name="antelopev2",
            root=root_dir,
            providers=providers,
        )
        self.app.prepare(ctx_id=0, det_size=(640, 640))

    def get_faceinfo_one_img(self, face_image: Image.Image):
        if face_image is None:
            return None
        # PIL RGB -> OpenCV BGR
        face_info = self.app.get(cv2.cvtColor(np.array(face_image), cv2.COLOR_RGB2BGR))
        if len(face_info) == 0:
            return None
        # Keep the largest detected face
        face_info = sorted(
            face_info,
            key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]),
        )[-1]
        return face_info

def face_bbox_to_square(bbox):
    l, t, r, b = bbox
    cent_x = (l + r) / 2
    cent_y = (t + b) / 2
    w, h = r - l, b - t
    rad = max(w, h) / 2
    return [cent_x - rad, cent_y - rad, cent_x + rad, cent_y + rad]
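# Example: bbox [0, 0, 100, 200] (w=100, h=200, radius 100) maps to the square
# [-50, 0, 150, 200]; PIL's crop() fills any out-of-bounds area with black.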

MAX_SEED = np.iinfo(np.int32).max
face_info_generator = FaceInfoGenerator()
# ---------------------------
# Inference function
# - Not decorated with @spaces.GPU here; wrapped conditionally below
#   (avoids crashes when no GPU is present)
# - Autocast only on CUDA
# ---------------------------
def infer(
    prompt,
    image=None,
    negative_prompt="low quality, blurry, distorted",
    seed=66,
    randomize_seed=False,
    guidance_scale=5.0,
    num_inference_steps=25,
):
    if image is None:
        gr.Warning("Please upload an image with a face.")
        return None, 0

    # Detect face (InsightFace on CPU)
    face_info = face_info_generator.get_faceinfo_one_img(image)
    if face_info is None:
        raise gr.Error("No face detected. Please upload an image with a clear face.")

    # Prepare crop for IP-Adapter FaceID
    face_bbox_square = face_bbox_to_square(face_info["bbox"])
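    # Resize to 336x336 to match the input resolution of openai/clip-vit-large-patch14-336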
    crop_image = image.crop(face_bbox_square).resize((336, 336))
    crop_image = [crop_image]  # pipeline expects a list
    face_embeds = torch.from_numpy(np.array([face_info["embedding"]]))
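    # One InsightFace identity embedding per reference image (512-d for the antelopev2 pack)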
    # Move modules to the target device with the proper dtype
    device = torch.device(DEVICE)
    global pipe
    pipe.vae = pipe.vae.to(device, dtype=DTYPE)
    pipe.text_encoder = pipe.text_encoder.to(device, dtype=DTYPE)
    pipe.unet = pipe.unet.to(device, dtype=DTYPE)
    pipe.face_clip_encoder = pipe.face_clip_encoder.to(device, dtype=DTYPE)
    face_embeds = face_embeds.to(device, dtype=DTYPE)

    # Load IP-Adapter weights (FaceID Plus)
    pipe.load_ip_adapter_faceid_plus(f"{ckpt_dir_faceid}/ipa-faceid-plus.bin", device=device)
    pipe.set_face_fidelity_scale(0.8)
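    # Assumption: as with other IP-Adapters, a higher fidelity scale preserves
    # more of the reference identity, while lower values favor the text prompt.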
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    generator = torch.Generator(device=device).manual_seed(seed)
    # Inference: autocast only on CUDA; nullcontext keeps the CPU path identical
    autocast_ctx = (
        torch.autocast(device_type="cuda", dtype=torch.float16)
        if DEVICE == "cuda"
        else nullcontext()
    )
    with torch.no_grad(), autocast_ctx:
        images = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=1024,
            width=1024,
            num_inference_steps=int(num_inference_steps),
            guidance_scale=float(guidance_scale),
            num_images_per_prompt=1,
            generator=generator,
            face_crop_image=crop_image,
            face_insightface_embeds=face_embeds,
        ).images
    result = images[0]

    # Offload back to CPU to free GPU memory
    try:
        pipe.vae = pipe.vae.to("cpu")
        pipe.text_encoder = pipe.text_encoder.to("cpu")
        pipe.unet = pipe.unet.to("cpu")
        pipe.face_clip_encoder = pipe.face_clip_encoder.to("cpu")
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
    except Exception:
        pass

    return result, seed
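# Minimal programmatic usage (a sketch; assumes a local photo at "face.jpg"):
#     from PIL import Image
#     portrait, used_seed = infer(
#         "A professional portrait photo, high quality",
#         image=Image.open("face.jpg").convert("RGB"),
#     )
#     portrait.save("portrait.png")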
# If CUDA is available, wrap infer with spaces.GPU so ZeroGPU can schedule it
if torch.cuda.is_available():
    infer = spaces.GPU(duration=120)(infer)
# ---------------------------
# Gradio UI
# ---------------------------
css = """
footer { visibility: hidden; }
#col-left, #col-right { max-width: 640px; margin: 0 auto; }
.gr-button { max-width: 100%; }
"""

with gr.Blocks(theme="soft", css=css) as Kolors:
    gr.HTML(
        """
        <div style='text-align: center;'>
            <h1>🎨 Kolors Face ID - AI Portrait Generator</h1>
            <p>Upload a face photo and create stunning AI portraits!</p>
            <div style='display:flex; justify-content:center; gap:12px; margin-top:20px;'>
                <a href="https://huggingface.co/spaces/openfree/Best-AI" target="_blank">
                    <img src="https://img.shields.io/badge/OpenFree-BEST%20AI-blue?style=for-the-badge" alt="OpenFree">
                </a>
                <a href="https://discord.gg/openfreeai" target="_blank">
                    <img src="https://img.shields.io/badge/Discord-OpenFree%20AI-purple?style=for-the-badge&logo=discord" alt="Discord">
                </a>
            </div>
            <div style='margin-top:8px;font-size:12px;opacity:.7;'>
                Device: {device}, DType: {dtype}
            </div>
        </div>
        """.format(device=DEVICE.upper(), dtype=str(DTYPE).replace("torch.", ""))
    )

    with gr.Row():
        with gr.Column(elem_id="col-left"):
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Describe the portrait style you want...",
                lines=3,
                value="A professional portrait photo, high quality",
            )
            image = gr.Image(label="Upload Face Image", type="pil", height=300)
            with gr.Accordion("Advanced Settings", open=False):
                negative_prompt = gr.Textbox(
                    label="Negative prompt",
                    value="low quality, blurry, distorted",
                )
                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=66)
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                guidance_scale = gr.Slider(label="Guidance", minimum=1, maximum=10, step=0.5, value=5.0)
                num_inference_steps = gr.Slider(label="Steps", minimum=10, maximum=50, step=5, value=25)
            button = gr.Button("🎨 Generate Portrait", variant="primary")
        with gr.Column(elem_id="col-right"):
            result = gr.Image(label="Generated Portrait")
            seed_used = gr.Number(label="Seed Used", precision=0)

    button.click(
        fn=infer,
        inputs=[prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps],
        outputs=[result, seed_used],
    )

if __name__ == "__main__":
    Kolors.queue(max_size=20).launch(debug=True)