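"""Kolors Face ID: AI portrait generator (Hugging Face Space, ZeroGPU).

Downloads the Kwai-Kolors/Kolors weights and the Kolors-IP-Adapter-FaceID-Plus
adapter, detects a face in the uploaded photo with InsightFace, and generates a
1024x1024 portrait conditioned on that identity via a Gradio UI.
"""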
import os
import random
from contextlib import nullcontext  # lets the CUDA and CPU inference paths share one `with` block

import cv2
import gradio as gr
import insightface
import numpy as np
import spaces
import torch
from diffusers import AutoencoderKL, EulerDiscreteScheduler
from huggingface_hub import snapshot_download, login
from insightface.app import FaceAnalysis
from kolors.models.modeling_chatglm import ChatGLMModel
from kolors.models.tokenization_chatglm import ChatGLMTokenizer
from kolors.models.unet_2d_condition import UNet2DConditionModel
from kolors.pipelines.pipeline_stable_diffusion_xl_chatglm_256_ipadapter_FaceID import StableDiffusionXLPipeline
from PIL import Image
from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor
# ---------------------------
# Runtime / device settings
# ---------------------------
HF_TOKEN = os.getenv("HF_TOKEN")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

if HF_TOKEN:
    login(token=HF_TOKEN)
    print("Successfully logged in to Hugging Face Hub")

print("Downloading models...")
ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors", token=HF_TOKEN)
ckpt_dir_faceid = snapshot_download(repo_id="Kwai-Kolors/Kolors-IP-Adapter-FaceID-Plus", token=HF_TOKEN)
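# snapshot_download resolves into the local Hugging Face cache, so restarts
# reuse previously downloaded files instead of fetching them again.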
print("Loading models on CPU first...") | |
# --------------------------- | |
# ChatGLM tokenizer pad fix | |
# --------------------------- | |
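# Recent transformers releases pass a `padding_side` kwarg into the tokenizer's
# internal _pad(); ChatGLMTokenizer's older signature does not accept it, so we
# strip the kwarg before delegating to the original implementation.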
original_chatglm_pad = ChatGLMTokenizer._pad if hasattr(ChatGLMTokenizer, "_pad") else None

def fixed_pad(self, *args, **kwargs):
    kwargs.pop("padding_side", None)
    if original_chatglm_pad:
        return original_chatglm_pad(self, *args, **kwargs)
    return super(ChatGLMTokenizer, self)._pad(*args, **kwargs)

ChatGLMTokenizer._pad = fixed_pad
# ---------------------------
# Load Kolors components (dtype fp16 on CUDA, fp32 on CPU)
# ---------------------------
text_encoder = ChatGLMModel.from_pretrained(
    f"{ckpt_dir}/text_encoder",
    torch_dtype=DTYPE,
    trust_remote_code=True,
)
tokenizer = ChatGLMTokenizer.from_pretrained(
    f"{ckpt_dir}/text_encoder",
    trust_remote_code=True,
)
vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", torch_dtype=DTYPE)
scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
unet = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", torch_dtype=DTYPE)

# CLIP image encoder + processor
clip_image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    "openai/clip-vit-large-patch14-336",
    torch_dtype=DTYPE,
    use_safetensors=True,
)
clip_image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")

# Create pipeline (initially on CPU to be safe with memory)
pipe = StableDiffusionXLPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=scheduler,
    face_clip_encoder=clip_image_encoder,
    face_clip_processor=clip_image_processor,
    force_zeros_for_empty_prompt=False,
)
print("Models loaded successfully!")
# ---------------------------
# InsightFace helper (force CPU provider to avoid CUDA init errors)
# ---------------------------
class FaceInfoGenerator:
    def __init__(self, root_dir: str = "./.insightface/"):
        providers = ["CPUExecutionProvider"]  # safe in environments without a GPU
        self.app = FaceAnalysis(
            name="antelopev2",
            root=root_dir,
            providers=providers,
        )
        self.app.prepare(ctx_id=0, det_size=(640, 640))

    def get_faceinfo_one_img(self, face_image: Image.Image):
        if face_image is None:
            return None
        # PIL RGB -> OpenCV BGR
        face_info = self.app.get(cv2.cvtColor(np.array(face_image), cv2.COLOR_RGB2BGR))
        if len(face_info) == 0:
            return None
        # Keep the largest detected face
        face_info = sorted(
            face_info,
            key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]),
        )[-1]
        return face_info

def face_bbox_to_square(bbox):
    l, t, r, b = bbox
    cent_x = (l + r) / 2
    cent_y = (t + b) / 2
    w, h = r - l, b - t
    rad = max(w, h) / 2
    return [cent_x - rad, cent_y - rad, cent_x + rad, cent_y + rad]
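# Example: bbox [0, 0, 100, 200] (w=100, h=200, radius 100) maps to the square
# [-50, 0, 150, 200]; PIL's crop() fills any out-of-bounds area with black.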

MAX_SEED = np.iinfo(np.int32).max
face_info_generator = FaceInfoGenerator()
# ---------------------------
# Inference function
# - Not decorated with @spaces.GPU here; wrapped conditionally below
#   (avoids crashes when no GPU is present)
# - Autocast only on CUDA
# ---------------------------
def infer(
    prompt,
    image=None,
    negative_prompt="low quality, blurry, distorted",
    seed=66,
    randomize_seed=False,
    guidance_scale=5.0,
    num_inference_steps=25,
):
    if image is None:
        gr.Warning("Please upload an image with a face.")
        return None, 0

    # Detect face (InsightFace on CPU)
    face_info = face_info_generator.get_faceinfo_one_img(image)
    if face_info is None:
        raise gr.Error("No face detected. Please upload an image with a clear face.")

    # Prepare crop for IP-Adapter FaceID
    face_bbox_square = face_bbox_to_square(face_info["bbox"])
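    # Resize to 336x336 to match the input resolution of openai/clip-vit-large-patch14-336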
    crop_image = image.crop(face_bbox_square).resize((336, 336))
    crop_image = [crop_image]  # pipeline expects a list
    face_embeds = torch.from_numpy(np.array([face_info["embedding"]]))
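    # One InsightFace identity embedding per reference image (512-d for the antelopev2 pack)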
    # Move modules to the target device with the proper dtype
    device = torch.device(DEVICE)
    global pipe
    pipe.vae = pipe.vae.to(device, dtype=DTYPE)
    pipe.text_encoder = pipe.text_encoder.to(device, dtype=DTYPE)
    pipe.unet = pipe.unet.to(device, dtype=DTYPE)
    pipe.face_clip_encoder = pipe.face_clip_encoder.to(device, dtype=DTYPE)
    face_embeds = face_embeds.to(device, dtype=DTYPE)

    # Load IP-Adapter weights (FaceID Plus)
    pipe.load_ip_adapter_faceid_plus(f"{ckpt_dir_faceid}/ipa-faceid-plus.bin", device=device)
    pipe.set_face_fidelity_scale(0.8)
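    # Assumption: as with other IP-Adapters, a higher fidelity scale preserves
    # more of the reference identity, while lower values favor the text prompt.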
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    generator = torch.Generator(device=device).manual_seed(seed)
    # Inference: autocast only on CUDA; nullcontext keeps the CPU path identical
    autocast_ctx = (
        torch.autocast(device_type="cuda", dtype=torch.float16)
        if DEVICE == "cuda"
        else nullcontext()
    )
    with torch.no_grad(), autocast_ctx:
        images = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=1024,
            width=1024,
            num_inference_steps=int(num_inference_steps),
            guidance_scale=float(guidance_scale),
            num_images_per_prompt=1,
            generator=generator,
            face_crop_image=crop_image,
            face_insightface_embeds=face_embeds,
        ).images
    result = images[0]

    # Offload back to CPU to free GPU memory
    try:
        pipe.vae = pipe.vae.to("cpu")
        pipe.text_encoder = pipe.text_encoder.to("cpu")
        pipe.unet = pipe.unet.to("cpu")
        pipe.face_clip_encoder = pipe.face_clip_encoder.to("cpu")
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
    except Exception:
        pass

    return result, seed
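# Minimal programmatic usage (a sketch; assumes a local photo at "face.jpg"):
#     from PIL import Image
#     portrait, used_seed = infer(
#         "A professional portrait photo, high quality",
#         image=Image.open("face.jpg").convert("RGB"),
#     )
#     portrait.save("portrait.png")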
# If CUDA is available, wrap infer with spaces.GPU so ZeroGPU can schedule it
if torch.cuda.is_available():
    infer = spaces.GPU(duration=120)(infer)
# ---------------------------
# Gradio UI
# ---------------------------
css = """
footer { visibility: hidden; }
#col-left, #col-right { max-width: 640px; margin: 0 auto; }
.gr-button { max-width: 100%; }
"""

with gr.Blocks(theme="soft", css=css) as Kolors:
    gr.HTML(
        """
        <div style='text-align: center;'>
            <h1>🎨 Kolors Face ID - AI Portrait Generator</h1>
            <p>Upload a face photo and create stunning AI portraits!</p>
            <div style='display:flex; justify-content:center; gap:12px; margin-top:20px;'>
                <a href="https://huggingface.co/spaces/openfree/Best-AI" target="_blank">
                    <img src="https://img.shields.io/badge/OpenFree-BEST%20AI-blue?style=for-the-badge" alt="OpenFree">
                </a>
                <a href="https://discord.gg/openfreeai" target="_blank">
                    <img src="https://img.shields.io/badge/Discord-OpenFree%20AI-purple?style=for-the-badge&logo=discord" alt="Discord">
                </a>
            </div>
            <div style='margin-top:8px;font-size:12px;opacity:.7;'>
                Device: {device}, DType: {dtype}
            </div>
        </div>
        """.format(device=DEVICE.upper(), dtype=str(DTYPE).replace("torch.", ""))
    )

    with gr.Row():
        with gr.Column(elem_id="col-left"):
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Describe the portrait style you want...",
                lines=3,
                value="A professional portrait photo, high quality",
            )
            image = gr.Image(label="Upload Face Image", type="pil", height=300)
            with gr.Accordion("Advanced Settings", open=False):
                negative_prompt = gr.Textbox(
                    label="Negative prompt",
                    value="low quality, blurry, distorted",
                )
                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=66)
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                guidance_scale = gr.Slider(label="Guidance", minimum=1, maximum=10, step=0.5, value=5.0)
                num_inference_steps = gr.Slider(label="Steps", minimum=10, maximum=50, step=5, value=25)
            button = gr.Button("🎨 Generate Portrait", variant="primary")
        with gr.Column(elem_id="col-right"):
            result = gr.Image(label="Generated Portrait")
            seed_used = gr.Number(label="Seed Used", precision=0)

    button.click(
        fn=infer,
        inputs=[prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps],
        outputs=[result, seed_used],
    )

if __name__ == "__main__":
    Kolors.queue(max_size=20).launch(debug=True)