import os

os.environ["HF_HOME"] = "/data/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/data/huggingface"
os.makedirs("/data/huggingface/hub", exist_ok=True)

import torch
from diffusers import StableDiffusionImg2ImgPipeline
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from PIL import Image

# ---- SETTINGS ----
MODEL_ID = "runwayml/stable-diffusion-v1-5"
IPADAPTER_REPO = "h94/IP-Adapter"
IPADAPTER_WEIGHT_NAME = "ip-adapter_sd15.bin"
DEVICE = "cpu"  # Change to "cuda" if a GPU is available
CACHE_DIR = os.environ.get("HF_HOME", "/data/huggingface")

# Load the model ONCE at startup, not per request!
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    cache_dir=CACHE_DIR,
    # safety_checker=None,  # Disable for demo/testing; enable in prod
).to(DEVICE)

# Attach the IP-Adapter weights (the SD 1.5 adapter lives under "models/" in the repo)
pipe.load_ip_adapter(
    pretrained_model_name_or_path_or_dict=IPADAPTER_REPO,
    subfolder="models",
    weight_name=IPADAPTER_WEIGHT_NAME,
)

# Load the CLIP vision encoder and processor used to compute the IP-Adapter
# identity embedding. For the SD 1.5 adapter the encoder is stored in the
# "models/image_encoder" subfolder of the IP-Adapter repo; that folder ships no
# preprocessor config, so the default CLIP preprocessing (224x224 + CLIP
# normalization) is used.
vision_encoder = CLIPVisionModelWithProjection.from_pretrained(
    IPADAPTER_REPO,
    subfolder="models/image_encoder",
    cache_dir=CACHE_DIR,
).to(DEVICE)
image_processor = CLIPImageProcessor()


def generate_sticker(input_image: Image.Image, prompt: str) -> Image.Image:
    """
    Given a user image and a prompt, generate a sticker/emoji-style portrait.

    The pipeline is loaded once at module import (see above), not per request.
    """
    # 1. Preprocess the face crop and compute the IP-Adapter image embedding
    face_img = input_image.convert("RGB").resize((224, 224))
    inputs = image_processor(images=face_img, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        image_embeds = vision_encoder(**inputs).image_embeds  # shape (1, 1024)

    # Recent diffusers releases expect ip_adapter_image_embeds as a list with one
    # tensor per adapter, shaped (batch, num_images, dim), with the negative
    # (here all-zero) embedding stacked in front when classifier-free guidance is
    # on. A simpler alternative is to pass ip_adapter_image=face_img and let the
    # pipeline encode the image itself.
    ip_embeds = torch.cat([torch.zeros_like(image_embeds), image_embeds], dim=0).unsqueeze(1)

    # 2. Prepare the init image for the img2img pipeline
    init_image = input_image.convert("RGB").resize((512, 512))

    # 3. Run inference (moderate strength to preserve identity)
    result = pipe(
        prompt=prompt,
        image=init_image,
        ip_adapter_image_embeds=[ip_embeds],
        strength=0.65,
        guidance_scale=7.5,
        num_inference_steps=30,
    )

    # Return the generated image (as PIL)
    return result.images[0]
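

# Example usage -- a minimal sketch, not part of the module above. The input
# path "face.jpg" and the prompt are placeholders; run this only after the
# model weights have been downloaded into CACHE_DIR.
if __name__ == "__main__":
    user_photo = Image.open("face.jpg")  # hypothetical input photo
    sticker = generate_sticker(
        user_photo,
        prompt="cute sticker portrait, flat pastel colors, thick white outline, emoji style",
    )
    sticker.save("sticker.png")
    print("Saved sticker.png")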