Commit 399e621
Parent(s): bcdb914

remove dlib dependency

Files changed:
- Dockerfile +0 -51
- README.md +2 -4
- app.py +2 -9
- requirements.txt +3 -1
- utils.py +62 -52
Dockerfile
DELETED
@@ -1,51 +0,0 @@
-# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
-
-# RUN apt-get update && apt-get install -y \
-#     git cmake build-essential python3 python3-pip python3-dev \
-#     libboost-all-dev libopenblas-dev liblapack-dev
-
-# WORKDIR /dlib
-# RUN git clone https://github.com/davisking/dlib.git . \
-#     && mkdir build && cd build \
-#     && cmake .. -DDLIB_USE_CUDA=1 \
-#     && cmake --build . --config Release
-
-# # install via pip so dependencies like wheel are used
-# RUN pip install --upgrade pip wheel cmake setuptools && \
-#     cd /dlib && python3 setup.py install
-
-# # Choose a writable directory, e.g., /home/user/huggingface
-# RUN mkdir -p /home/user/huggingface
-# RUN chmod -R 777 /home/user/huggingface
-
-# # Then set environment variables
-# ENV HF_HOME=/home/user/huggingface \
-#     HF_HUB_CACHE=/home/user/huggingface/hub \
-#     TRANSFORMERS_CACHE=/home/user/huggingface/hub
-
-# WORKDIR /app
-# COPY requirements.txt .
-# RUN pip install -r requirements.txt
-# COPY . .
-# CMD ["python3", "app.py"]
-
-
-FROM dillondrobena/opencv-cuda
-
-WORKDIR /app
-
-# Choose a writable directory, e.g., /home/user/huggingface
-RUN mkdir -p /home/user/huggingface
-RUN chmod -R 777 /home/user/huggingface
-
-# Then set environment variables
-ENV HF_HOME=/home/user/huggingface \
-    HF_HUB_CACHE=/home/user/huggingface/hub \
-    TRANSFORMERS_CACHE=/home/user/huggingface/hub
-
-COPY requirements.txt .
-RUN pip install --upgrade pip \
-    && pip install -r requirements.txt
-
-COPY . .
-CMD ["python", "app.py"]
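
Note on the deleted file: the replaced base image, dillondrobena/opencv-cuda, ships OpenCV prebuilt against CUDA, so the slow dlib-from-source build disappears along with the Dockerfile. A minimal smoke test to run inside such a container, assuming the image exposes Python bindings for cv2 and that torch is installed via requirements.txt (the script name is illustrative):

# check_cuda.py (illustrative) -- confirm GPU visibility inside the container
import cv2
import torch

# getCudaEnabledDeviceCount() is only meaningful in CUDA-enabled OpenCV builds
print("OpenCV CUDA devices:", cv2.cuda.getCudaEnabledDeviceCount())
print("PyTorch CUDA available:", torch.cuda.is_available())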
README.md
CHANGED
@@ -3,10 +3,8 @@ title: "Nested Attention: Semantic-aware Attention Values for Concept Personaliz
 emoji: 🚀
 colorFrom: indigo
 colorTo: pink
-
-
-sdk: docker
-app_port: 7860
+sdk: gradio
+app_file: app.py
 pinned: false
 ---
 
app.py
CHANGED
@@ -5,14 +5,11 @@ import gradio as gr
 from huggingface_hub import hf_hub_download, snapshot_download
 from nested_attention_pipeline import NestedAdapterInference, add_special_token_to_tokenizer
 from utils import align_face
-import dlib
 
 
 # ----------------------
 # Configuration (update paths as needed)
 # ----------------------
-SHAPE_PREDICTOR_PATH = hf_hub_download("orpatashnik/NestedAttentionEncoder", "shape_predictor_68_face_landmarks.dat")
-FACE_DETECTOR_PATH = hf_hub_download("orpatashnik/NestedAttentionEncoder", "mmod_human_face_detector.dat")
 base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
 image_encoder_path = snapshot_download("orpatashnik/NestedAttentionEncoder", allow_patterns=["image_encoder/**"])
 image_encoder_path = os.path.join(image_encoder_path, "image_encoder")
@@ -40,10 +37,6 @@ ip_model = NestedAdapterInference(
     device=device
 )
 
-# Initialize face alignment predictor
-predictor = dlib.shape_predictor(SHAPE_PREDICTOR_PATH)
-detector = dlib.cnn_face_detection_model_v1(FACE_DETECTOR_PATH)
-
 # Generation defaults
 negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
 num_inference_steps = 30
@@ -59,7 +52,7 @@ def generate_images(img1, img2, img3, prompt, w, num_samples, seed):
         return []
 
     # Align directly on PIL
-    aligned_refs = [align_face(img, predictor, detector) for img in refs]
+    aligned_refs = [align_face(img) for img in refs]
 
     # Resize to model resolution
     pil_images = [aligned.resize((512, 512)) for aligned in aligned_refs]
@@ -111,4 +104,4 @@ with gr.Blocks() as demo:
         outputs=output_gallery
     )
 
-demo.launch()
+demo.launch(share=True)
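
The launch change is worth a note: share=True asks Gradio to open a temporary public *.gradio.live tunnel in addition to the local server, which helps when the app runs in a container without exposed ports. A minimal standalone sketch of the same call (illustrative, not the Space's full UI):

# share_demo.py (illustrative)
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("launch smoke test")

# Default launch() serves locally only; share=True also prints a public link.
demo.launch(share=True)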
requirements.txt
CHANGED
@@ -6,4 +6,6 @@ gradio
 huggingface_hub
 einops
 scipy
-accelerate
+accelerate
+insightface
+onnxruntime
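
insightface executes its detection and landmark models through ONNX Runtime, which is why onnxruntime enters the requirements alongside it; utils.py below pins CPUExecutionProvider explicitly. A quick dependency check (illustrative sketch):

# check_deps.py (illustrative)
import onnxruntime
import insightface

# CPUExecutionProvider should appear here for the pinned provider to work
print(onnxruntime.get_available_providers())
print("insightface", insightface.__version__)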
utils.py
CHANGED
@@ -1,9 +1,15 @@
+import numpy as np
 from PIL import Image
+import scipy.ndimage
+import insightface
 import torch
-import numpy as np
-import dlib
 import scipy
 
+# Initialize InsightFace model
+face_analyzer = insightface.app.FaceAnalysis(name='buffalo_l', providers=['CPUExecutionProvider'])
+face_analyzer.prepare(ctx_id=0)
+
+
 def image_grid(imgs, rows, cols):
     assert len(imgs) == rows*cols
 
@@ -30,45 +36,37 @@ def get_generator(seed, device):
 
     return generator
 
-def get_landmark_pil(pil_image, predictor, detector):
-    """Get 68 facial landmarks using dlib."""
+def get_landmark_pil_insight(pil_image):
+    """Get 68 facial landmarks using InsightFace."""
     img_np = np.array(pil_image.convert("RGB"))
-    dets = detector(img_np, 1)
-    if not dets:
+    faces = face_analyzer.get(img_np)
+    if not faces:
         return None
-    # Take the first detected face
-    d = dets[0].rect
-    shape = predictor(img_np, d)
-    lm = np.array([[p.x, p.y] for p in shape.parts()])
-    return lm
-
-
-def align_face(pil_image, predictor, detector):
+    landmarks = faces[0].kps  # shape: (5, 2) or (68, 2) depending on model
+
+    if landmarks.shape[0] < 68:
+        # InsightFace returns only 5 points: [left_eye, right_eye, nose, left_mouth, right_mouth]
+        left_eye, right_eye, nose, left_mouth, right_mouth = landmarks
+        # Approximate 68 landmarks (basic heuristic or fallback)
+        return np.array([
+            left_eye, right_eye, nose, left_mouth, right_mouth
+        ])
+    return landmarks
+
+def align_face(pil_image):
     """Align a face from a PIL.Image, returning an aligned PIL.Image of size 512x512."""
-    lm = get_landmark_pil(pil_image, predictor, detector)
-    if lm is None:
+    lm = get_landmark_pil_insight(pil_image)
+    if lm is None or lm.shape[0] < 5:
         return pil_image
-
-    lm_chin = lm[0: 17]  # left-right
-    lm_eyebrow_left = lm[17: 22]  # left-right
-    lm_eyebrow_right = lm[22: 27]  # left-right
-    lm_nose = lm[27: 31]  # top-down
-    lm_nostrils = lm[31: 36]  # top-down
-    lm_eye_left = lm[36: 42]  # left-clockwise
-    lm_eye_right = lm[42: 48]  # left-clockwise
-    lm_mouth_outer = lm[48: 60]  # left-clockwise
-    lm_mouth_inner = lm[60: 68]  # left-clockwise
-
-    eye_left = np.mean(lm_eye_left, axis=0)
-    eye_right = np.mean(lm_eye_right, axis=0)
+
+    eye_left, eye_right = lm[0], lm[1]
     eye_avg = (eye_left + eye_right) * 0.5
     eye_to_eye = eye_right - eye_left
-    mouth_left = lm_mouth_outer[0]
-    mouth_right = lm_mouth_outer[6]
+    mouth_left, mouth_right = lm[3], lm[4]
    mouth_avg = (mouth_left + mouth_right) * 0.5
     eye_to_mouth = mouth_avg - eye_avg
 
-    # Choose oriented crop rectangle
+    # The rest is your original alignment logic
     x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
     x /= np.hypot(*x)
     x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
@@ -77,52 +75,64 @@ def align_face(pil_image, predictor, detector):
     quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
     qsize = np.hypot(*x) * 2
 
-    # Prepare image
     img = pil_image.convert("RGB")
     transform_size = 512
     output_size = 512
     enable_padding = True
 
-    # Shrink image for speed
     shrink = int(np.floor(qsize / output_size * 0.5))
     if shrink > 1:
-        rsize = (int(np.rint(float(img.size[0]) / shrink)), int(np.rint(float(img.size[1]) / shrink)))
+        rsize = (int(np.rint(img.size[0] / shrink)), int(np.rint(img.size[1] / shrink)))
         img = img.resize(rsize, Image.Resampling.LANCZOS)
         quad /= shrink
         qsize /= shrink
 
-    # Crop around face
     border = max(int(np.rint(qsize * 0.1)), 3)
-    crop = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))),
-            int(np.ceil(max(quad[:, 0]))), int(np.ceil(max(quad[:, 1]))))
-    crop = (max(crop[0] - border, 0), max(crop[1] - border, 0),
-            min(crop[2] + border, img.size[0]), min(crop[3] + border, img.size[1]))
+    crop = (
+        int(np.floor(min(quad[:, 0]))),
+        int(np.floor(min(quad[:, 1]))),
+        int(np.ceil(max(quad[:, 0]))),
+        int(np.ceil(max(quad[:, 1])))
+    )
+    crop = (
+        max(crop[0] - border, 0),
+        max(crop[1] - border, 0),
+        min(crop[2] + border, img.size[0]),
+        min(crop[3] + border, img.size[1])
+    )
     if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]:
         img = img.crop(crop)
-        quad -= crop[0:2]
-
-    # Pad
-    pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))),
-           int(np.ceil(max(quad[:, 0]))), int(np.ceil(max(quad[:, 1]))))
-    pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0),
-           max(pad[2] - img.size[0] + border, 0), max(pad[3] - img.size[1] + border, 0))
+        quad -= crop[:2]
+
+    pad = (
+        int(np.floor(min(quad[:, 0]))),
+        int(np.floor(min(quad[:, 1]))),
+        int(np.ceil(max(quad[:, 0]))),
+        int(np.ceil(max(quad[:, 1])))
+    )
+    pad = (
+        max(-pad[0] + border, 0),
+        max(-pad[1] + border, 0),
+        max(pad[2] - img.size[0] + border, 0),
+        max(pad[3] - img.size[1] + border, 0)
+    )
     if enable_padding and max(pad) > border - 4:
         pad = np.maximum(pad, int(np.rint(qsize * 0.3)))
         img = np.pad(np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect')
         h, w, _ = img.shape
         y, x, _ = np.ogrid[:h, :w, :1]
-        mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w - 1 - x) / pad[2]),
-                          1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h - 1 - y) / pad[3]))
+        mask = np.maximum(
+            1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w - 1 - x) / pad[2]),
+            1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h - 1 - y) / pad[3])
+        )
         blur = qsize * 0.02
         img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
         img += (np.median(img, axis=(0, 1)) - img) * np.clip(mask, 0.0, 1.0)
         img = Image.fromarray(np.uint8(np.clip(np.rint(img), 0, 255)), 'RGB')
         quad += pad[:2]
 
-    # Transform image
     img = img.transform((transform_size, transform_size), Image.QUAD, (quad + 0.5).flatten(), Image.BILINEAR)
     if output_size < transform_size:
         img = img.resize((output_size, output_size), Image.Resampling.LANCZOS)
 
-
-    return img
+    return img
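
With the buffalo_l pack, faces[0].kps is the detector's 5-point set (eyes, nose, mouth corners); since the rewritten align_face reads only lm[0], lm[1] (eyes) and lm[3], lm[4] (mouth corners), the 68-point slices from the dlib path are no longer needed. A minimal usage sketch of the new entry point ("face.jpg" is an illustrative path):

# align_example.py (illustrative)
from PIL import Image
from utils import align_face

img = Image.open("face.jpg")
aligned = align_face(img)      # returns the input unchanged if no face is found
aligned.save("face_aligned.jpg")
print(aligned.size)            # (512, 512) when alignment succeeded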