Commit · 399e621
Parent(s): bcdb914
remove dlib dependency

Files changed:
- Dockerfile +0 -51
- README.md +2 -4
- app.py +2 -9
- requirements.txt +3 -1
- utils.py +62 -52

Dockerfile DELETED
@@ -1,51 +0,0 @@
-# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
-
-# RUN apt-get update && apt-get install -y \
-#     git cmake build-essential python3 python3-pip python3-dev \
-#     libboost-all-dev libopenblas-dev liblapack-dev
-
-# WORKDIR /dlib
-# RUN git clone https://github.com/davisking/dlib.git . \
-#     && mkdir build && cd build \
-#     && cmake .. -DDLIB_USE_CUDA=1 \
-#     && cmake --build . --config Release
-
-# # install via pip so dependencies like wheel are used
-# RUN pip install --upgrade pip wheel cmake setuptools && \
-#     cd /dlib && python3 setup.py install
-
-# # Choose a writable directory, e.g., /home/user/huggingface
-# RUN mkdir -p /home/user/huggingface
-# RUN chmod -R 777 /home/user/huggingface
-
-# # Then set environment variables
-# ENV HF_HOME=/home/user/huggingface \
-#     HF_HUB_CACHE=/home/user/huggingface/hub \
-#     TRANSFORMERS_CACHE=/home/user/huggingface/hub
-
-# WORKDIR /app
-# COPY requirements.txt .
-# RUN pip install -r requirements.txt
-# COPY . .
-# CMD ["python3", "app.py"]
-
-
-FROM dillondrobena/opencv-cuda
-
-WORKDIR /app
-
-# Choose a writable directory, e.g., /home/user/huggingface
-RUN mkdir -p /home/user/huggingface
-RUN chmod -R 777 /home/user/huggingface
-
-# Then set environment variables
-ENV HF_HOME=/home/user/huggingface \
-    HF_HUB_CACHE=/home/user/huggingface/hub \
-    TRANSFORMERS_CACHE=/home/user/huggingface/hub
-
-COPY requirements.txt .
-RUN pip install --upgrade pip \
-    && pip install -r requirements.txt
-
-COPY . .
-CMD ["python", "app.py"]
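The Dockerfile can be dropped because the Space switches from the Docker SDK to the Gradio SDK (see the README.md change below), so dependencies now come from requirements.txt rather than a custom CUDA/dlib image. The deleted image also pointed HF_HOME, HF_HUB_CACHE and TRANSFORMERS_CACHE at a writable cache directory; if that is still wanted on the Gradio runtime, a minimal sketch (not part of this commit; the directory is an assumption) would set the same variables at the top of app.py before anything from huggingface_hub is imported:

    import os

    # Hypothetical stand-in for the ENV lines from the deleted Dockerfile;
    # the cache path is an assumption, not something this commit configures.
    os.environ.setdefault("HF_HOME", "/home/user/huggingface")
    os.environ.setdefault("HF_HUB_CACHE", "/home/user/huggingface/hub")
    os.environ.setdefault("TRANSFORMERS_CACHE", "/home/user/huggingface/hub")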

README.md CHANGED
@@ -3,10 +3,8 @@ title: "Nested Attention: Semantic-aware Attention Values for Concept Personaliz
 emoji: 🚀
 colorFrom: indigo
 colorTo: pink
-
-
-sdk: docker
-app_port: 7860
+sdk: gradio
+app_file: app.py
 pinned: false
 ---
 

app.py CHANGED
@@ -5,14 +5,11 @@ import gradio as gr
 from huggingface_hub import hf_hub_download, snapshot_download
 from nested_attention_pipeline import NestedAdapterInference, add_special_token_to_tokenizer
 from utils import align_face
-import dlib
 
 
 # ----------------------
 # Configuration (update paths as needed)
 # ----------------------
-SHAPE_PREDICTOR_PATH = hf_hub_download("orpatashnik/NestedAttentionEncoder", "shape_predictor_68_face_landmarks.dat")
-FACE_DETECTOR_PATH = hf_hub_download("orpatashnik/NestedAttentionEncoder", "mmod_human_face_detector.dat")
 base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
 image_encoder_path = snapshot_download("orpatashnik/NestedAttentionEncoder", allow_patterns=["image_encoder/**"])
 image_encoder_path = os.path.join(image_encoder_path, "image_encoder")
@@ -40,10 +37,6 @@ ip_model = NestedAdapterInference(
     device=device
 )
 
-# Initialize face alignment predictor
-predictor = dlib.shape_predictor(SHAPE_PREDICTOR_PATH)
-detector = dlib.cnn_face_detection_model_v1(FACE_DETECTOR_PATH)
-
 # Generation defaults
 negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
 num_inference_steps = 30
@@ -59,7 +52,7 @@ def generate_images(img1, img2, img3, prompt, w, num_samples, seed):
         return []
 
     # Align directly on PIL
-    aligned_refs = [align_face(img, predictor, detector) for img in refs]
+    aligned_refs = [align_face(img) for img in refs]
 
     # Resize to model resolution
     pil_images = [aligned.resize((512, 512)) for aligned in aligned_refs]
@@ -111,4 +104,4 @@ with gr.Blocks() as demo:
         outputs=output_gallery
     )
 
-demo.launch()
+demo.launch(share=True)
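With the detector now created inside utils.py, app.py no longer builds or passes dlib objects; alignment is a single call per reference image. A minimal sketch of the new call path (the file names are placeholders; in the Space the images come from the Gradio inputs):

    from PIL import Image
    from utils import align_face

    # Hypothetical reference images standing in for the Gradio uploads.
    refs = [Image.open("face1.jpg"), Image.open("face2.jpg")]

    # Same pattern as the updated generate_images(): align, then resize to model resolution.
    aligned_refs = [align_face(img) for img in refs]
    pil_images = [aligned.resize((512, 512)) for aligned in aligned_refs]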

requirements.txt CHANGED
@@ -6,4 +6,6 @@ gradio
 huggingface_hub
 einops
 scipy
-accelerate
+accelerate
+insightface
+onnxruntime
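insightface and onnxruntime replace the dlib build that the deleted Dockerfile compiled from source. A quick sanity check, assuming the same CPU provider and model pack that utils.py uses (the 'buffalo_l' weights are downloaded on first run):

    import insightface

    # Mirrors the initialization added in utils.py.
    analyzer = insightface.app.FaceAnalysis(name='buffalo_l', providers=['CPUExecutionProvider'])
    analyzer.prepare(ctx_id=0)
    print("InsightFace initialized")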

utils.py CHANGED
@@ -1,9 +1,15 @@
+import numpy as np
 from PIL import Image
+import scipy.ndimage
+import insightface
 import torch
-import numpy as np
-import dlib
 import scipy
 
+# Initialize InsightFace model
+face_analyzer = insightface.app.FaceAnalysis(name='buffalo_l', providers=['CPUExecutionProvider'])
+face_analyzer.prepare(ctx_id=0)
+
+
 def image_grid(imgs, rows, cols):
     assert len(imgs) == rows*cols
 
@@ -30,45 +36,37 @@ def get_generator(seed, device):
 
     return generator
 
-def …
-    """Get 68 facial landmarks …
+def get_landmark_pil_insight(pil_image):
+    """Get 68 facial landmarks using InsightFace."""
     img_np = np.array(pil_image.convert("RGB"))
-    …
-    if not …
+    faces = face_analyzer.get(img_np)
+    if not faces:
         return None
-    # …
-    …
-    shape …
-    …
-    …
-    …
-    …
-def align_face(pil_image, predictor, detector):
+    landmarks = faces[0].kps  # shape: (5, 2) or (68, 2) depending on model
+
+    if landmarks.shape[0] < 68:
+        # InsightFace returns only 5 points: [left_eye, right_eye, nose, left_mouth, right_mouth]
+        left_eye, right_eye, nose, left_mouth, right_mouth = landmarks
+        # Approximate 68 landmarks (basic heuristic or fallback)
+        return np.array([
+            left_eye, right_eye, nose, left_mouth, right_mouth
+        ])
+    return landmarks
+
+def align_face(pil_image):
     """Align a face from a PIL.Image, returning an aligned PIL.Image of size 512x512."""
-    lm = …
-    if lm is None:
+    lm = get_landmark_pil_insight(pil_image)
+    if lm is None or lm.shape[0] < 5:
         return pil_image
-    …
-    …
-    lm_eyebrow_left = lm[17: 22] # left-right
-    lm_eyebrow_right = lm[22: 27] # left-right
-    lm_nose = lm[27: 31] # top-down
-    lm_nostrils = lm[31: 36] # top-down
-    lm_eye_left = lm[36: 42] # left-clockwise
-    lm_eye_right = lm[42: 48] # left-clockwise
-    lm_mouth_outer = lm[48: 60] # left-clockwise
-    lm_mouth_inner = lm[60: 68] # left-clockwise
-
-    eye_left = np.mean(lm_eye_left, axis=0)
-    eye_right = np.mean(lm_eye_right, axis=0)
+
+    eye_left, eye_right = lm[0], lm[1]
     eye_avg = (eye_left + eye_right) * 0.5
     eye_to_eye = eye_right - eye_left
-    mouth_left = lm_mouth_outer[0]
-    mouth_right = lm_mouth_outer[6]
+    mouth_left, mouth_right = lm[3], lm[4]
     mouth_avg = (mouth_left + mouth_right) * 0.5
     eye_to_mouth = mouth_avg - eye_avg
 
-    # …
+    # The rest is your original alignment logic
     x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
     x /= np.hypot(*x)
     x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
@@ -77,52 +75,64 @@ def align_face(pil_image, predictor, detector):
     quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
     qsize = np.hypot(*x) * 2
 
-    # Prepare image
     img = pil_image.convert("RGB")
     transform_size = 512
     output_size = 512
     enable_padding = True
 
-    # Shrink image for speed
     shrink = int(np.floor(qsize / output_size * 0.5))
     if shrink > 1:
-        rsize = (int(np.rint( …
+        rsize = (int(np.rint(img.size[0] / shrink)), int(np.rint(img.size[1] / shrink)))
         img = img.resize(rsize, Image.Resampling.LANCZOS)
         quad /= shrink
         qsize /= shrink
 
-    # Crop around face
     border = max(int(np.rint(qsize * 0.1)), 3)
-    crop = ( …
-    …
-    …
-    …
+    crop = (
+        int(np.floor(min(quad[:, 0]))),
+        int(np.floor(min(quad[:, 1]))),
+        int(np.ceil(max(quad[:, 0]))),
+        int(np.ceil(max(quad[:, 1])))
+    )
+    crop = (
+        max(crop[0] - border, 0),
+        max(crop[1] - border, 0),
+        min(crop[2] + border, img.size[0]),
+        min(crop[3] + border, img.size[1])
+    )
     if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]:
         img = img.crop(crop)
-        quad -= crop[ …
-    …
-    …
-    …
-    …
-    …
-    …
+        quad -= crop[:2]
+
+    pad = (
+        int(np.floor(min(quad[:, 0]))),
+        int(np.floor(min(quad[:, 1]))),
+        int(np.ceil(max(quad[:, 0]))),
+        int(np.ceil(max(quad[:, 1])))
+    )
+    pad = (
+        max(-pad[0] + border, 0),
+        max(-pad[1] + border, 0),
+        max(pad[2] - img.size[0] + border, 0),
+        max(pad[3] - img.size[1] + border, 0)
+    )
     if enable_padding and max(pad) > border - 4:
         pad = np.maximum(pad, int(np.rint(qsize * 0.3)))
         img = np.pad(np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect')
         h, w, _ = img.shape
         y, x, _ = np.ogrid[:h, :w, :1]
-        mask = np.maximum( …
-        …
+        mask = np.maximum(
+            1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w - 1 - x) / pad[2]),
+            1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h - 1 - y) / pad[3])
+        )
         blur = qsize * 0.02
         img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
         img += (np.median(img, axis=(0, 1)) - img) * np.clip(mask, 0.0, 1.0)
         img = Image.fromarray(np.uint8(np.clip(np.rint(img), 0, 255)), 'RGB')
         quad += pad[:2]
 
-    # Transform image
     img = img.transform((transform_size, transform_size), Image.QUAD, (quad + 0.5).flatten(), Image.BILINEAR)
     if output_size < transform_size:
         img = img.resize((output_size, output_size), Image.Resampling.LANCZOS)
 
-
-    return img
+    return img
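One caveat worth noting: with the buffalo_l pack, the kps array returned by get_landmark_pil_insight holds the 5-point set (left eye, right eye, nose, left and right mouth corners), and align_face only reads indices 0, 1, 3 and 4, so the 68-point return path only applies to models that put full landmarks in kps. A minimal usage sketch (the input path is an assumption):

    from PIL import Image
    from utils import align_face

    # Hypothetical input image; any RGB portrait works.
    img = Image.open("portrait.jpg")

    # Returns a 512x512 aligned crop, or the original image unchanged if no face is found.
    aligned = align_face(img)
    aligned.save("portrait_aligned.png")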