orpatashnik committed
Commit 399e621 · 1 Parent(s): bcdb914

remove dlib dependency

Files changed (5):
  1. Dockerfile +0 -51
  2. README.md +2 -4
  3. app.py +2 -9
  4. requirements.txt +3 -1
  5. utils.py +62 -52
Dockerfile DELETED
@@ -1,51 +0,0 @@
-# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
-
-# RUN apt-get update && apt-get install -y \
-#     git cmake build-essential python3 python3-pip python3-dev \
-#     libboost-all-dev libopenblas-dev liblapack-dev
-
-# WORKDIR /dlib
-# RUN git clone https://github.com/davisking/dlib.git . \
-#     && mkdir build && cd build \
-#     && cmake .. -DDLIB_USE_CUDA=1 \
-#     && cmake --build . --config Release
-
-# # install via pip so dependencies like wheel are used
-# RUN pip install --upgrade pip wheel cmake setuptools && \
-#     cd /dlib && python3 setup.py install
-
-# # Choose a writable directory, e.g., /home/user/huggingface
-# RUN mkdir -p /home/user/huggingface
-# RUN chmod -R 777 /home/user/huggingface
-
-# # Then set environment variables
-# ENV HF_HOME=/home/user/huggingface \
-#     HF_HUB_CACHE=/home/user/huggingface/hub \
-#     TRANSFORMERS_CACHE=/home/user/huggingface/hub
-
-# WORKDIR /app
-# COPY requirements.txt .
-# RUN pip install -r requirements.txt
-# COPY . .
-# CMD ["python3", "app.py"]
-
-
-FROM dillondrobena/opencv-cuda
-
-WORKDIR /app
-
-# Choose a writable directory, e.g., /home/user/huggingface
-RUN mkdir -p /home/user/huggingface
-RUN chmod -R 777 /home/user/huggingface
-
-# Then set environment variables
-ENV HF_HOME=/home/user/huggingface \
-    HF_HUB_CACHE=/home/user/huggingface/hub \
-    TRANSFORMERS_CACHE=/home/user/huggingface/hub
-
-COPY requirements.txt .
-RUN pip install --upgrade pip \
-    && pip install -r requirements.txt
-
-COPY . .
-CMD ["python", "app.py"]
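The deleted Dockerfile was also what created the writable cache directory and exported HF_HOME / HF_HUB_CACHE. If the Gradio SDK runtime still needs that redirect, a minimal in-process sketch (an assumption, not part of this commit) would be:

# Hypothetical sketch: replicate the deleted ENV lines in-process.
# Must run before huggingface_hub resolves its cache locations.
import os

os.makedirs("/home/user/huggingface/hub", exist_ok=True)
os.environ.setdefault("HF_HOME", "/home/user/huggingface")
os.environ.setdefault("HF_HUB_CACHE", "/home/user/huggingface/hub")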
README.md CHANGED
@@ -3,10 +3,8 @@ title: "Nested Attention: Semantic-aware Attention Values for Concept Personaliz
 emoji: 🚀
 colorFrom: indigo
 colorTo: pink
-# sdk: gradio
-# app_file: app.py
-sdk: docker
-app_port: 7860
+sdk: gradio
+app_file: app.py
 pinned: false
 ---
 
app.py CHANGED
@@ -5,14 +5,11 @@ import gradio as gr
 from huggingface_hub import hf_hub_download, snapshot_download
 from nested_attention_pipeline import NestedAdapterInference, add_special_token_to_tokenizer
 from utils import align_face
-import dlib
 
 
 # ----------------------
 # Configuration (update paths as needed)
 # ----------------------
-SHAPE_PREDICTOR_PATH = hf_hub_download("orpatashnik/NestedAttentionEncoder", "shape_predictor_68_face_landmarks.dat")
-FACE_DETECTOR_PATH = hf_hub_download("orpatashnik/NestedAttentionEncoder", "mmod_human_face_detector.dat")
 base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
 image_encoder_path = snapshot_download("orpatashnik/NestedAttentionEncoder", allow_patterns=["image_encoder/**"])
 image_encoder_path = os.path.join(image_encoder_path, "image_encoder")
@@ -40,10 +37,6 @@ ip_model = NestedAdapterInference(
     device=device
 )
 
-# Initialize face alignment predictor
-predictor = dlib.shape_predictor(SHAPE_PREDICTOR_PATH)
-detector = dlib.cnn_face_detection_model_v1(FACE_DETECTOR_PATH)
-
 # Generation defaults
 negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
 num_inference_steps = 30
@@ -59,7 +52,7 @@ def generate_images(img1, img2, img3, prompt, w, num_samples, seed):
         return []
 
     # Align directly on PIL
-    aligned_refs = [align_face(img, predictor, detector) for img in refs]
+    aligned_refs = [align_face(img) for img in refs]
 
     # Resize to model resolution
     pil_images = [aligned.resize((512, 512)) for aligned in aligned_refs]
@@ -111,4 +104,4 @@ with gr.Blocks() as demo:
         outputs=output_gallery
     )
 
-    demo.launch()
+    demo.launch(share=True)
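With the detector state now owned by utils.py, callers pass only the image. A minimal sketch of the new align_face call path (file names are hypothetical, not from this commit):

# Hypothetical usage of the one-argument align_face, mirroring the
# generate_images call site above.
from PIL import Image
from utils import align_face

ref = Image.open("example_face.jpg")   # any RGB portrait; path is hypothetical
aligned = align_face(ref)              # predictor/detector no longer passed in
aligned.resize((512, 512)).save("aligned_512.png")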
requirements.txt CHANGED
@@ -6,4 +6,6 @@ gradio
 huggingface_hub
 einops
 scipy
-accelerate
+accelerate
+insightface
+onnxruntime
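insightface runs its detection models through ONNX Runtime, hence the paired onnxruntime entry. A quick sanity check that both new dependencies resolve together (a sketch, not part of the commit):

# Verify the two new dependencies import and report their capabilities.
import onnxruntime
import insightface

print("ONNX Runtime providers:", onnxruntime.get_available_providers())
print("insightface version:", insightface.__version__)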
utils.py CHANGED
@@ -1,9 +1,15 @@
+import numpy as np
 from PIL import Image
+import scipy.ndimage
+import insightface
 import torch
-import numpy as np
-import dlib
 import scipy
 
+# Initialize InsightFace model
+face_analyzer = insightface.app.FaceAnalysis(name='buffalo_l', providers=['CPUExecutionProvider'])
+face_analyzer.prepare(ctx_id=0)
+
+
 def image_grid(imgs, rows, cols):
     assert len(imgs) == rows*cols
 
@@ -30,45 +36,37 @@ def get_generator(seed, device):
 
     return generator
 
-def get_landmark_pil(pil_image, predictor, detector):
-    """Get 68 facial landmarks as a NumPy array of shape (68, 2)."""
+def get_landmark_pil_insight(pil_image):
+    """Get 68 facial landmarks using InsightFace."""
     img_np = np.array(pil_image.convert("RGB"))
-    dets = detector(img_np, 1)
-    if not dets:
+    faces = face_analyzer.get(img_np)
+    if not faces:
         return None
-    # Handle mmod or frontal detector output
-    det = dets[0].rect if hasattr(dets[0], 'rect') else dets[0]
-    shape = predictor(img_np, det)
-    coords = [(pt.x, pt.y) for pt in shape.parts()]
-    return np.array(coords)
-
-
-def align_face(pil_image, predictor, detector):
+    landmarks = faces[0].kps  # shape: (5, 2) or (68, 2) depending on model
+
+    if landmarks.shape[0] < 68:
+        # InsightFace returns only 5 points: [left_eye, right_eye, nose, left_mouth, right_mouth]
+        left_eye, right_eye, nose, left_mouth, right_mouth = landmarks
+        # Approximate 68 landmarks (basic heuristic or fallback)
+        return np.array([
+            left_eye, right_eye, nose, left_mouth, right_mouth
+        ])
+    return landmarks
+
+def align_face(pil_image):
     """Align a face from a PIL.Image, returning an aligned PIL.Image of size 512x512."""
-    lm = get_landmark_pil(pil_image, predictor, detector)
-    if lm is None:
+    lm = get_landmark_pil_insight(pil_image)
+    if lm is None or lm.shape[0] < 5:
        return pil_image
-    # Define landmark regions
-    lm_chin = lm[0: 17]  # left-right
-    lm_eyebrow_left = lm[17: 22]  # left-right
-    lm_eyebrow_right = lm[22: 27]  # left-right
-    lm_nose = lm[27: 31]  # top-down
-    lm_nostrils = lm[31: 36]  # top-down
-    lm_eye_left = lm[36: 42]  # left-clockwise
-    lm_eye_right = lm[42: 48]  # left-clockwise
-    lm_mouth_outer = lm[48: 60]  # left-clockwise
-    lm_mouth_inner = lm[60: 68]  # left-clockwise
-
-    eye_left = np.mean(lm_eye_left, axis=0)
-    eye_right = np.mean(lm_eye_right, axis=0)
+
+    eye_left, eye_right = lm[0], lm[1]
     eye_avg = (eye_left + eye_right) * 0.5
     eye_to_eye = eye_right - eye_left
-    mouth_left = lm_mouth_outer[0]
-    mouth_right = lm_mouth_outer[6]
+    mouth_left, mouth_right = lm[3], lm[4]
     mouth_avg = (mouth_left + mouth_right) * 0.5
     eye_to_mouth = mouth_avg - eye_avg
 
-    # Compute oriented crop
+    # The rest is your original alignment logic
     x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
     x /= np.hypot(*x)
     x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
@@ -77,52 +75,64 @@ def align_face(pil_image, predictor, detector):
     quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
     qsize = np.hypot(*x) * 2
 
-    # Prepare image
     img = pil_image.convert("RGB")
     transform_size = 512
     output_size = 512
     enable_padding = True
 
-    # Shrink image for speed
     shrink = int(np.floor(qsize / output_size * 0.5))
     if shrink > 1:
-        rsize = (int(np.rint(float(img.size[0]) / shrink)), int(np.rint(float(img.size[1]) / shrink)))
+        rsize = (int(np.rint(img.size[0] / shrink)), int(np.rint(img.size[1] / shrink)))
         img = img.resize(rsize, Image.Resampling.LANCZOS)
         quad /= shrink
         qsize /= shrink
 
-    # Crop around face
     border = max(int(np.rint(qsize * 0.1)), 3)
-    crop = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))),
-            int(np.ceil(max(quad[:, 1]))))
-    crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), min(crop[2] + border, img.size[0]),
-            min(crop[3] + border, img.size[1]))
+    crop = (
+        int(np.floor(min(quad[:, 0]))),
+        int(np.floor(min(quad[:, 1]))),
+        int(np.ceil(max(quad[:, 0]))),
+        int(np.ceil(max(quad[:, 1])))
+    )
+    crop = (
+        max(crop[0] - border, 0),
+        max(crop[1] - border, 0),
+        min(crop[2] + border, img.size[0]),
+        min(crop[3] + border, img.size[1])
+    )
     if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]:
         img = img.crop(crop)
-        quad -= crop[0:2]
-
-    # Pad
-    pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))),
-           int(np.ceil(max(quad[:, 1]))))
-    pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - img.size[0] + border, 0),
-           max(pad[3] - img.size[1] + border, 0))
+        quad -= crop[:2]
+
+    pad = (
+        int(np.floor(min(quad[:, 0]))),
+        int(np.floor(min(quad[:, 1]))),
+        int(np.ceil(max(quad[:, 0]))),
+        int(np.ceil(max(quad[:, 1])))
+    )
+    pad = (
+        max(-pad[0] + border, 0),
+        max(-pad[1] + border, 0),
+        max(pad[2] - img.size[0] + border, 0),
+        max(pad[3] - img.size[1] + border, 0)
+    )
     if enable_padding and max(pad) > border - 4:
         pad = np.maximum(pad, int(np.rint(qsize * 0.3)))
         img = np.pad(np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect')
         h, w, _ = img.shape
         y, x, _ = np.ogrid[:h, :w, :1]
-        mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w - 1 - x) / pad[2]),
-                          1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h - 1 - y) / pad[3]))
+        mask = np.maximum(
+            1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w - 1 - x) / pad[2]),
+            1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h - 1 - y) / pad[3])
+        )
         blur = qsize * 0.02
         img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
         img += (np.median(img, axis=(0, 1)) - img) * np.clip(mask, 0.0, 1.0)
         img = Image.fromarray(np.uint8(np.clip(np.rint(img), 0, 255)), 'RGB')
         quad += pad[:2]
 
-    # Transform image
     img = img.transform((transform_size, transform_size), Image.QUAD, (quad + 0.5).flatten(), Image.BILINEAR)
     if output_size < transform_size:
         img = img.resize((output_size, output_size), Image.Resampling.LANCZOS)
 
-    # Resize to final output
-    return img
+    return img
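For reference, the indices lm[0], lm[1], lm[3], lm[4] above rely on InsightFace's fixed five-point kps order: left eye, right eye, nose, left mouth corner, right mouth corner. A small inspection sketch, assuming a local test image:

# Print the (5, 2) keypoint array the new align_face consumes.
import numpy as np
from PIL import Image
import insightface

app = insightface.app.FaceAnalysis(name='buffalo_l', providers=['CPUExecutionProvider'])
app.prepare(ctx_id=0)

img = np.array(Image.open("example_face.jpg").convert("RGB"))  # hypothetical path
faces = app.get(img)
if faces:
    # Rows: left eye, right eye, nose, left mouth corner, right mouth corner
    print(faces[0].kps.round(1))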