Commit 399e621
Parent(s): bcdb914

remove dlib dependency

Files changed:
- Dockerfile +0 -51
- README.md +2 -4
- app.py +2 -9
- requirements.txt +3 -1
- utils.py +62 -52
Dockerfile
DELETED
@@ -1,51 +0,0 @@
-# FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
-
-# RUN apt-get update && apt-get install -y \
-#     git cmake build-essential python3 python3-pip python3-dev \
-#     libboost-all-dev libopenblas-dev liblapack-dev
-
-# WORKDIR /dlib
-# RUN git clone https://github.com/davisking/dlib.git . \
-#     && mkdir build && cd build \
-#     && cmake .. -DDLIB_USE_CUDA=1 \
-#     && cmake --build . --config Release
-
-# # install via pip so dependencies like wheel are used
-# RUN pip install --upgrade pip wheel cmake setuptools && \
-#     cd /dlib && python3 setup.py install
-
-# # Choose a writable directory, e.g., /home/user/huggingface
-# RUN mkdir -p /home/user/huggingface
-# RUN chmod -R 777 /home/user/huggingface
-
-# # Then set environment variables
-# ENV HF_HOME=/home/user/huggingface \
-#     HF_HUB_CACHE=/home/user/huggingface/hub \
-#     TRANSFORMERS_CACHE=/home/user/huggingface/hub
-
-# WORKDIR /app
-# COPY requirements.txt .
-# RUN pip install -r requirements.txt
-# COPY . .
-# CMD ["python3", "app.py"]
-
-
-FROM dillondrobena/opencv-cuda
-
-WORKDIR /app
-
-# Choose a writable directory, e.g., /home/user/huggingface
-RUN mkdir -p /home/user/huggingface
-RUN chmod -R 777 /home/user/huggingface
-
-# Then set environment variables
-ENV HF_HOME=/home/user/huggingface \
-    HF_HUB_CACHE=/home/user/huggingface/hub \
-    TRANSFORMERS_CACHE=/home/user/huggingface/hub
-
-COPY requirements.txt .
-RUN pip install --upgrade pip \
-    && pip install -r requirements.txt
-
-COPY . .
-CMD ["python", "app.py"]
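
Note on the deleted file: the replaced base image, dillondrobena/opencv-cuda, ships OpenCV prebuilt against CUDA, so the slow dlib-from-source build disappears along with the Dockerfile. A minimal smoke test to run inside such a container, assuming the image exposes Python bindings for cv2 and that torch is installed via requirements.txt (the script name is illustrative):

# check_cuda.py (illustrative) -- confirm GPU visibility inside the container
import cv2
import torch

# getCudaEnabledDeviceCount() is only meaningful in CUDA-enabled OpenCV builds
print("OpenCV CUDA devices:", cv2.cuda.getCudaEnabledDeviceCount())
print("PyTorch CUDA available:", torch.cuda.is_available())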
README.md
CHANGED
@@ -3,10 +3,8 @@ title: "Nested Attention: Semantic-aware Attention Values for Concept Personaliz
 emoji: 🚀
 colorFrom: indigo
 colorTo: pink
-
-
-sdk: docker
-app_port: 7860
+sdk: gradio
+app_file: app.py
 pinned: false
 ---
 
app.py
CHANGED
@@ -5,14 +5,11 @@ import gradio as gr
 from huggingface_hub import hf_hub_download, snapshot_download
 from nested_attention_pipeline import NestedAdapterInference, add_special_token_to_tokenizer
 from utils import align_face
-import dlib
 
 
 # ----------------------
 # Configuration (update paths as needed)
 # ----------------------
-SHAPE_PREDICTOR_PATH = hf_hub_download("orpatashnik/NestedAttentionEncoder", "shape_predictor_68_face_landmarks.dat")
-FACE_DETECTOR_PATH = hf_hub_download("orpatashnik/NestedAttentionEncoder", "mmod_human_face_detector.dat")
 base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
 image_encoder_path = snapshot_download("orpatashnik/NestedAttentionEncoder", allow_patterns=["image_encoder/**"])
 image_encoder_path = os.path.join(image_encoder_path, "image_encoder")
@@ -40,10 +37,6 @@ ip_model = NestedAdapterInference(
     device=device
 )
 
-# Initialize face alignment predictor
-predictor = dlib.shape_predictor(SHAPE_PREDICTOR_PATH)
-detector = dlib.cnn_face_detection_model_v1(FACE_DETECTOR_PATH)
-
 # Generation defaults
 negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
 num_inference_steps = 30
@@ -59,7 +52,7 @@ def generate_images(img1, img2, img3, prompt, w, num_samples, seed):
         return []
 
     # Align directly on PIL
-    aligned_refs = [align_face(img, predictor, detector) for img in refs]
+    aligned_refs = [align_face(img) for img in refs]
 
     # Resize to model resolution
     pil_images = [aligned.resize((512, 512)) for aligned in aligned_refs]
@@ -111,4 +104,4 @@ with gr.Blocks() as demo:
         outputs=output_gallery
     )
 
-demo.launch()
+demo.launch(share=True)
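
The launch change is worth a note: share=True asks Gradio to open a temporary public *.gradio.live tunnel in addition to the local server, which helps when the app runs in a container without exposed ports. A minimal standalone sketch of the same call (illustrative, not the Space's full UI):

# share_demo.py (illustrative)
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("launch smoke test")

# Default launch() serves locally only; share=True also prints a public link.
demo.launch(share=True)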
requirements.txt
CHANGED
@@ -6,4 +6,6 @@ gradio
 huggingface_hub
 einops
 scipy
-accelerate
+accelerate
+insightface
+onnxruntime
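
insightface executes its detection and landmark models through ONNX Runtime, which is why onnxruntime enters the requirements alongside it; utils.py below pins CPUExecutionProvider explicitly. A quick dependency check (illustrative sketch):

# check_deps.py (illustrative)
import onnxruntime
import insightface

# CPUExecutionProvider should appear here for the pinned provider to work
print(onnxruntime.get_available_providers())
print("insightface", insightface.__version__)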
utils.py
CHANGED
@@ -1,9 +1,15 @@
+import numpy as np
 from PIL import Image
+import scipy.ndimage
+import insightface
 import torch
-import numpy as np
-import dlib
 import scipy
 
+# Initialize InsightFace model
+face_analyzer = insightface.app.FaceAnalysis(name='buffalo_l', providers=['CPUExecutionProvider'])
+face_analyzer.prepare(ctx_id=0)
+
+
 def image_grid(imgs, rows, cols):
     assert len(imgs) == rows*cols
 
@@ -30,45 +36,37 @@ def get_generator(seed, device):
 
     return generator
 
-def get_landmark_pil(pil_image, predictor, detector):
-    """Get 68 facial landmarks using dlib."""
+def get_landmark_pil_insight(pil_image):
+    """Get 68 facial landmarks using InsightFace."""
     img_np = np.array(pil_image.convert("RGB"))
-    dets = detector(img_np, 1)
-    if not dets:
+    faces = face_analyzer.get(img_np)
+    if not faces:
         return None
-    # Take the first detected face
-    d = dets[0].rect
-    shape = predictor(img_np, d)
-    lm = np.array([[p.x, p.y] for p in shape.parts()])
-    return lm
-
-
-def align_face(pil_image, predictor, detector):
+    landmarks = faces[0].kps  # shape: (5, 2) or (68, 2) depending on model
+
+    if landmarks.shape[0] < 68:
+        # InsightFace returns only 5 points: [left_eye, right_eye, nose, left_mouth, right_mouth]
+        left_eye, right_eye, nose, left_mouth, right_mouth = landmarks
+        # Approximate 68 landmarks (basic heuristic or fallback)
+        return np.array([
+            left_eye, right_eye, nose, left_mouth, right_mouth
+        ])
+    return landmarks
+
+def align_face(pil_image):
     """Align a face from a PIL.Image, returning an aligned PIL.Image of size 512x512."""
-    lm = get_landmark_pil(pil_image, predictor, detector)
-    if lm is None:
+    lm = get_landmark_pil_insight(pil_image)
+    if lm is None or lm.shape[0] < 5:
         return pil_image
-
-    lm_chin = lm[0: 17]  # left-right
-    lm_eyebrow_left = lm[17: 22]  # left-right
-    lm_eyebrow_right = lm[22: 27]  # left-right
-    lm_nose = lm[27: 31]  # top-down
-    lm_nostrils = lm[31: 36]  # top-down
-    lm_eye_left = lm[36: 42]  # left-clockwise
-    lm_eye_right = lm[42: 48]  # left-clockwise
-    lm_mouth_outer = lm[48: 60]  # left-clockwise
-    lm_mouth_inner = lm[60: 68]  # left-clockwise
-
-    eye_left = np.mean(lm_eye_left, axis=0)
-    eye_right = np.mean(lm_eye_right, axis=0)
+
+    eye_left, eye_right = lm[0], lm[1]
     eye_avg = (eye_left + eye_right) * 0.5
     eye_to_eye = eye_right - eye_left
-    mouth_left = lm_mouth_outer[0]
-    mouth_right = lm_mouth_outer[6]
+    mouth_left, mouth_right = lm[3], lm[4]
    mouth_avg = (mouth_left + mouth_right) * 0.5
     eye_to_mouth = mouth_avg - eye_avg
 
-    # Choose oriented crop rectangle
+    # The rest is your original alignment logic
     x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
     x /= np.hypot(*x)
     x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
@@ -77,52 +75,64 @@ def align_face(pil_image, predictor, detector):
     quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
     qsize = np.hypot(*x) * 2
 
-    # Prepare image
     img = pil_image.convert("RGB")
     transform_size = 512
     output_size = 512
     enable_padding = True
 
-    # Shrink image for speed
     shrink = int(np.floor(qsize / output_size * 0.5))
     if shrink > 1:
-        rsize = (int(np.rint(float(img.size[0]) / shrink)), int(np.rint(float(img.size[1]) / shrink)))
+        rsize = (int(np.rint(img.size[0] / shrink)), int(np.rint(img.size[1] / shrink)))
         img = img.resize(rsize, Image.Resampling.LANCZOS)
         quad /= shrink
         qsize /= shrink
 
-    # Crop around face
     border = max(int(np.rint(qsize * 0.1)), 3)
-    crop = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))),
-            int(np.ceil(max(quad[:, 0]))), int(np.ceil(max(quad[:, 1]))))
-    crop = (max(crop[0] - border, 0), max(crop[1] - border, 0),
-            min(crop[2] + border, img.size[0]), min(crop[3] + border, img.size[1]))
+    crop = (
+        int(np.floor(min(quad[:, 0]))),
+        int(np.floor(min(quad[:, 1]))),
+        int(np.ceil(max(quad[:, 0]))),
+        int(np.ceil(max(quad[:, 1])))
+    )
+    crop = (
+        max(crop[0] - border, 0),
+        max(crop[1] - border, 0),
+        min(crop[2] + border, img.size[0]),
+        min(crop[3] + border, img.size[1])
+    )
     if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]:
         img = img.crop(crop)
-        quad -= crop[0:2]
-
-    # Pad
-    pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))),
-           int(np.ceil(max(quad[:, 0]))), int(np.ceil(max(quad[:, 1]))))
-    pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0),
-           max(pad[2] - img.size[0] + border, 0), max(pad[3] - img.size[1] + border, 0))
+        quad -= crop[:2]
+
+    pad = (
+        int(np.floor(min(quad[:, 0]))),
+        int(np.floor(min(quad[:, 1]))),
+        int(np.ceil(max(quad[:, 0]))),
+        int(np.ceil(max(quad[:, 1])))
+    )
+    pad = (
+        max(-pad[0] + border, 0),
+        max(-pad[1] + border, 0),
+        max(pad[2] - img.size[0] + border, 0),
+        max(pad[3] - img.size[1] + border, 0)
+    )
     if enable_padding and max(pad) > border - 4:
         pad = np.maximum(pad, int(np.rint(qsize * 0.3)))
         img = np.pad(np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect')
         h, w, _ = img.shape
         y, x, _ = np.ogrid[:h, :w, :1]
-        mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w - 1 - x) / pad[2]),
-                          1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h - 1 - y) / pad[3]))
+        mask = np.maximum(
+            1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w - 1 - x) / pad[2]),
+            1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h - 1 - y) / pad[3])
+        )
         blur = qsize * 0.02
         img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
         img += (np.median(img, axis=(0, 1)) - img) * np.clip(mask, 0.0, 1.0)
         img = Image.fromarray(np.uint8(np.clip(np.rint(img), 0, 255)), 'RGB')
         quad += pad[:2]
 
-    # Transform image
     img = img.transform((transform_size, transform_size), Image.QUAD, (quad + 0.5).flatten(), Image.BILINEAR)
     if output_size < transform_size:
         img = img.resize((output_size, output_size), Image.Resampling.LANCZOS)
 
-
-    return img
+    return img
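
With the buffalo_l pack, faces[0].kps is the detector's 5-point set (eyes, nose, mouth corners); since the rewritten align_face reads only lm[0], lm[1] (eyes) and lm[3], lm[4] (mouth corners), the 68-point slices from the dlib path are no longer needed. A minimal usage sketch of the new entry point ("face.jpg" is an illustrative path):

# align_example.py (illustrative)
from PIL import Image
from utils import align_face

img = Image.open("face.jpg")
aligned = align_face(img)      # returns the input unchanged if no face is found
aligned.save("face_aligned.jpg")
print(aligned.size)            # (512, 512) when alignment succeeded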