isat committed on
Commit
12b4ecb
·
verified ·
1 Parent(s): ecf3916

Update app.py


Accelerating inference (fp16/autocast, TF32, SDPA, torch.compile, GPU NMS, fewer diffusion steps)

Files changed (1)
  1. app.py +138 -45
app.py CHANGED
@@ -1,4 +1,4 @@
-# app.py — storage-safe + HF Hub friendly + SAM import guard
+# app.py — storage-safe + HF Hub friendly + speed-optimized
 
 import os
 
@@ -19,12 +19,15 @@ os.environ["OMP_NUM_THREADS"] = omp_val # must be a positive integer string
 os.environ.setdefault("HF_HOME", "/data/.huggingface")
 os.environ.setdefault("HF_HUB_CACHE", "/data/.huggingface/hub")
 os.environ.setdefault("HF_DATASETS_CACHE", "/data/.huggingface/datasets")
-# (TRANSFORMERS_CACHE is deprecated; rely on HF_HOME) # https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache
+# (TRANSFORMERS_CACHE is deprecated; rely on HF_HOME)
 
 # Disable Xet path, enable fast transfer
 os.environ.setdefault("HF_HUB_DISABLE_XET", "1")
 os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
 
+# Faster + smoother CUDA memory behavior
+os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
+
 # ---------- NOW safe to import heavy libs ----------
 import sys
 import cv2
@@ -33,12 +36,37 @@ import torch
 import gradio as gr
 from PIL import Image, ImageFilter, ImageDraw
 
+# Global torch speed knobs
 try:
     torch.set_num_threads(int(omp_val))
     torch.set_num_interop_threads(1)
 except Exception:
     pass
 
+# Use TF32 when available (Ampere/Ada)
+try:
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+except Exception:
+    pass
+try:
+    # PyTorch 2.x matmul precision hint
+    torch.set_float32_matmul_precision("high")
+except Exception:
+    pass
+
+# Pick fastest cudnn convs once shapes are known
+try:
+    torch.backends.cudnn.benchmark = True
+except Exception:
+    pass
+
+# No autograd for inference-only app
+torch.set_grad_enabled(False)
+
+# SDPA availability flag
+USE_SDPA = hasattr(torch.nn.functional, "scaled_dot_product_attention")
+
 # ---------- HUB IMPORTS ----------
 from huggingface_hub import snapshot_download, hf_hub_download
 from diffusers import FluxFillPipeline, FluxPriorReduxPipeline
@@ -92,14 +120,13 @@ GROUNDING_DINO_CONFIG_PATH = "./GroundingDINO_SwinB.cfg.py"
 GROUNDING_DINO_CHECKPOINT_PATH = os.path.join(CKPT_DIR, "groundingdino_swinb_cogcoor.pth")
 
 # Segment-Anything checkpoint
-SAM_ENCODER_VERSION = "vit_h"
+SAM_ENCODER_VERSION = "vit_h"  # consider "vit_l" or "vit_b" for more speed
 SAM_CHECKPOINT_PATH = os.path.join(CKPT_DIR, "sam_vit_h_4b8939.pth")
 
 # ---------- AUTH TOKEN ----------
 hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
 
 # ---------- DOWNLOAD CHECKPOINTS (single files) ----------
-# Use hf_hub_download for single files, which returns a cached path. Keep files under /data. # https://huggingface.co/docs/huggingface_hub/en/guides/download
 if not os.path.exists(GROUNDING_DINO_CHECKPOINT_PATH):
     g_dino_file = hf_hub_download(
         repo_id="ShilongLiu/GroundingDINO",
@@ -164,13 +191,14 @@ groundingdino_model = load_model(
     device="cuda"
 )
 
-# SAM + Predictor (registry API from official SAM) # https://github.com/facebookresearch/segment-anything
+# SAM + Predictor (registry API from official SAM)
 sam = sam_model_registry[SAM_ENCODER_VERSION](checkpoint=SAM_CHECKPOINT_PATH)
 sam.to(device="cuda")
 sam_predictor = SamPredictor(sam)
 
 # Diffusers (Flux)
-dtype = torch.bfloat16
+# Prefer float16 for speed; change to bfloat16 if you hit NaNs on your GPU/drivers.
+dtype = torch.float16
 size = (768, 768)
 
 pipe = FluxFillPipeline.from_pretrained(
@@ -178,17 +206,57 @@ pipe = FluxFillPipeline.from_pretrained(
     torch_dtype=dtype
 ).to("cuda")
 
+# Load LoRA
 pipe.load_lora_weights(
     os.path.join(LORA_DIR, "20250321_steps5000_pytorch_lora_weights.safetensors")
 )
 
+# Speed features
+try:
+    if USE_SDPA and hasattr(pipe, "enable_sdpa"):
+        pipe.enable_sdpa()
+    elif hasattr(pipe, "enable_xformers_memory_efficient_attention"):
+        pipe.enable_xformers_memory_efficient_attention()
+except Exception:
+    pass
+
+try:
+    pipe.enable_vae_tiling()
+except Exception:
+    pass
+
+# Compile hot paths (PyTorch 2.0+)
+try:
+    pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+    if hasattr(pipe.vae, "decode"):
+        pipe.vae.decode = torch.compile(pipe.vae.decode, mode="reduce-overhead")
+except Exception:
+    pass
+
+# Disable progress bars for tiny perf win
+try:
+    pipe.set_progress_bar_config(disable=True)
+except Exception:
+    pass
+
 redux = FluxPriorReduxPipeline.from_pretrained(REDUX_DIR).to(dtype=dtype).to("cuda")
+try:
+    if USE_SDPA and hasattr(redux, "enable_sdpa"):
+        redux.enable_sdpa()
+except Exception:
+    pass
+try:
+    if hasattr(redux, "image_encoder"):
+        redux.image_encoder = torch.compile(redux.image_encoder, mode="reduce-overhead")
+except Exception:
+    pass
 
-# ---------- APP LOGIC ----------
+# ---------- GLOBAL UTILS ----------
 def transform_image(image_pil):
+    # Smaller resize for faster DINO (was 800/max 1333)
     transform = T.Compose(
        [
-            T.RandomResize([800], max_size=1333),
+            T.RandomResize([640], max_size=1024),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
@@ -196,31 +264,42 @@ def transform_image(image_pil):
     image, _ = transform(image_pil, None)  # 3, h, w
     return image
 
-
 def get_grounding_output(model, image, caption, box_threshold=0.25, text_threshold=0.25, with_logits=True):
     caption = caption.lower().strip()
     if not caption.endswith("."):
         caption = caption + "."
-    with torch.no_grad():
+    device = next(model.parameters()).device
+    image = image.to(device, non_blocking=True)
+
+    # DINO forward in fp16 for speed
+    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
         outputs = model(image[None], captions=[caption])
-    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
-    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)
+
+    logits = outputs["pred_logits"].sigmoid()[0]  # (nq, 256) CUDA
+    boxes = outputs["pred_boxes"][0]  # (nq, 4) CUDA
 
     # filter output
     filt_mask = logits.max(dim=1)[0] > box_threshold
     logits_filt = logits[filt_mask]
     boxes_filt = boxes[filt_mask]
 
-    # get phrase
+    # scores for NMS
+    scores = logits_filt.max(dim=1).values
+    # NMS on GPU
+    nms_idx = torchvision.ops.nms(boxes_filt, scores, 0.8)
+
+    # Move minimal tensors to CPU for tokenizer phrase mapping
+    boxes_filt_cpu = boxes_filt[nms_idx].detach().cpu()
+    scores_cpu = scores[nms_idx].detach().cpu()
+
     tokenlizer = model.tokenizer
     tokenized = tokenlizer(caption)
-    pred_phrases, scores = [], []
-    for logit, box in zip(logits_filt, boxes_filt):
+    pred_phrases = []
+    for logit in logits_filt[nms_idx].detach().cpu():
         pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
-        pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})" if with_logits else pred_phrase)
-        scores.append(logit.max().item())
-    return boxes_filt, torch.Tensor(scores), pred_phrases
+        pred_phrases.append(pred_phrase + f"({float(logit.max()):.2f})" if with_logits else pred_phrase)
 
+    return boxes_filt_cpu, scores_cpu, pred_phrases
 
 def get_mask(image, label):
     global groundingdino_model, sam_predictor
@@ -236,27 +315,25 @@ def get_mask(image, label):
         boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
         boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
         boxes_filt[i][2:] += boxes_filt[i][:2]
-    boxes_filt = boxes_filt.cpu()
-
-    nms_idx = torchvision.ops.nms(boxes_filt, scores, 0.8).numpy().tolist()
-    boxes_filt = boxes_filt[nms_idx]
-
+    # keep CPU for transform, then CUDA for SAM
     image_np = np.array(image_pil)
     sam_predictor.set_image(image_np)
+
     transformed_boxes = sam_predictor.transform.apply_boxes_torch(
         boxes_filt, image_np.shape[:2]
     ).to("cuda")
 
-    masks, _, _ = sam_predictor.predict_torch(
-        point_coords=None,
-        point_labels=None,
-        boxes=transformed_boxes,
-        multimask_output=False,
-    )
-    result_mask = masks[0][0].cpu().numpy()
+    # SAM forward (fp16 autocast for speed; switch to fp32 if masks degrade)
+    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16):
+        masks, _, _ = sam_predictor.predict_torch(
+            point_coords=None,
+            point_labels=None,
+            boxes=transformed_boxes,
+            multimask_output=False,
+        )
+    result_mask = masks[0][0].detach().cpu().numpy()
     return Image.fromarray(result_mask)
 
-
 def create_highlighted_mask(image_np, mask_np, alpha=0.5, gray_value=128):
     if mask_np.max() <= 1.0:
         mask_np = (mask_np * 255).astype(np.uint8)
@@ -267,6 +344,15 @@ def create_highlighted_mask(image_np, mask_np, alpha=0.5, gray_value=128):
     result[mask_bool] = (1 - alpha) * image_float[mask_bool] + alpha * gray_overlay[mask_bool]
     return result.astype(np.uint8)
 
+# Pre-allocated kernel to avoid repeated allocs
+KERNEL_7x7 = np.ones((7, 7), np.uint8)
+
+# Reusable CUDA generator (seedable)
+GLOBAL_GEN = torch.Generator(device="cuda")
+def make_gen(seed):
+    if seed is None or seed < 0:
+        return GLOBAL_GEN
+    return torch.Generator(device="cuda").manual_seed(int(seed))
 
 # ---------- EXAMPLES ----------
 ref_dir = './examples/ref_image'
@@ -334,9 +420,8 @@ def run_local(base_image, base_mask, reference_image, ref_mask, seed, base_mask_
 
     masked_ref_image = pad_to_square(masked_ref_image, pad_value=255, random=False)
 
-    kernel = np.ones((7, 7), np.uint8)
     iterations = 2
-    tar_mask = cv2.dilate(tar_mask, kernel, iterations=iterations)
+    tar_mask = cv2.dilate(tar_mask, KERNEL_7x7, iterations=iterations)
 
     # zoom in
     tar_box_yyxx = get_bbox_from_mask(tar_mask)
@@ -355,8 +440,10 @@ def run_local(base_image, base_mask, reference_image, ref_mask, seed, base_mask_
     tar_mask = pad_to_square(tar_mask, pad_value=0)
     tar_mask = cv2.resize(tar_mask, size)
 
+    # --- Redux (prior) ---
     masked_ref_image = cv2.resize(masked_ref_image.astype(np.uint8), size).astype(np.uint8)
-    pipe_prior_output = redux(Image.fromarray(masked_ref_image))
+    with torch.inference_mode(), torch.autocast("cuda", dtype=dtype):
+        pipe_prior_output = redux(Image.fromarray(masked_ref_image))
 
     tar_image = pad_to_square(tar_image, pad_value=255)
     H2, W2 = tar_image.shape[0], tar_image.shape[1]
@@ -374,16 +461,22 @@ def run_local(base_image, base_mask, reference_image, ref_mask, seed, base_mask_
     mask_diptych[mask_diptych == 1] = 255
     mask_diptych = Image.fromarray(mask_diptych)
 
-    generator = torch.Generator("cuda").manual_seed(seed)
-    edited_image = pipe(
-        image=diptych_ref_tar,
-        mask_image=mask_diptych,
-        height=mask_diptych.size[1],
-        width=mask_diptych.size[0],
-        max_sequence_length=512,
-        generator=generator,
-        **pipe_prior_output,
-    ).images[0]
+    # Reusable CUDA generator
+    generator = make_gen(seed)
+
+    # --- Flux Fill ---
+    with torch.inference_mode(), torch.autocast("cuda", dtype=dtype):
+        edited_image = pipe(
+            image=diptych_ref_tar,
+            mask_image=mask_diptych,
+            height=mask_diptych.size[1],
+            width=mask_diptych.size[0],
+            max_sequence_length=512,
+            generator=generator,
+            num_inference_steps=18,  # tune 12–24 for quality/speed tradeoff
+            guidance_scale=3.5,  # lower often faster and still good
+            **pipe_prior_output,
+        ).images[0]
 
     width, height = edited_image.size
     left = width // 2
@@ -471,4 +564,4 @@ with gr.Blocks() as demo:
         inputs=[base_image, base_mask, ref_image, ref_mask, seed, base_mask_option, ref_mask_option, text_prompt],
         outputs=[baseline_gallery]
     )
-demo.launch()
+demo.launch()