Spaces:

MykolaL
/

evp

Running on L4

App Files Files

xet

Community

MykolaL committed on Sep 3

Commit

fcac98a

verified ·

1 Parent(s): c866eb2

Upload app.py

Browse files

Files changed (1) hide show

app.py +28 -24

app.py CHANGED Viewed

@@ -95,34 +95,38 @@ def create_refseg_demo(model, tokenizer, device):
         with torch.no_grad():
             out = model(image_t, text)
         if isinstance(out, np.ndarray):
-            pred = torch.from_numpy(out).to(device)
         else:
-            pred = out
-        pred = pred.float()
-        if pred.dim() == 2:
-            # H×W mask -> N×C×H×W
-            pred = pred.unsqueeze(0).unsqueeze(0)
-            one_channel_mask = True
-        elif pred.dim() == 3:
-            # N×H×W -> add channel
-            pred = pred.unsqueeze(1)
-            one_channel_mask = True
-        elif pred.dim() == 4:
-            # N×C×H×W (logits) -> argmax later
-            one_channel_mask = (pred.shape[1] == 1)
-        pred = torch.nn.functional.interpolate(pred.float(), shape[2:], mode='bilinear', align_corners=True)
-        output_mask = pred.cpu().argmax(1).data.numpy().squeeze()
         alpha = 0.65
-        image[output_mask == 0] = (image[output_mask == 0]*alpha).astype(np.uint8)
-        contours, _ = cv2.findContours(output_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-        cv2.drawContours(image, contours, -1, (0, 255, 0), 2)
-        return Image.fromarray(image)
     submit.click(on_submit, inputs=[input_image, input_text], outputs=refseg_image)
     examples = gr.Examples(examples=[["imgs/test_img2.jpg", "green plant"], ["imgs/test_img3.jpg", "chair"], ["imgs/test_img4.jpg", "left green plant"], ["imgs/test_img5.jpg", "man walking on foot"], ["imgs/test_img5.jpg", "the rightest camel"]],

         with torch.no_grad():
             out = model(image_t, text)
+        # --- normalize to numpy mask ---
         if isinstance(out, np.ndarray):
+            mask = out
+        elif isinstance(out, torch.Tensor):
+            pred = out.float()
+            if pred.dim() == 2:
+                mask = pred.cpu().numpy()
+            elif pred.dim() == 3:
+                # (N,H,W) → squeeze batch
+                mask = pred.squeeze(0).cpu().numpy()
+            elif pred.dim() == 4:
+                # logits (N,C,H,W) → argmax over channel
+                pred = torch.nn.functional.interpolate(pred, size=orig_shape, mode='bilinear', align_corners=True)
+                mask = pred.argmax(1).squeeze().cpu().numpy()
+            else:
+                raise RuntimeError(f"Unexpected output shape {pred.shape}")
         else:
+            raise RuntimeError(f"Unexpected output type {type(out)}")
+        # --- ensure mask is binary uint8 ---
+        if mask.dtype != np.uint8:
+            mask = (mask > 0.5).astype(np.uint8)
+        # --- overlay like your Colab code ---
         alpha = 0.65
+        overlay = image.copy()
+        overlay[mask == 0] = (overlay[mask == 0] * alpha).astype(np.uint8)
+        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+        cv2.drawContours(overlay, contours, -1, (0, 255, 0), 2)
+        return Image.fromarray(overlay)
     submit.click(on_submit, inputs=[input_image, input_text], outputs=refseg_image)
     examples = gr.Examples(examples=[["imgs/test_img2.jpg", "green plant"], ["imgs/test_img3.jpg", "chair"], ["imgs/test_img4.jpg", "left green plant"], ["imgs/test_img5.jpg", "man walking on foot"], ["imgs/test_img5.jpg", "the rightest camel"]],