Alessio Grancini
committed on
Update app.py
app.py
CHANGED
@@ -146,7 +146,6 @@ def get_camera_matrix(depth_estimator):
         "cy": depth_estimator.cy_depth
     }
 
-
 def encode_base64_image(image_array):
     """
     Encodes a NumPy (OpenCV) image array to a base64-encoded PNG DataURL
@@ -168,78 +167,84 @@ def encode_base64_image(image_array):
     # Return a data URL
     return "data:image/png;base64," + b64_str
 
+def generate_image_url(image):
+    """Generate a shareable URL for an OpenCV image."""
+    success, encoded_buffer = cv2.imencode(".png", image)
+    if not success:
+        raise ValueError("Could not encode image to PNG buffer")
+
+    b64_str = base64.b64encode(encoded_buffer).decode("utf-8")
+    return "data:image/png;base64," + b64_str
+
+def get_3d_position(center, depth, camera_matrix):
+    """Project 2D center into 3D space using depth and camera matrix."""
+    cx, cy = center
+    fx, fy = camera_matrix["fx"], camera_matrix["fy"]
+    cx_d, cy_d = camera_matrix["cx"], camera_matrix["cy"]
+
+    x = (cx - cx_d) * depth / fx
+    y = (cy - cy_d) * depth / fy
+    z = depth
+
+    return [x, y, z]
+
+def get_bbox_from_mask(mask):
+    """Get bounding box (x1, y1, x2, y2) from a binary mask."""
+    contours, _ = cv2.findContours(mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    biggest_contour = max(contours, key=cv2.contourArea)
+    x, y, w, h = cv2.boundingRect(biggest_contour)
+    return x, y, x+w, y+h
 
 @spaces.GPU
 def get_detection_data(image_data):
-    """
-    Get structured detection data with depth information, using a nested JSON + Base64 image.
-    Expects Lens Studio to send:
-    {
-        "image": {
-            "image": {
-                "data": "data:image/png;base64,<BASE64>"
-            }
-        }
-    }
-    or just a direct string.
-    """
     try:
-        # 1) Extract the nested "data" string if it's a dict
         if isinstance(image_data, dict):
-            # For the structure: {"image": {"image": {"data": "data:image/png;base64,..."}}}
             nested_dict = image_data.get("image", {}).get("image", {})
             full_data_url = nested_dict.get("data", "")
         else:
-            # If not a dict, assume it's a direct string
             full_data_url = image_data
 
         if not full_data_url:
             return {"error": "No base64 data found in input."}
 
-        # 2) Strip the "data:image/..." prefix if present
         if full_data_url.startswith("data:image"):
-            # split once on comma => ["data:image/png;base64", "<BASE64>"]
             _, b64_string = full_data_url.split(",", 1)
         else:
             b64_string = full_data_url
 
-        # 3) Decode base64 -> PIL -> OpenCV
         img_data = base64.b64decode(b64_string)
         img = Image.open(BytesIO(img_data))
         img = np.array(img)
         img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
 
-        # 4) Process image
         image = utils.resize(img)
         image_segmentation, objects_data = img_seg.predict(image)
         depthmap, depth_colormap = depth_estimator.make_prediction(image)
 
-
-        processed_objects = []
+        detections = []
         for obj in objects_data:
-            cls_id, cls_name, center, mask,
-
-
-
-
-
-                "class_id": int(cls_id),
+            cls_id, cls_name, center, mask, _ = obj
+            x1, y1, x2, y2 = get_bbox_from_mask(mask)
+            depth_value = depth_at_center(depthmap, [x1, y1, x2, y2])
+
+            detections.append({
+                "class_id": cls_id,
                 "class_name": cls_name,
-                "
-
-
+                "bounding_box": {
+                    "vertices": get_box_vertices([x1, y1, x2, y2])
+                },
+                "position_3d": get_3d_position(center, depth_value, get_camera_matrix(depth_estimator)),
+                "distance": depth_value
             })
 
         response = {
-            "detections":
-            "
-            "
-            "
-
-
-
-                "cy": depth_estimator.cy_depth
-            }
+            "detections": detections,
+            "segmentation_url": generate_image_url(image_segmentation),
+            "depth_url": generate_image_url(depth_colormap),
+            "distance_url": generate_image_url(utils.draw_depth_info(image, depthmap, objects_data)),
+            "point_cloud_url": generate_plot_url(utils.generate_obj_pcd(depthmap, objects_data)),
+            "camera_matrix": get_camera_matrix(depth_estimator),
+            "camera_position": [0, 0, 0]  # Assumed at origin based on camera intrinsics
         }
         return response
 
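Note: the new get_3d_position helper is a standard pinhole-camera back-projection of a pixel into camera space. A quick numeric sanity check, using made-up intrinsics (the fx, fy, cx, cy values below are illustrative, not the depth estimator's real ones):

# Sanity check for get_3d_position, with illustrative intrinsics.
camera_matrix = {"fx": 500.0, "fy": 500.0, "cx": 320.0, "cy": 240.0}

# A pixel 100 px right of and 100 px below the principal point, 2 m away:
position = get_3d_position((420, 340), 2.0, camera_matrix)

# x = (420 - 320) * 2.0 / 500 = 0.4
# y = (340 - 240) * 2.0 / 500 = 0.4
# z = 2.0
assert position == [0.4, 0.4, 2.0]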
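Note: get_bbox_from_mask takes the largest connected component (via cv2.findContours and cv2.boundingRect), and it will raise a ValueError on an all-zero mask because max() receives an empty contour list. A toy check of the expected behavior, assuming the helper above is in scope:

import numpy as np

# 10x10 binary mask with a filled 3-row x 4-column block.
mask = np.zeros((10, 10), dtype=np.uint8)
mask[2:5, 3:7] = 1

x1, y1, x2, y2 = get_bbox_from_mask(mask)
# boundingRect gives x=3, y=2, w=4, h=3, so the box is (3, 2, 7, 5).
assert (x1, y1, x2, y2) == (3, 2, 7, 5)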
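Note: the rewritten detection loop also calls depth_at_center and get_box_vertices, which do not appear in this diff and are presumably defined elsewhere in app.py. Minimal sketches of what the call sites imply they do; these are assumptions, not the actual implementations:

# Assumed behavior only -- the real helpers live elsewhere in app.py.
def depth_at_center(depthmap, bbox):
    """Sample the depth map at the center of an (x1, y1, x2, y2) box."""
    x1, y1, x2, y2 = bbox
    cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
    return float(depthmap[cy, cx])

def get_box_vertices(bbox):
    """Expand (x1, y1, x2, y2) into the four corners of the box."""
    x1, y1, x2, y2 = bbox
    return [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]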
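Note: per the docstring removed in this commit, callers (e.g. Lens Studio) send the image nested as {"image": {"image": {"data": "data:image/png;base64,..."}}} or as a bare data-URL string. A minimal caller sketch, assuming a local test.png purely for illustration:

import base64

# Build the nested payload shape the endpoint's isinstance branch expects.
with open("test.png", "rb") as f:
    b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {"image": {"image": {"data": "data:image/png;base64," + b64}}}
result = get_detection_data(payload)

# On success, each detection carries class, distance, and 3D position.
for det in result.get("detections", []):
    print(det["class_name"], det["distance"], det["position_3d"])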