Spaces:
Running
Running
from smolagents.tools import Tool | |
import cv2 | |
import numpy as np | |
import os | |
import json | |
from typing import Optional | |
import logging | |
logger = logging.getLogger(__name__) | |
class DetectElementsTool(Tool):
    """Detect table-like regions or text boxes in a screenshot with OpenCV.

    The screenshot is converted to grayscale, blurred, and run through Canny
    edge detection. The bounding rectangle of every external contour is then
    filtered by area and aspect ratio, and the surviving rectangles are drawn
    onto a copy of the image saved next to the input file.
    """

    name = "detect_elements"
    description = "Detects table-like structures or text boxes in a screenshot using OpenCV."
    inputs = {
        "screenshot_path": {"type": "string", "nullable": True, "description": "Path to the screenshot"},
        "element_type": {"type": "string", "default": "table", "nullable": False, "description": "Type: 'table' or 'textbox'"},
    }
    output_type = "string"

    def __init__(self, driver=None):
        """Create the tool.

        Args:
            driver: Optional driver object. It is only stored on the
                instance; nothing in this class reads it.
        """
        super().__init__()
        self.driver = driver  # Store driver for consistency, even if unused
        self.is_initialized = True  # No dependency on driver, so always True
        logger.debug("DetectElementsTool initialized: is_initialized=%s", self.is_initialized)

    def forward(self, screenshot_path: Optional[str], element_type: str = "table") -> str:
        """Detect elements of ``element_type`` in the image at ``screenshot_path``.

        Args:
            screenshot_path: Path of the image file to analyse.
            element_type: ``"table"`` (bounding box area > 10000 and aspect
                ratio strictly between 0.5 and 2.0) or ``"textbox"`` (area
                > 500 and aspect ratio > 2.0).

        Returns:
            A JSON string with the keys ``"detections"`` (a list of
            ``{"type", "bbox": [x, y, w, h]}`` entries) and
            ``"output_image"`` (path of the annotated copy) when at least one
            element matched; otherwise a plain-text status or error message.
            Errors are reported through the returned string, never raised.
        """
        if not self.is_initialized:
            return "Error: DetectElementsTool is not initialized"
        # The input is declared nullable, so reject a missing path explicitly
        # instead of letting os.path.exists() fail with a TypeError.
        if not screenshot_path:
            return "Error: screenshot_path is required"
        if element_type not in ("table", "textbox"):
            return (
                f"Error: unsupported element_type '{element_type}'; "
                "expected 'table' or 'textbox'"
            )
        try:
            if not os.path.exists(screenshot_path):
                return f"Screenshot not found: {screenshot_path}"

            # Read and preprocess image. cv2.imread signals an unreadable or
            # undecodable file by returning None rather than raising.
            image = cv2.imread(screenshot_path)
            if image is None:
                return f"Failed to detect elements: could not read image {screenshot_path}"
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            blurred = cv2.GaussianBlur(gray, (5, 5), 0)
            edges = cv2.Canny(blurred, 50, 150)

            # Detect contours
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            detections = []
            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                area = w * h
                aspect_ratio = w / h if h > 0 else 0
                if element_type == "table":
                    # Tables: large and roughly square-ish
                    matched = area > 10000 and 0.5 < aspect_ratio < 2.0
                else:
                    # Text boxes: smaller, clearly wider than tall
                    matched = area > 500 and aspect_ratio > 2.0
                if matched:
                    detections.append({"type": element_type, "bbox": [x, y, w, h]})

            # Nothing to annotate or report, so skip writing an unchanged copy.
            if not detections:
                return "No elements detected"

            # Build the output name from the real extension. A plain
            # str.replace(".png", ...) leaves non-".png" paths unchanged,
            # which made imwrite overwrite the original screenshot.
            root, ext = os.path.splitext(screenshot_path)
            output_path = f"{root}_detected{ext or '.png'}"

            # Draw bounding boxes on a copy of the image (colours are BGR)
            color = (0, 255, 0) if element_type == "table" else (0, 0, 255)
            output_image = image.copy()
            for detection in detections:
                x, y, w, h = detection["bbox"]
                cv2.rectangle(output_image, (x, y), (x + w, y + h), color, 2)

            # imwrite reports some failures via a False return value; do not
            # hand back a path to a file that was never written.
            if not cv2.imwrite(output_path, output_image):
                return f"Failed to detect elements: could not write {output_path}"

            return json.dumps({
                "detections": detections,
                "output_image": output_path,
            })
        except Exception as e:
            # Tool boundary: keep the traceback in the log and report the
            # failure as a string, as callers of this tool expect.
            logger.exception("Element detection failed for %s", screenshot_path)
            return f"Failed to detect elements: {str(e)}"