File size: 11,617 Bytes
a383d0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
import os
import cv2
import json
from utils import Doubao, encode_image, image_mask

# Defaults used when this module is run as a script (see the __main__ block).
DEFAULT_IMAGE_PATH = "data/input/test1.png"
# Path handed to Doubao(...); presumably a file holding the API key -- confirm in utils.
DEFAULT_API_PATH = "doubao_api.txt"
# (component name, prompt) pairs. sequential_component_detection sends them to
# the model one at a time, in this order.
PROMPT_LIST = [
    ("header", "Please output the minimum bounding box of the header. Please output the bounding box in the format of <bbox>x1 y1 x2 y2</bbox>. Avoid the blank space in the header."),
    ("sidebar", "Please output the minimum bounding box of the sidebar. Please output the bounding box in the format of <bbox>x1 y1 x2 y2</bbox>. Avoid meaningless blank space in the sidebar."),
    ("navigation", "Please output the minimum bounding box of the navigation. Please output the bounding box in the format of <bbox>x1 y1 x2 y2</bbox>. Avoid the blank space in the navigation."),
    ("main content", "Please output the minimum bounding box of the main content. Please output the bounding box in the format of <bbox>x1 y1 x2 y2</bbox>. Avoid the blank space in the main content."),
]
# Single prompt that asks for all four components in one response; this is the
# prompt the __main__ block sends.
PROMPT_MERGE = "Return the bounding boxes of the sidebar, main content, header, and navigation in this webpage screenshot. Please only return the corresponding bounding boxes. Note: 1. The areas should not overlap; 2. All text information and other content should be framed inside; 3. Try to keep it compact without leaving a lot of blank space; 4. Output a label and the corresponding bounding box for each line."
# Markers that delimit the coordinate list inside a model response.
BBOX_TAG_START = "<bbox>"
BBOX_TAG_END = "</bbox>"

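# Legacy per-component prompts (written in Chinese), kept for reference; unused.
# Each one reads: "Box the position of the <component> in the webpage; please
# return only the corresponding bounding box."
# PROMPT_sidebar = "框出网页中的sidebar的位置,请你只返回对应的bounding box。"
# PROMPT_header = "框出网页中的header的位置,请你只返回对应的bounding box。"
# PROMPT_navigation = "框出网页中的navigation的位置,请你只返回对应的bounding box。"
# PROMPT_main_content = "框出网页中的main content的位置,请你只返回对应的bounding box。"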

# simple version of bbox parsing
def parse_bboxes(bbox_input: str, image_path: str) -> dict[str, tuple[int, int, int, int]]:
    """Parse a multi-line model response into a mapping of component name to bbox.

    Each non-empty line is expected to describe one component, either as
    ``label: <bbox>x1 y1 x2 y2</bbox>`` or as free text mentioning one of the
    known labels (sidebar, header, navigation, main content). A line without a
    colon that mentions none of the known labels is stored under ``'unknown'``.
    If the same name occurs on several lines, the last one wins.

    Args:
        bbox_input: Raw text returned by the model.
        image_path: Path of the screenshot the response refers to. It is only
            used to verify that the image can be read.

    Returns:
        Dict mapping the lower-cased component name to
        ``(x_min, y_min, x_max, y_max)`` exactly as written in the response
        (no rescaling happens here). Empty when the image cannot be read or
        nothing could be parsed.
    """
    bboxes = {}

    # Refuse to parse boxes for a screenshot that cannot be opened.
    if cv2.imread(image_path) is None:
        print(f"Error: Failed to read image {image_path}")
        return bboxes

    if not isinstance(bbox_input, str):
        # e.g. the client returned None; there is nothing to parse.
        print(f"Coordinate parsing failed: expected text, got {type(bbox_input).__name__}")
        print("Final parsed bboxes:", bboxes)
        return bboxes

    # Checked in this order when a line carries no explicit "label:" prefix.
    known_labels = ('sidebar', 'header', 'navigation', 'main content')

    for component in bbox_input.strip().split('\n'):
        component = component.strip()
        if not component:
            continue

        if ':' in component:
            name, bbox_str = component.split(':', 1)
        else:
            bbox_str = component
            lowered = component.lower()
            name = next((label for label in known_labels if label in lowered), 'unknown')

        name = name.strip().lower()
        bbox_str = bbox_str.strip()

        if BBOX_TAG_START not in bbox_str or BBOX_TAG_END not in bbox_str:
            print(f"No bbox tags found in: {bbox_str}")
            continue

        start_idx = bbox_str.find(BBOX_TAG_START) + len(BBOX_TAG_START)
        end_idx = bbox_str.find(BBOX_TAG_END)
        coords_str = bbox_str[start_idx:end_idx].strip()

        try:
            # Accept both "x1 y1 x2 y2" and "x1, y1, x2, y2".
            norm_coords = [int(token) for token in coords_str.replace(',', ' ').split()]
        except ValueError as e:
            print(f"Failed to parse coordinates for {name}: {e}")
            continue

        if len(norm_coords) != 4:
            print(f"Invalid number of coordinates for {name}: {norm_coords}")
            continue

        bboxes[name] = tuple(norm_coords)
        print(f"Successfully parsed {name}: {bboxes[name]}")

    print("Final parsed bboxes:", bboxes)
    return bboxes

def draw_bboxes(image_path: str, bboxes: dict[str, tuple[int, int, int, int]]) -> str:
    """Draw labelled bounding boxes on the image and save the annotated copy.

    The coordinates in ``bboxes`` are interpreted on a 0-1000 scale relative to
    the image width and height and are converted to pixels before drawing.

    Args:
        image_path: Path of the source image.
        bboxes: Mapping of component name to ``(x_min, y_min, x_max, y_max)``.

    Returns:
        Path of the written file, ``data/tmp/<image stem>_with_bboxes.png``,
        or ``""`` if the image could not be read or the result not saved.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Failed to read image {image_path}")
        return ""

    h, w = image.shape[:2]
    # OpenCV colours are in BGR order.
    colors = {
        'sidebar': (0, 0, 255),  # Red
        'header': (0, 255, 0),  # Green
        'navigation': (255, 0, 0),  # Blue
        'main content': (255, 255, 0),  # Cyan
        'unknown': (0, 0, 0),  # Black
    }
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.9
    text_thickness = 2

    for component, norm_bbox in bboxes.items():
        # Convert 0-1000 scaled coordinates to pixel coordinates for drawing
        x_min = int(norm_bbox[0] * w / 1000)
        y_min = int(norm_bbox[1] * h / 1000)
        x_max = int(norm_bbox[2] * w / 1000)
        y_max = int(norm_bbox[3] * h / 1000)

        color = colors.get(component.lower(), (0, 0, 255))
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, 3)

        # Add label. putText anchors the text at its bottom-left corner, so a
        # box touching the top edge (typically the header) would push the
        # label off the image; in that case draw it just inside the box.
        (_, text_height), _ = cv2.getTextSize(component, font, font_scale, text_thickness)
        label_y = y_min - 10
        if label_y - text_height < 0:
            label_y = y_min + text_height + 10
        cv2.putText(image, component, (x_min, label_y),
                    font, font_scale, color, text_thickness)

    # Output directory
    output_dir = "data/tmp"
    os.makedirs(output_dir, exist_ok=True)

    # Name the output after the original file, without its directory or extension
    original_filename = os.path.basename(image_path)
    output_path = os.path.join(output_dir, os.path.splitext(original_filename)[0] + "_with_bboxes.png")

    if cv2.imwrite(output_path, image):
        print(f"Successfully saved annotated image: {output_path}")
        return output_path
    print("Error: Failed to save image")
    return ""

def save_bboxes_to_json(bboxes: dict[str, tuple[int, int, int, int]], image_path: str) -> str:
    """Write the bounding boxes to ``data/tmp/<image stem>_bboxes.json``.

    Args:
        bboxes: Mapping of component name to a 4-tuple of coordinates; each
            tuple is stored as a JSON list.
        image_path: Path of the image the boxes belong to; only its base name
            (without extension) is used to build the output file name.

    Returns:
        The path of the JSON file, or ``""`` if writing it failed.
    """
    target_dir = "data/tmp"
    os.makedirs(target_dir, exist_ok=True)

    stem, _extension = os.path.splitext(os.path.basename(image_path))
    json_path = os.path.join(target_dir, stem + "_bboxes.json")

    # Tuples become lists so the payload is plain JSON data.
    serializable = {}
    for label, box in bboxes.items():
        serializable[label] = list(box)

    try:
        with open(json_path, 'w', encoding='utf-8') as handle:
            json.dump(serializable, handle, indent=4, ensure_ascii=False)
    except Exception as err:
        print(f"Error saving JSON file: {err}")
        return ""
    else:
        print(f"Successfully saved bbox information to: {json_path}")
        return json_path

# Sequential version of bbox detection: query one component at a time and mask each detected region before the next query
def sequential_component_detection(image_path: str, api_path: str) -> dict[str, tuple[int, int, int, int]]:
    """Detect each component in turn, masking the image after each detection.

    For every ``(component, prompt)`` pair in ``PROMPT_LIST`` the current image
    is sent to the model. When a bbox is parsed from the reply, that region is
    masked with ``image_mask`` and the masked image, saved as
    ``data/temp_<component>_masked.png``, becomes the input for the next
    component.

    Args:
        image_path: Path of the original screenshot.
        api_path: Path passed to the ``Doubao`` client constructor.

    Returns:
        Mapping of component name to the four integers parsed from the model
        reply. Components whose image could not be encoded or whose reply
        could not be parsed are omitted. Empty if the input image is unreadable.
    """
    bboxes = {}
    current_image_path = image_path
    ark_client = Doubao(api_path)

    # Only used to fail early on an unreadable input image.
    if cv2.imread(image_path) is None:
        print(f"Error: Failed to read image {image_path}")
        return bboxes

    for step, (component_name, prompt) in enumerate(PROMPT_LIST, start=1):
        print(f"\n=== Processing {component_name} (Step {step}/{len(PROMPT_LIST)}) ===")

        base64_image = encode_image(current_image_path)
        if not base64_image:
            print(f"Error: Failed to encode image for {component_name}")
            continue

        print(f"Sending prompt for {component_name}...")
        bbox_content = ark_client.ask(prompt, base64_image)
        print(f"Model response for {component_name}:")
        print(bbox_content)

        norm_bbox = parse_single_bbox(bbox_content, component_name)
        if not norm_bbox:
            print(f"Failed to detect {component_name}")
            continue

        bboxes[component_name] = norm_bbox
        print(f"Successfully detected {component_name}: {norm_bbox}")

        # NOTE(review): image_mask presumably returns a PIL-style image (it is
        # only required to expose .save here) -- confirm in utils.
        masked_image = image_mask(current_image_path, norm_bbox)

        temp_image_path = f"data/temp_{component_name}_masked.png"
        # Make sure the target directory exists, as the other writers in this
        # file do, so saving does not fail on a fresh checkout.
        os.makedirs(os.path.dirname(temp_image_path), exist_ok=True)
        masked_image.save(temp_image_path)
        current_image_path = temp_image_path

        print(f"Created masked image for next step: {temp_image_path}")

    return bboxes

def parse_single_bbox(bbox_input: str, component_name: str) -> "tuple[int, int, int, int] | None":
    """Extract one ``<bbox>x1 y1 x2 y2</bbox>`` group from a model response.

    Args:
        bbox_input: Raw text returned by the model for a single component.
        component_name: Component label, used only in log messages.

    Returns:
        The four integers found between the first bbox start tag and the first
        bbox end tag, exactly as written in the response (no rescaling), or
        ``None`` if the input is not text, the tags are missing, a value is not
        an integer, or the count is not four.
    """
    print(f"Parsing bbox for {component_name}: {bbox_input}")

    if not isinstance(bbox_input, str):
        # e.g. the client returned None instead of a reply.
        print(f"Failed to parse bbox for {component_name}: response is not text")
        return None

    if BBOX_TAG_START not in bbox_input or BBOX_TAG_END not in bbox_input:
        print(f"No bbox tags found in response for {component_name}")
        return None

    start_idx = bbox_input.find(BBOX_TAG_START) + len(BBOX_TAG_START)
    end_idx = bbox_input.find(BBOX_TAG_END)
    coords_str = bbox_input[start_idx:end_idx].strip()

    try:
        # Accept both "x1 y1 x2 y2" and "x1, y1, x2, y2".
        norm_coords = tuple(int(token) for token in coords_str.replace(',', ' ').split())
    except ValueError as e:
        print(f"Failed to parse bbox for {component_name}: {e}")
        return None

    if len(norm_coords) != 4:
        print(f"Invalid number of coordinates for {component_name}: {list(norm_coords)}")
        return None

    return norm_coords

def main_content_processing(bboxes: dict[str, tuple[int, int, int, int]], image_path: str) -> dict[str, tuple[int, int, int, int]]:
    """Convert bounding boxes from the 0-1000 scale to pixel coordinates.

    NOTE(review): the name and the previous docstring ("divide the main content
    into several parts") suggest a further splitting step was planned; only the
    coordinate conversion is implemented.

    Args:
        bboxes: Mapping of component name to ``(x_min, y_min, x_max, y_max)``
            on a 0-1000 scale. The dict is updated in place.
        image_path: Image whose width and height define the pixel scale.

    Returns:
        The same dict, now holding pixel coordinates, or an empty dict if the
        image cannot be read (in which case ``bboxes`` is left untouched).
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Failed to read image {image_path}")
        return {}
    h, w = image.shape[:2]
    # Iterate over a snapshot so the dict can be rewritten while looping.
    for component, bbox in list(bboxes.items()):
        bboxes[component] = (
            int(bbox[0] * w / 1000),
            int(bbox[1] * h / 1000),
            int(bbox[2] * w / 1000),
            int(bbox[3] * h / 1000))
    return bboxes
    
    
# Script entry point: run the single-prompt ("simple") detection on the default
# image, then save the boxes as JSON and as an annotated image.
if __name__ == "__main__":
    image_path = DEFAULT_IMAGE_PATH
    api_path = DEFAULT_API_PATH

    print("=== Starting Simple Component Detection ===")
    print(f"Input image: {image_path}")
    print(f"API path: {api_path}")
    client = Doubao(api_path)
    base64_image = encode_image(image_path)
    if not base64_image:
        # Same guard as in sequential_component_detection: do not query the
        # model without an image payload.
        print(f"Error: Failed to encode image {image_path}")
        raise SystemExit(1)
    bbox_content = client.ask(PROMPT_MERGE, base64_image)
    print(f"Model response: {bbox_content}\n")
    bboxes = parse_bboxes(bbox_content, image_path)

    # Alternative flow (disabled): detect components one at a time with masking.
    # print("=== Starting Sequential Component Detection ===")
    # print(f"Input image: {image_path}")
    # print(f"API path: {api_path}")
    # bboxes = sequential_component_detection(image_path, api_path)

    if not bboxes:
        print("\nNo valid bounding box coordinates found")
        # SystemExit works even where the site-provided exit() helper is absent.
        raise SystemExit(1)

    print("\n=== Detection Complete ===")
    print(f"Found bounding boxes for components: {list(bboxes.keys())}")
    print(f"Total components detected: {len(bboxes)}")

    save_bboxes_to_json(bboxes, image_path)
    draw_bboxes(image_path, bboxes)

    print("\n=== Results ===")
    for component, bbox in bboxes.items():
        print(f"{component}: {bbox}")