from utils import encode_image, Doubao, Qwen_2_5_VL
from PIL import Image
import bs4
from threading import Thread
import time

# This dictionary can now be dynamically updated by an external script.
# Keys are component types; values are free-text styling wishes for that
# component.
# NOTE(review): the prompts below mention "the user's additional conditions"
# but do not interpolate these strings; presumably the external script combines
# them with the prompts - confirm.
user_instruction = {
    "sidebar": "Make all icons look better; fill in relevant English text; beautify the layout.",
    "header": "Make the Google logo look better; change the avatar color to be more appealing.",
    "navigation": "Please beautify the layout.",
    "main content": "Based on the layout, please fill in appropriate English text and beautify the image blocks."
}

# Prompt for each component
# Maps a component type ("sidebar", "header", "navigation", "main content") to
# the instruction text that generate_code() sends to the vision model together
# with the cropped screenshot of that component.
# NOTE(review): the literals carry an `f` prefix but contain no placeholders,
# so they are evaluated once at import time as plain strings.
# NOTE(review): the closing sentence "only return the code within the / and /
# tags" reads as if the tag names are missing from the text - confirm against
# the intended prompt wording.
PROMPT_DICT = {
    "sidebar": f"""This is a screenshot of a container. Please fill in a complete HTML and tail-wind CSS code to accurately reproduce the given container. Please note that the layout, icon style, size, and text information of all blocks need to be basically consistent with the original screenshot based on the user's additional conditions. The following is the code for filling in:
your code here
, only return the code within the
and
tags""",
    "header": f"""This is a screenshot of a container. Please fill in a complete HTML and tail-wind CSS code to accurately reproduce the given container. Please note that the relative position, layout, text information, and color of all blocks in the boundary box need to be basically consistent with the original screenshot based on the user's additional conditions. The following is the code for filling in:
your code here
, only return the code within the
and
tags""",
    "navigation": f"""This is a screenshot of a container. Please fill in a complete HTML and tail-wind CSS code to accurately reproduce the given container. Please note that the relative position, layout, text information, and color of all blocks in the boundary box need to be basically consistent with the original screenshot based on the user's additional conditions. Please use the same icons as in the original screenshot. The following is the code for filling in:
your code here
, only return the code within the
and
tags""",
    "main content": f"""This is a screenshot of a container. Please fill in a complete HTML and tail-wind CSS code to accurately reproduce the given container. Please note that all images displayed in the screenshot must be replaced with pure gray-400 image blocks of the same size as the corresponding images in the original screenshot, and the text information in the images does not need to be recognized. The relative position, layout, text information, and color of all blocks in the boundary box need to be basically consistent with the original screenshot based on the user's additional conditions. The following is the code for filling in:
your code here
, only return the code within the
and
tags""",
}

# PROMPT_sidebar = f"""这是一个container的截图。请填写一段完整的HTML和tail-wind CSS代码以准确再现给定的容器。请注意所有组块的排版、图标样式、大小、文字信息需要在用户额外条件的基础上与原始截图基本保持一致。以下是供填写的代码:
#
# your code here #
# 只需返回
标签内的代码""" # PROMPT_header = f"""这是一个container的截图。请填写一段完整的HTML和tail-wind CSS代码以准确再现给定的容器。请注意所有组块在boundary box中的相对位置、排版、文字信息、颜色需要在用户额外条件的基础上与原始截图基本保持一致。以下是供填写的代码: #
# your code here #
# 只需返回
标签内的代码""" # PROMPT_navigation = f"""这是一个container的截图。请填写一段完整的HTML和tail-wind CSS代码以准确再现给定的容器。请注意所有组块的在boundary box中的相对位置、文字排版、颜色需要在用户额外条件的基础上与原始截图基本保持一致。请你直接使用原始截图中一致的图标。以下是供填写的代码: #
# your code here #
# 只需返回
标签内的代码""" # PROMPT_main_content = f"""这是一个container的截图。请填写一段完整的HTML和tail-wind CSS代码以准确再现给定的容器。截图中显示的图像务必全部用与原始截图中对应图像同样大小的纯灰色图像块替换,不需要识别图像中的文字信息。请注意所有组块在boundary box中的相对位置、排版、文字信息、颜色需要在用户额外条件的基础上与原始截图基本保持一致。以下是供填写的代码: #
# your code here #
# 只需返回
# 标签内的代码"""


# Generate code for each component
def generate_code(bbox_tree, img_path, bot):
    """Generate code for all the leaf nodes in the bounding box tree.

    Every leaf's ``bbox`` region is cropped from the screenshot and sent to
    ``bot.ask`` together with the ``PROMPT_DICT`` entry for the leaf's
    ``type``.

    :param bbox_tree: Root node dict. Nodes carry ``bbox`` (pixel coordinates
        ``[x1, y1, x2, y2]``), ``id``, ``children`` and, for leaves, ``type``.
    :param img_path: Path of the screenshot the boxes refer to.
    :param bot: Object exposing ``ask(prompt, encoded_image)``.
    :return: Dictionary ``{id: code}``. A leaf whose request raised maps to
        ``""``; a leaf with a missing or unknown ``type`` is reported on
        stdout and left out of the dictionary.
    """
    code_dict = {}

    def _visit(node, img):
        children = node.get("children")
        if children:
            for child in children:
                _visit(child, img)
            return

        if "type" not in node:
            print("Node type not found")
            return

        # The component types are exactly the PROMPT_DICT keys, so a lookup
        # replaces the per-type if/elif chain.
        prompt = PROMPT_DICT.get(node["type"])
        if prompt is None:
            print(f"Unknown component type: {node['type']}")
            return

        # bbox is already in pixel coordinates [x1, y1, x2, y2]
        cropped_img = img.crop(node["bbox"])
        try:
            code_dict[node["id"]] = bot.ask(prompt, encode_image(cropped_img))
        except Exception as e:
            # Best effort: one failing component must not abort the others.
            print(f"Error generating code for {node.get('type', 'unknown')}: {str(e)}")
            code_dict[node["id"]] = ""

    # The context manager closes the image file once all leaves are processed.
    with Image.open(img_path) as img:
        _visit(bbox_tree, img)
    return code_dict


# Generate code for each component in parallel
def generate_code_parallel(bbox_tree, img_path, prompt, bot):
    """Generate code for all the leaf nodes in the bounding box tree, one thread per leaf.

    Unlike :func:`generate_code`, the same ``prompt`` is used for every leaf.

    :param bbox_tree: Root node dict (``bbox``, ``id``, ``children`` per node).
    :param img_path: Path of the screenshot; each thread opens its own handle.
    :param prompt: Prompt text sent with every cropped leaf image.
    :param bot: Object exposing ``ask(prompt, encoded_image)``.
    :return: Dictionary ``{id: code}``; a leaf whose request ultimately failed
        maps to ``""``.
    """
    code_dict = {}
    t_list = []

    def _generate_code_with_retry(node, max_retries=3, retry_delay=2):
        """Generate code with retry mechanism for rate limit errors"""
        try:
            # Create a new image instance for each thread
            with Image.open(img_path) as img:
                cropped_img = img.crop(node["bbox"])
                for attempt in range(max_retries):
                    try:
                        code_dict[node["id"]] = bot.ask(prompt, encode_image(cropped_img))
                        return
                    except Exception as e:
                        # Only errors that mention "rate_limit" are retried.
                        if "rate_limit" in str(e).lower() and attempt < max_retries - 1:
                            print(f"Rate limit hit, retrying in {retry_delay} seconds... \n(Attempt {attempt + 1}/{max_retries})")
                            time.sleep(retry_delay)
                            retry_delay *= 2  # Exponential backoff
                        else:
                            print(f"Error generating code for node {node['id']}: {str(e)}")
                            code_dict[node["id"]] = ""
                            return
        except Exception as e:
            print(f"Error processing image for node {node['id']}: {str(e)}")
            code_dict[node["id"]] = ""

    def _spawn(node):
        children = node.get("children")
        if children:
            for child in children:
                _spawn(child)
            return
        t = Thread(target=_generate_code_with_retry, args=(node,))
        t.start()
        t_list.append(t)

    _spawn(bbox_tree)

    # Wait for all threads to complete
    for t in t_list:
        t.join()
    return code_dict


def relative_bbox_percent(bbox, parent_width, parent_height, parent_left, parent_top):
    """Express ``bbox`` as percentages of its parent box.

    :param bbox: ``[x1, y1, x2, y2]`` in the same units as the parent values.
    :param parent_width: Width of the parent box.
    :param parent_height: Height of the parent box.
    :param parent_left: x coordinate of the parent's left edge.
    :param parent_top: y coordinate of the parent's top edge.
    :return: Tuple ``(left, top, width, height)`` in percent.
    """
    left = (bbox[0] - parent_left) / parent_width * 100
    top = (bbox[1] - parent_top) / parent_height * 100
    width = (bbox[2] - bbox[0]) / parent_width * 100
    height = (bbox[3] - bbox[1]) / parent_height * 100
    return left, top, width, height


# Generate HTML from the bounding box tree
def generate_html(bbox_tree, output_file="output.html", img_path="data/test1.png"):
    """
    Generates an HTML file with nested containers based on the bounding box tree.

    The root node itself is not rendered; each of its descendants becomes an
    absolutely positioned ``div`` whose ``id`` is the node's ``id`` and whose
    position and size are percentages of the parent box.

    :param bbox_tree: Dictionary representing the bounding box tree.
    :param output_file: The name of the output HTML file.
    :param img_path: Not used by this function; kept so existing callers that
        pass it keep working.
    """
    # HTML and CSS templates
    # the container class is used to create grid and position the boxes
    # include the tailwind css in the head tag
    # NOTE(review): the markup literals in the reviewed copy of this function
    # contained no tags at all (only the text "Bounding Boxes Layout"), so the
    # markup below is a reconstruction from the surrounding comments and from
    # code_substitution(), which looks boxes up by id. Confirm the CSS against
    # the intended layout.
    html_template_start = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Bounding Boxes Layout</title>
    <style>
        body, html {
            margin: 0;
            padding: 0;
            width: 100%;
            height: 100%;
        }
        .container {
            position: relative;
            width: 100%;
            height: 100%;
            box-sizing: border-box;
        }
        .box {
            position: absolute;
            box-sizing: border-box;
            overflow: hidden;
        }
        .box > .container {
            display: grid;
            width: 100%;
            height: 100%;
        }
    </style>
    <script src="https://cdn.tailwindcss.com"></script>
</head>
<body>
    <div class="container">
"""

    html_template_end = """
    </div>
</body>
</html>
"""

    # Function to recursively generate HTML
    def process_bbox(node, parent_width, parent_height, parent_left, parent_top):
        bbox = node["bbox"]
        node_id = node["id"]

        # Calculate relative positions and sizes
        left, top, width, height = relative_bbox_percent(
            bbox, parent_width, parent_height, parent_left, parent_top
        )

        # Start the box div
        parts = [
            f'<div id="{node_id}" class="box" '
            f'style="left: {left}%; top: {top}%; width: {width}%; height: {height}%;">\n'
        ]

        children = node.get("children", [])
        if children:
            # If there are children, add a nested container
            parts.append('<div class="container">\n')
            # The current box's pixel size and origin are the reference frame
            # for its children.
            current_width = bbox[2] - bbox[0]
            current_height = bbox[3] - bbox[1]
            for child in children:
                parts.append(process_bbox(child, current_width, current_height, bbox[0], bbox[1]))
            parts.append("</div>\n")

        # Close the box div
        parts.append("</div>\n")
        return "".join(parts)

    root_left, root_top, root_right, root_bottom = bbox_tree["bbox"]
    # Width/height are extents, not the right/bottom coordinates; the two only
    # coincide when the root box starts at (0, 0).
    root_width = root_right - root_left
    root_height = root_bottom - root_top

    html_content = html_template_start
    for child in bbox_tree.get("children", []):
        html_content += process_bbox(child, root_width, root_height, root_left, root_top)
    html_content += html_template_end

    soup = bs4.BeautifulSoup(html_content, "html.parser")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(soup.prettify())


# Substitute the code in the html file
def code_substitution(html_file, code_dict):
    """Insert the generated code into the matching boxes of the HTML file.

    :param html_file: Path of the layout file; it is read, modified and
        written back in place.
    :param code_dict: Dictionary ``{id: code}``. Markdown code fences are
        removed from ``code`` before it is parsed. Empty or ``None`` entries
        and ids that have no element in the file are skipped.
    """
    with open(html_file, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    for node_id, code in code_dict.items():
        if not code:
            # Nothing was generated for this node (e.g. the request failed).
            continue
        code = code.replace("```html", "").replace("```", "")
        # Attribute values in the parsed document are strings.
        div = soup.find(id=str(node_id))
        # Append the generated markup inside the box div.
        if div is not None:
            div.append(bs4.BeautifulSoup(code, "html.parser"))

    with open(html_file, "w", encoding="utf-8") as f:
        f.write(soup.prettify())


def assign_id(node, id):
    """Number ``node`` and all its descendants in pre-order.

    :param node: Tree node dict; children are read from ``node["children"]``.
    :param id: Id given to ``node``; descendants receive consecutive values.
    :return: The last id that was assigned inside this subtree.
    """
    node["id"] = id
    last_id = id
    for child in node.get("children", []):
        last_id = assign_id(child, last_id + 1)
    return last_id


# Main
if __name__ == "__main__":
    import json

    img_path = "data/input/test1.png"
    layout_path = "data/output/test1_layout.html"

    # Load bboxes from block_parsing.py output
    with open("data/tmp/test1_bboxes.json", encoding="utf-8") as f:
        boxes_data = json.load(f)

    with Image.open(img_path) as img:
        width, height = img.size

    # Create root node with actual image dimensions
    root = {
        "bbox": [0, 0, width, height],  # Use actual image dimensions
        "children": [],
    }

    # Map region IDs to component types
    region_type_mapping = {
        "1": "sidebar",
        "2": "header",
        "3": "navigation",
        "4": "main content",
    }

    # Add each region as a child with its type
    for region in boxes_data.get("regions", []):
        # Convert normalized coordinates to pixel coordinates
        x = region["x"] * width
        y = region["y"] * height
        w = region["w"] * width
        h = region["h"] * height
        root["children"].append({
            "bbox": [x, y, x + w, y + h],  # Convert to [x1, y1, x2, y2] format
            "children": [],
            # str() lets an integer id in the JSON match the string keys.
            "type": region_type_mapping.get(str(region["id"]), "unknown"),
        })

    # Assign IDs to all nodes
    assign_id(root, 0)
    # print(root)

    # Generate initial HTML layout
    generate_html(root, layout_path, img_path)

    # Initialize the bot
    bot = Doubao("doubao_api.txt", model="doubao-1.5-thinking-vision-pro-250428")
    # bot = Qwen_2_5_VL("qwen_api.txt", model="qwen2.5-vl-72b-instruct")

    # Generate code for each component
    code_dict = generate_code(root, img_path, bot)

    # Substitute the generated code into the HTML
    code_substitution(layout_path, code_dict)