from utils import encode_image, Doubao, Qwen_2_5_VL
from PIL import Image
import bs4
from threading import Thread
import time
# This dictionary can now be dynamically updated by an external script.
# It maps each component type ("sidebar", "header", "navigation",
# "main content") to the user's free-text styling request for that component.
# NOTE(review): nothing in the visible code reads this dictionary; confirm
# where these strings are meant to be consumed before relying on them.
user_instruction = {
    "sidebar": "Make all icons look better; fill in relevant English text; beautify the layout.",
    "header": "Make the Google logo look better; change the avatar color to be more appealing.",
    "navigation": "Please beautify the layout.",
    "main content": "Based on the layout, please fill in appropriate English text and beautify the image blocks."
}
# Prompt for each component
# Every prompt shares the same opening task statement and the same closing
# "fill in the <div>" instruction; only the fidelity requirements in between
# differ per component type.
_PROMPT_INTRO = (
    "This is a screenshot of a container. Please fill in a complete HTML and "
    "tail-wind CSS code to accurately reproduce the given container. "
)

# NOTE(review): the tag names had been stripped from the prompts (they read
# "only return the code within the and tags"). <div>/</div> is restored here
# because the generated snippet is inserted into a layout <div> -- confirm.
_PROMPT_FILL_IN = (
    " The following is the code for filling in:\n"
    "<div>\n"
    "your code here\n"
    "</div>,\n"
    "only return the code within the <div> and </div> tags"
)


def _build_prompt(requirements):
    """Wrap component-specific *requirements* in the shared intro and fill-in instruction."""
    return f"{_PROMPT_INTRO}{requirements}{_PROMPT_FILL_IN}"


# Maps a component type to the full prompt sent to the model for that type.
PROMPT_DICT = {
    "sidebar": _build_prompt(
        "Please note that the layout, icon style, size, and text information "
        "of all blocks need to be basically consistent with the original "
        "screenshot based on the user's additional conditions."
    ),
    "header": _build_prompt(
        "Please note that the relative position, layout, text information, "
        "and color of all blocks in the boundary box need to be basically "
        "consistent with the original screenshot based on the user's "
        "additional conditions."
    ),
    "navigation": _build_prompt(
        "Please note that the relative position, layout, text information, "
        "and color of all blocks in the boundary box need to be basically "
        "consistent with the original screenshot based on the user's "
        "additional conditions. Please use the same icons as in the original "
        "screenshot."
    ),
    "main content": _build_prompt(
        "Please note that all images displayed in the screenshot must be "
        "replaced with pure gray-400 image blocks of the same size as the "
        "corresponding images in the original screenshot, and the text "
        "information in the images does not need to be recognized. The "
        "relative position, layout, text information, and color of all "
        "blocks in the boundary box need to be basically consistent with the "
        "original screenshot based on the user's additional conditions."
    ),
}
# Generate code for each component
def generate_code(bbox_tree, img_path, bot):
    """Generate code for all the leaf nodes in the bounding box tree.

    Each leaf's ``bbox`` region is cropped out of the image at *img_path* and
    sent to *bot* together with the prompt registered in ``PROMPT_DICT`` for
    the leaf's ``type``.

    :param bbox_tree: Root node dict. Nodes carry ``id``, ``bbox`` and
        ``children``; leaves additionally carry ``type``.
    :param img_path: Path of the screenshot the bounding boxes refer to.
    :param bot: Object exposing ``ask(prompt, encoded_image)``.
    :return: Dictionary ``{node id: generated code}``. A leaf whose request
        raises maps to ``""``; a leaf with a missing or unknown ``type`` is
        reported on stdout and left out of the dictionary.
    """
    code_dict = {}

    def _generate_code(node, img):
        children = node.get("children")
        if children:
            for child in children:
                _generate_code(child, img)
            return

        # Select prompt based on node type
        if "type" not in node:
            print("Node type not found")
            return
        prompt = PROMPT_DICT.get(node["type"])
        if prompt is None:
            print(f"Unknown component type: {node['type']}")
            return

        # bbox is already in pixel coordinates [x1, y1, x2, y2]
        cropped_img = img.crop(node["bbox"])
        try:
            code_dict[node["id"]] = bot.ask(prompt, encode_image(cropped_img))
        except Exception as e:
            # Best effort: one failed component must not abort the others.
            print(f"Error generating code for {node.get('type', 'unknown')}: {str(e)}")
            code_dict[node["id"]] = ""

    # Keep the image open only for the duration of the traversal so the
    # underlying file handle is always released.
    with Image.open(img_path) as img:
        _generate_code(bbox_tree, img)
    return code_dict
# Generate code for each component in parallel
def generate_code_parallel(bbox_tree, img_path, prompt, bot):
    """Generate code for all the leaf nodes in the bounding box tree, one thread per leaf.

    :param bbox_tree: Root node dict. Nodes carry ``id``, ``bbox`` and
        ``children``.
    :param img_path: Path of the screenshot; every worker opens its own copy.
    :param prompt: Prompt text sent with every cropped region.
    :param bot: Object exposing ``ask(prompt, encoded_image)``.
    :return: Dictionary ``{node id: generated code}``; a leaf whose processing
        fails maps to ``""``.
    """
    code_dict = {}
    t_list = []

    def _generate_code_with_retry(node, max_retries=3, retry_delay=2):
        """Generate code with retry mechanism for rate limit errors"""
        try:
            # Create a new image instance for each thread
            with Image.open(img_path) as img:
                cropped_img = img.crop(node["bbox"])
                for attempt in range(max_retries):
                    try:
                        code_dict[node["id"]] = bot.ask(prompt, encode_image(cropped_img))
                        return
                    except Exception as e:
                        is_rate_limit = "rate_limit" in str(e).lower()
                        if is_rate_limit and attempt < max_retries - 1:
                            print(f"Rate limit hit, retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{max_retries})")
                            time.sleep(retry_delay)
                            retry_delay *= 2  # Exponential backoff
                        else:
                            print(f"Error generating code for node {node['id']}: {str(e)}")
                            code_dict[node["id"]] = ""
                            return
        except Exception as e:
            # Anything that fails before the request loop (opening or
            # cropping the image) still yields an entry for the node.
            print(f"Error processing image for node {node['id']}: {str(e)}")
            code_dict[node["id"]] = ""

    def _generate_code(node):
        children = node.get("children")
        if children:
            for child in children:
                _generate_code(child)
            return
        # Pass the node explicitly instead of capturing it in a closure.
        t = Thread(target=_generate_code_with_retry, args=(node,))
        t.start()
        t_list.append(t)

    _generate_code(bbox_tree)
    # Wait for all threads to complete
    for t in t_list:
        t.join()
    return code_dict
# Generate HTML from the bounding box tree
def generate_html(bbox_tree, output_file="output.html", img_path="data/test1.png"):
    """
    Generates an HTML file with nested containers based on the bounding box tree.

    Every non-root node becomes an absolutely positioned ``<div>`` whose
    ``id`` attribute is the node's ``id`` and whose position and size are
    percentages of its parent's bounding box.

    :param bbox_tree: Dictionary representing the bounding box tree. Nodes
        carry ``bbox`` (``[x1, y1, x2, y2]``), ``id`` and optional ``children``.
    :param output_file: The name of the output HTML file.
    :param img_path: Not used by the layout; accepted so existing callers
        that pass it keep working.
    """
    # NOTE(review): the markup in these templates had been stripped (the
    # geometry below was computed but never emitted). It is rebuilt here from
    # the surrounding comments -- confirm the class names and CSS.
    #
    # The container class positions the boxes; Tailwind is loaded in <head>
    # so the generated component code can use its utility classes.
    html_template_start = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Bounding Boxes Layout</title>
        <script src="https://cdn.tailwindcss.com"></script>
        <style>
            html, body {
                margin: 0;
                padding: 0;
                width: 100%;
                height: 100%;
            }
            .container {
                position: relative;
                width: 100%;
                height: 100%;
                /* Tailwind's own .container utility caps max-width per breakpoint. */
                max-width: none !important;
                box-sizing: border-box;
            }
            .box {
                position: absolute;
                box-sizing: border-box;
                overflow: hidden;
            }
        </style>
    </head>
    <body>
        <div class="container">
    """
    html_template_end = """
        </div>
    </body>
    </html>
    """

    # Function to recursively generate HTML
    def process_bbox(node, parent_width, parent_height, parent_left, parent_top):
        x1, y1, x2, y2 = node['bbox']
        children = node.get('children', [])
        node_id = node['id']

        # The box's own size in pixels; also the reference frame for children.
        box_width = x2 - x1
        box_height = y2 - y1

        # Calculate relative positions and sizes
        left = (x1 - parent_left) / parent_width * 100
        top = (y1 - parent_top) / parent_height * 100
        width = box_width / parent_width * 100
        height = box_height / parent_height * 100

        # Start the box div
        parts = [
            f'<div id="{node_id}" class="box" '
            f'style="left: {left}%; top: {top}%; width: {width}%; height: {height}%;">'
        ]
        if children:
            # If there are children, add a nested container
            parts.append('<div class="container">')
            parts.extend(
                process_bbox(child, box_width, box_height, x1, y1)
                for child in children
            )
            parts.append('</div>')
        # Close the box div
        parts.append('</div>')
        return "\n".join(parts)

    root_x, root_y, root_x2, root_y2 = bbox_tree['bbox']
    # Measure the root from its own origin so a root that does not start at
    # (0, 0) still yields correct percentages.
    root_width = root_x2 - root_x
    root_height = root_y2 - root_y

    body = "\n".join(
        process_bbox(child, root_width, root_height, root_x, root_y)
        for child in bbox_tree.get('children', [])
    )
    html_content = html_template_start + body + html_template_end

    soup = bs4.BeautifulSoup(html_content, 'html.parser')
    # The document declares UTF-8, so write it as UTF-8 on every platform.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(soup.prettify())
# Substitute the code in the html file
def code_substitution(html_file, code_dict):
    """Insert the generated code into the matching elements of the html file.

    The file is rewritten in place.

    :param html_file: Path of the HTML file to update.
    :param code_dict: Dictionary ``{element id: code}``. Markdown code fences
        are removed from each snippet before it is parsed. Empty or ``None``
        snippets and ids with no matching element are skipped.
    """
    with open(html_file, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), 'html.parser')

    for node_id, code in code_dict.items():
        if not code:
            # Nothing to insert (a failed generation is stored as "").
            continue
        code = code.replace("```html", "").replace("```", "")
        div = soup.find(id=str(node_id))
        # Append the parsed snippet to the element's existing content.
        if div is not None:
            div.append(bs4.BeautifulSoup(code, 'html.parser'))

    # Model output may contain non-ASCII text; write UTF-8 explicitly so the
    # result does not depend on the platform's default encoding.
    with open(html_file, "w", encoding="utf-8") as f:
        f.write(soup.prettify())
# Main
if __name__ == "__main__":
    import json

    bboxes_path = "data/tmp/test1_bboxes.json"
    img_path = "data/input/test1.png"
    layout_path = "data/output/test1_layout.html"

    # Load bboxes from block_parsing.py output
    with open(bboxes_path, encoding="utf-8") as f:
        boxes_data = json.load(f)

    with Image.open(img_path) as img:
        width, height = img.size

    # Create root node with actual image dimensions
    root = {
        "bbox": [0, 0, width, height],  # Use actual image dimensions
        "children": [],
    }

    # Map region IDs to component types
    region_type_mapping = {
        "1": "sidebar",
        "2": "header",
        "3": "navigation",
        "4": "main content",
    }

    # Add each region as a child with its type
    for region in boxes_data.get("regions", []):
        # Convert normalized coordinates to pixel coordinates
        x = region["x"] * width
        y = region["y"] * height
        w = region["w"] * width
        h = region["h"] * height
        root["children"].append({
            "bbox": [x, y, x + w, y + h],  # Convert to [x1, y1, x2, y2] format
            "children": [],
            # The mapping is keyed by strings; normalise so a numeric id in
            # the JSON resolves to the same component type.
            "type": region_type_mapping.get(str(region["id"]), "unknown"),
        })

    # Assign IDs to all nodes
    def assign_id(node, next_id):
        """Number *node* and its descendants depth-first; return the last id used."""
        node["id"] = next_id
        for child in node.get("children", []):
            next_id = assign_id(child, next_id + 1)
        return next_id

    assign_id(root, 0)

    # Generate initial HTML layout
    generate_html(root, layout_path, img_path=img_path)

    # Initialize the bot
    bot = Doubao("doubao_api.txt", model="doubao-1.5-thinking-vision-pro-250428")
    # bot = Qwen_2_5_VL("qwen_api.txt", model="qwen2.5-vl-72b-instruct")

    # Generate code for each component
    code_dict = generate_code(root, img_path, bot)

    # Substitute the generated code into the HTML
    code_substitution(layout_path, code_dict)