# wufan's picture
# Create app.py
# 4ec6e8c verified
import gradio as gr
from norm_html import normalized_html_table
import re
def clear_table_cells(pred):
    """Empty every ``<td>`` and ``<th>`` cell while keeping the tags themselves.

    Args:
        pred: HTML markup (string) that contains table cells.

    Returns:
        The markup with everything between each cell's opening and closing
        tag removed. Opening tags keep their attributes.

    Note:
        Matching is regex based and non-greedy, so a cell that itself contains
        cells of the same kind is cut at the first inner closing tag.
    """
    flags = re.DOTALL | re.IGNORECASE
    for tag in ("td", "th"):
        # The word boundary after the tag name stops "<th" from matching
        # "<thead>", which would otherwise swallow the markup up to the
        # first "</th>" and corrupt the table structure.
        pattern = rf"(<{tag}\b[^>]*>).*?(</{tag}>)"
        pred = re.sub(pattern, r"\1\2", pred, flags=flags)
    return pred
# Upper bounds the HTML table model applies to span attributes. Clamping to
# them also bounds how many divider elements a single cell can generate.
_SPAN_LIMITS = {"colspan": 1000, "rowspan": 65534}


def _span_of(cell, attr):
    """Return the ``colspan``/``rowspan`` of *cell* as an int in ``[1, limit]``."""
    try:
        span = int(cell.get(attr, 1))
    except (TypeError, ValueError):
        # Empty or non-numeric values are treated like an unmerged cell
        # instead of aborting the whole render.
        return 1
    return min(max(span, 1), _SPAN_LIMITS[attr])


def _divider_style(declarations):
    """Join CSS declarations into a newline-delimited inline style string."""
    return "\n" + "\n".join(declarations) + "\n"


def add_merged_cell_lines(html_content):
    """Add dashed divider lines to merged cells of the first table.

    Every ``<td>``/``<th>`` with ``colspan > 1`` or ``rowspan > 1`` is made
    ``position: relative`` and receives one absolutely positioned ``<div>``
    per internal boundary: vertical dashed lines for spanned columns and
    horizontal dashed lines for spanned rows, spaced evenly in percent.

    Args:
        html_content: HTML markup (string) expected to contain a ``<table>``.

    Returns:
        The re-serialized document with the dividers added, or
        ``html_content`` unchanged when it contains no table or no cells.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')
    if not table:
        return html_content

    # Collect the cells in a single pass. Walking row by row would visit the
    # cells of a nested table twice (once through the outer row, once through
    # their own row) and duplicate their dividers.
    cells = table.find_all(['td', 'th'])
    if not cells:
        return html_content

    for cell in cells:
        colspan = _span_of(cell, 'colspan')
        rowspan = _span_of(cell, 'rowspan')
        if colspan == 1 and rowspan == 1:
            # Only merged cells get dividers.
            continue

        # The dividers are absolutely positioned, so the cell has to be
        # their positioning context.
        style = cell.get('style', '')
        if 'position: relative' not in style:
            cell['style'] = f"{style}; position: relative;".strip('; ')

        # One vertical line per internal column boundary.
        for i in range(1, colspan):
            line = soup.new_tag('div')
            line['style'] = _divider_style((
                "position: absolute;",
                "top: 0;",
                "bottom: 0;",
                f"left: {i * 100 / colspan}%;",
                "width: 0;",
                "border-left: 2px dashed #666;",
                "pointer-events: none;",
                "z-index: 10;",
            ))
            cell.append(line)

        # One horizontal line per internal row boundary.
        for i in range(1, rowspan):
            line = soup.new_tag('div')
            line['style'] = _divider_style((
                "position: absolute;",
                "left: 0;",
                "right: 0;",
                f"top: {i * 100 / rowspan}%;",
                "height: 0;",
                "border-top: 2px dashed #666;",
                "pointer-events: none;",
                "z-index: 10;",
            ))
            cell.append(line)

    return str(soup)
def show_html(pred_input, show_structure, show_merged_cell):
    """Build the HTML preview for the table markup entered by the user.

    Args:
        pred_input: Raw table markup; it is passed through
            ``normalized_html_table`` first (imported from ``norm_html``;
            its exact normalization is not visible here).
        show_structure: When truthy, cell contents are stripped so only the
            table skeleton is shown.
        show_merged_cell: When truthy, merged cells get dashed divider lines.

    Returns:
        The HTML string to display.
    """
    rendered = normalized_html_table(pred_input)
    if show_structure:
        rendered = clear_table_cells(rendered)
    # Borders are applied before the merged-cell pass.
    rendered = add_table_border(rendered)
    return add_merged_cell_lines(rendered) if show_merged_cell else rendered
def add_table_border(pred_html):
    """Add inline border styling to the table and to each of its cells.

    Args:
        pred_html: HTML markup (string) containing a table.

    Returns:
        The markup in which each bare ``<table>`` tag carries the bordered
        class/style and every ``<td``/``<th`` opening tag has a border
        ``style`` attribute inserted right after the tag name. ``<table``
        tags that already have attributes are left as they are.
    """
    pred_html = pred_html.replace(
        "<table>",
        "<table class='table table-bordered' style='border-collapse: collapse; border: 3px solid #333;'>",
    )
    # A word boundary is required after the tag name: a plain "<th" prefix
    # replacement would also rewrite "<thead>" into a broken tag.
    return re.sub(r"<(td|th)\b", r"<\1 style='border: 2px solid #333;'", pred_html)
if __name__ == "__main__":
    # Gradio UI: a textbox for the table markup, two display toggles, a
    # render button and an HTML preview pane.
    # NOTE(review): the nesting of the layout containers is reconstructed
    # from a source whose indentation was lost -- confirm against the
    # intended layout.
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    pred_input = gr.Textbox(
                        label='HTML Table',
                        placeholder='type table html code here',
                        interactive=True,
                    )
                with gr.Row():
                    show_structure = gr.Checkbox(label="只显示表格结构", value=True)
                    show_merged_cell = gr.Checkbox(label="显示合并单元格虚线", value=True)
                show_html_btn = gr.Button("显示HTML")
                pred_html = gr.HTML("<table><td>input HTML here.</td></table>")

        # The button click and either checkbox toggle all re-render the
        # preview with the same inputs and output.
        render_inputs = [pred_input, show_structure, show_merged_cell]
        for register in (show_html_btn.click, show_structure.change, show_merged_cell.change):
            register(show_html, inputs=render_inputs, outputs=[pred_html])

    demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)