import re
import html
import uuid
import subprocess
import unicodedata
from bs4 import BeautifulSoup
# Entry point. The statements at the end of this function run, in order,
# process_formula, process_uline, process_table_html and clean_table on
# `text`, then apply two str.replace calls to the result before returning it.
# NOTE(review): every line of this function appears to have lost its leading
# indentation, so the nesting of the helper definitions below has to be
# inferred from how they call each other -- restore from a clean copy.
def normalized_html_table(text):
def process_table_html(md_i):
"""
Reformat the table HTML in ``md_i`` (original note: "pred_md format edit").

Returns a ``(table_res, table_res_no_space)`` tuple.
"""
def process_table_html(html_content):
    """Simplify the markup of an HTML fragment and return it as a string.

    The fragment is parsed with BeautifulSoup's ``html.parser`` and then:

    * every ``<th>`` is renamed to ``<td>``;
    * every ``<thead>`` and ``<span>`` wrapper is removed, keeping its
      children in place;
    * every ``<math>`` tag that carries a non-empty ``alttext`` attribute is
      replaced by the text ``$<alttext>$``.

    Args:
        html_content: HTML text to clean up.

    Returns:
        The serialised, modified document (``str(soup)``).
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Header cells are treated like ordinary data cells.
    for th in soup.find_all('th'):
        th.name = 'td'

    # unwrap() removes the tag itself but keeps its contents.
    for thead in soup.find_all('thead'):
        thead.unwrap()

    for math_tag in soup.find_all('math'):
        alttext = math_tag.get('alttext', '')
        # Test the raw attribute value. The previous code tested the string
        # after wrapping it in '$...$', which is never empty, so a <math> tag
        # without alttext was replaced by a bare '$$' and its content lost.
        if alttext:
            math_tag.replace_with(f'${alttext}$')

    for span in soup.find_all('span'):
        span.unwrap()

    return str(soup)
# Body of the outer process_table_html(md_i).
# NOTE(review): this section is damaged -- HTML-like text inside several string
# literals appears to have been stripped (unterminated strings, an undefined
# name, an invalid regex). The code is left untouched here; restore it from a
# clean copy rather than patching the literals by hand.
table_res=''
table_res_no_space=''
if '
'
# NOTE(review): `pattern` is never assigned in the visible text; its definition
# (and the rest of the `if` condition) was presumably on the damaged lines above.
tables = re.findall(pattern, table_res, re.DOTALL | re.IGNORECASE)
# Concatenate everything the pattern matched.
table_res = ''.join(tables)
# table_res = re.sub('','',table_res)
# Drop presentational attributes (each pattern includes the leading space).
# The style substitution is written twice.
table_res = re.sub('( style=".*?")', "", table_res)
table_res = re.sub('( style=".*?")', "", table_res)
table_res = re.sub('( height=".*?")', "", table_res)
table_res = re.sub('( width=".*?")', "", table_res)
table_res = re.sub('( colwidth=".*?")', "", table_res)
table_res = re.sub('( colheight=".*?")', "", table_res)
table_res = re.sub('( rowwidth=".*?")', "", table_res)
table_res = re.sub('( rowheight=".*?")', "", table_res)
table_res = re.sub('( align=".*?")', "", table_res)
table_res = re.sub('( class=".*?")', "", table_res)
# Spans of exactly 1 are removed; other span values are kept.
table_res = re.sub('( rowspan="1")', "", table_res)
table_res = re.sub('( colspan="1")', "", table_res)
# NOTE(review): '?tbody>' starts with a quantifier and is not a valid regex;
# presumably the leading characters of the pattern were stripped -- confirm.
table_res = re.sub('?tbody>',"",table_res)
# Collapse each run of whitespace into a single space.
table_res = re.sub(r'\s+', " ", table_res)
# NOTE(review): the prefix literal is empty and the suffix literal is
# unterminated; presumably both held wrapper markup -- confirm.
table_res_no_space = '' + table_res.replace(' ','') + '
'
# table_res_no_space = re.sub(' (style=".*?")',"",table_res_no_space)
# table_res_no_space = re.sub(r'[ ]', " ", table_res_no_space)
# Put back the space before these attribute names, which the
# .replace(' ','') above removed.
table_res_no_space = re.sub('colspan="', ' colspan="', table_res_no_space)
table_res_no_space = re.sub('rowspan="', ' rowspan="', table_res_no_space)
table_res_no_space = re.sub('border="', ' border="', table_res_no_space)
# NOTE(review): as written this discards everything computed for table_res
# above; presumably the right-hand side originally wrapped table_res -- confirm.
table_res = ''
# table_flow.append(table_res)
# table_flow_no_space.append(table_res_no_space)
return table_res, table_res_no_space
# Applies a fixed series of str.replace calls and one re.sub to `input_str`
# when `flag` is true, then returns the string.
# NOTE(review): every target literal below is empty or unterminated -- the tag
# text they held appears to have been stripped from this file. Which tags were
# removed cannot be told from here; restore the literals from a clean copy.
def clean_table(input_str,flag=True):
if flag:
input_str = input_str.replace('', '').replace('', '')
input_str = input_str.replace('', '').replace('', '')
input_str = input_str.replace('', '').replace('', '')
input_str = input_str.replace('', '').replace('
', '')
input_str = input_str.replace('', '').replace('
', '')
input_str = input_str.replace('', '')
# NOTE(review): only '.*?' is left of this pattern; presumably it had
# delimiters on both sides -- confirm.
input_str = re.sub('.*?','',input_str)
return input_str
# Rewrites formula markup in `input_str` with two re.sub passes and returns
# the result. Each match is replaced by its first capture group wrapped in
# \( ... \) (first pass) or \[ ... \] (second pass).
# NOTE(review): both pattern literals are unterminated -- their content appears
# to have been stripped from this file, so what they matched cannot be told
# from here. Restore them from a clean copy.
def process_formula(input_str):
# Handle inline math formulas
inline_pattern = r'
'
input_str = re.sub(inline_pattern, r'\(\1\)', input_str)
# Handle block-level math formulas (if any)
block_pattern = r'
'
input_str = re.sub(block_pattern, r'\[\1\]', input_str)
return input_str
def process_uline(input_str):
    """Return ``input_str`` with every ``<u>`` and ``</u>`` tag removed.

    Only the bare, lower-case tags are matched; tags carrying attributes or
    written in upper case are left alone. The text between the tags is kept.

    Args:
        input_str: Text that may contain underline tags.

    Returns:
        The text without ``<u>`` / ``</u>``.
    """
    return input_str.replace('<u>', '').replace('</u>', '')
# Closing statements of normalized_html_table: run the helpers in sequence.
text = process_formula(text)
text = process_uline(text)
# Only the first element of the returned pair is used; the second is discarded.
norm_text, _ = process_table_html(text)
norm_text = clean_table(norm_text)
# NOTE(review): if the second literal is an ordinary space, the first replace
# is redundant because every space is removed anyway -- confirm the character.
return norm_text.replace('> ', '>').replace(" ", "")