import re
import html
import uuid
import subprocess
import unicodedata

from bs4 import BeautifulSoup


def normalized_html_table(text):
    """Normalize the HTML table markup found in ``text`` to a canonical form.

    Steps, in order:

    1. Math spans are reduced to their bare ``\\(...\\)`` / ``\\[...\\]``
       content and ``<u>`` tags are dropped.
    2. If ``text`` contains a ``<table`` tag, the markup is simplified with
       BeautifulSoup (``th`` -> ``td``, ``thead`` and ``span`` unwrapped,
       ``math`` replaced by its ``alttext``), HTML entities are unescaped,
       the text is NFKC-normalized, presentational attributes are removed and
       the inner table content is re-wrapped in a fixed
       ``<html><body><table border="1" >`` shell.
    3. Inline formatting tags are stripped and stray spaces next to tags are
       removed.

    Args:
        text: String that may contain an HTML table.

    Returns:
        The normalized table as a string, or ``''`` when ``text`` contains no
        ``<table`` tag.

    NOTE(review): this function reached review with the markup inside its
    string literals stripped out by an extraction step, which left it
    syntactically invalid. Every literal containing ``<...>`` below was
    reconstructed from the surrounding code (surviving regex fragments, the
    otherwise unused ``html``/``unicodedata`` imports, the replacement
    templates). Confirm them against version control before relying on the
    exact output.
    """

    # Attributes removed regardless of their value.
    presentational_attrs = (
        'style', 'height', 'width', 'colwidth', 'colheight',
        'rowwidth', 'rowheight', 'align', 'class',
    )

    def simplify_markup(html_content):
        """Flatten header, math and span markup; return the serialized HTML."""
        soup = BeautifulSoup(html_content, 'html.parser')
        for th in soup.find_all('th'):
            th.name = 'td'
        for thead in soup.find_all('thead'):
            thead.unwrap()  # unwrap() removes the tag but keeps its content
        for math_tag in soup.find_all('math'):
            alttext = math_tag.get('alttext', '')
            # NOTE(review): the original guarded this with `if alttext:` AFTER
            # wrapping the value in `$...$`, so the guard was always true and a
            # <math> tag without alttext becomes '$$'. Behaviour is kept
            # as-is; decide whether such tags should be left untouched.
            math_tag.replace_with(f'${alttext}$')
        for span in soup.find_all('span'):
            span.unwrap()
        return str(soup)

    def process_table_html(md_i):
        """Return ``(table, table_without_spaces)``; both ``''`` if no table."""
        table_res = ''
        table_res_no_space = ''
        # Tolerate spaces and single quotes when probing for a table tag.
        if '<table' not in md_i.replace(' ', '').replace("'", '"'):
            return table_res, table_res_no_space

        md_i = simplify_markup(md_i)
        table_res = html.unescape(md_i).replace('\n', '')
        table_res = unicodedata.normalize('NFKC', table_res).strip()

        # Keep only what sits between the first <table ...> and the last
        # </table>; the greedy group merges multiple tables into one body.
        pattern = r'<table\b[^>]*>(.*)</table>'
        tables = re.findall(pattern, table_res, re.DOTALL | re.IGNORECASE)
        table_res = ''.join(tables)

        for attr in presentational_attrs:
            table_res = re.sub(rf'( {attr}=".*?")', '', table_res)
        # Spans of 1 are the default, so they carry no information.
        table_res = re.sub(r'( rowspan="1")', '', table_res)
        table_res = re.sub(r'( colspan="1")', '', table_res)
        table_res = re.sub(r'</?tbody>', '', table_res)
        table_res = re.sub(r'\s+', ' ', table_res)

        table_res_no_space = (
            '<html><body><table border="1" >'
            + table_res.replace(' ', '')
            + '</table></body></html>'
        )
        # Removing every space also glued attributes to their tag names;
        # put one separating space back in front of each attribute.
        table_res_no_space = re.sub('colspan="', ' colspan="', table_res_no_space)
        table_res_no_space = re.sub('rowspan="', ' rowspan="', table_res_no_space)
        table_res_no_space = re.sub('border="', ' border="', table_res_no_space)

        table_res = (
            '<html><body><table border="1" >' + table_res + '</table></body></html>'
        )
        return table_res, table_res_no_space

    def clean_table(input_str, flag=True):
        """Strip inline formatting tags and ``<colgroup>`` blocks if ``flag``."""
        if flag:
            input_str = input_str.replace('<sup>', '').replace('</sup>', '')
            input_str = input_str.replace('<sub>', '').replace('</sub>', '')
            input_str = input_str.replace('<span>', '').replace('</span>', '')
            input_str = input_str.replace('<div>', '').replace('</div>', '')
            input_str = input_str.replace('<p>', '').replace('</p>', '')
            input_str = input_str.replace('<spandata-span-identity="">', '')
            input_str = re.sub(r'<colgroup>.*?</colgroup>', '', input_str)
        return input_str

    def process_formula(input_str):
        """Replace math spans with their bare LaTeX delimiters and content."""
        # Inline math formulas.
        inline_pattern = r'<span class="math inline">\\\((.*?)\\\)</span>'
        input_str = re.sub(inline_pattern, r'\(\1\)', input_str)
        # Block-level math formulas (if any).
        block_pattern = r'<span class="math display">\\\[(.*?)\\\]</span>'
        input_str = re.sub(block_pattern, r'\[\1\]', input_str)
        return input_str

    def process_uline(input_str):
        """Drop underline tags, keeping the underlined text."""
        return input_str.replace('<u>', '').replace('</u>', '')

    text = process_formula(text)
    text = process_uline(text)
    # Only the spaced variant is used here; the no-space variant is discarded.
    norm_text, _ = process_table_html(text)
    norm_text = clean_table(norm_text)
    return norm_text.replace('> ', '>').replace(' </td>', '</td>')