Spaces:
Running
Running
import re | |
import html | |
import uuid | |
import subprocess | |
import unicodedata | |
from bs4 import BeautifulSoup | |
def normalized_html_table(text: str) -> str:
    """Return a canonical HTML form of the table(s) found in *text*.

    The input is passed through formula and underline preprocessing, the
    table markup is simplified (``<th>`` -> ``<td>``, ``<thead>``/``<span>``
    unwrapped, ``<math alttext=...>`` replaced by ``$alttext$``), presentation
    attributes are stripped, and the result is wrapped as
    ``<html><body><table>...</table></body></html>``.

    Args:
        text: Markup that may contain one or more ``<table>`` elements.

    Returns:
        The normalized table markup, or ``''`` when *text* contains no
        ``<table`` substring (ignoring spaces).
    """
    # Attributes whose double-quoted values are removed, in this order.
    stripped_attributes = (
        'style', 'height', 'width', 'colwidth', 'colheight',
        'rowwidth', 'rowheight', 'align', 'class',
    )

    def simplify_markup(html_content):
        """Rewrite header, math and span tags using BeautifulSoup."""
        soup = BeautifulSoup(html_content, 'html.parser')
        for th in soup.find_all('th'):
            th.name = 'td'
        for thead in soup.find_all('thead'):
            thead.unwrap()  # unwrap() removes the tag but keeps its contents
        for math_tag in soup.find_all('math'):
            alttext = math_tag.get('alttext', '')
            # Check the raw attribute *before* wrapping it in '$': a wrapped
            # string is never empty, which previously turned every <math>
            # without alttext into a bare '$$' and dropped its content.
            if alttext:
                math_tag.replace_with(f'${alttext}$')
        for span in soup.find_all('span'):
            span.unwrap()
        return str(soup)

    def process_table_html(md_i):
        """Return ``(table, table_without_spaces)`` for the markup *md_i*.

        Both elements are ``''`` when no ``<table`` substring is present.
        """
        if '<table' not in md_i.replace(' ', ''):
            return '', ''

        table_res = html.unescape(simplify_markup(md_i)).replace('\n', '')
        table_res = unicodedata.normalize('NFKC', table_res).strip()
        # Greedy match: keeps everything between the first opening and the
        # last closing table tag.
        tables = re.findall(
            r'<table\b[^>]*>(.*)</table>', table_res, re.DOTALL | re.IGNORECASE
        )
        table_res = ''.join(tables)
        for attribute in stripped_attributes:
            table_res = re.sub(rf'( {attribute}=".*?")', '', table_res)
        # A span of 1 is the default, so it carries no information.
        table_res = table_res.replace(' rowspan="1"', '')
        table_res = table_res.replace(' colspan="1"', '')
        table_res = re.sub(r'</?tbody>', '', table_res)
        table_res = re.sub(r'\s+', ' ', table_res)

        table_res_no_space = (
            '<html><body><table>' + table_res.replace(' ', '')
            + '</table></body></html>'
        )
        # Removing every space also glued attributes to their tag names;
        # restore the separator for the attributes that are kept.
        for attribute in ('colspan="', 'rowspan="', 'border="'):
            table_res_no_space = table_res_no_space.replace(
                attribute, ' ' + attribute
            )
        table_res = '<html><body><table>' + table_res + '</table></body></html>'
        return table_res, table_res_no_space

    def clean_table(input_str, flag=True):
        """Drop inline wrapper tags (when *flag* is true) and ``<colgroup>``."""
        if flag:
            input_str = input_str.replace('<sup>', '').replace('</sup>', '')
            input_str = input_str.replace('<sub>', '').replace('</sub>', '')
            input_str = input_str.replace('<span>', '').replace('</span>', '')
            input_str = input_str.replace('<div>', '').replace('</div>', '')
            input_str = input_str.replace('<p>', '').replace('</p>', '')
            input_str = input_str.replace('<spandata-span-identity="">', '')
        return re.sub(r'<colgroup>.*?</colgroup>', '', input_str)

    def process_formula(input_str):
        """Replace ``data-latex`` span paragraphs with LaTeX delimiters."""
        # Inline math formulas -> \( ... \)
        inline_pattern = r'<p><span\s+data-latex="([^"]*)"\s+data-type="inline-math"></span></p>'
        input_str = re.sub(inline_pattern, r'\(\1\)', input_str)
        # Block-level (display) math formulas, if any -> \[ ... \]
        block_pattern = r'<p><span\s+data-latex="([^"]*)"\s+data-type="display-math"></span></p>'
        input_str = re.sub(block_pattern, r'\[\1\]', input_str)
        return input_str

    def process_uline(input_str):
        """Normalize underline tags."""
        # NOTE(review): as written both replacements map a string to itself,
        # so this is a no-op; confirm the intended search strings upstream.
        return input_str.replace('<u>', '<u>').replace('</u>', '</u>')

    text = process_formula(text)
    text = process_uline(text)
    norm_text, _ = process_table_html(text)
    norm_text = clean_table(norm_text)
    return norm_text.replace('> ', '>').replace(" </td>", "</td>")