Spaces:

wufan
/

Table_HTML_Render

Running

App Files Files Community

Table_HTML_Render / norm_html.py

wufan

Create norm_html.py

bc11e8d verified 19 days ago

raw

history blame contribute delete

4.64 kB

	import re
	import html
	import uuid
	import subprocess
	import unicodedata
	from bs4 import BeautifulSoup


	def normalized_html_table(text):
	    """Normalize HTML table markup found in ``text`` into a canonical string.

	    Processing steps, in order:

	    1. Rewrite ``<p><span data-latex=... data-type="inline-math">`` and
	       ``display-math`` placeholders as ``$...$`` / ``\\[...\\]``.
	    2. Parse the markup, turn ``<th>`` into ``<td>``, unwrap ``<thead>`` and
	       ``<span>``, and replace ``<math alttext="...">`` with ``$alttext$``.
	    3. Unescape HTML entities, drop newlines, apply NFKC normalization and
	       keep only what lies between ``<table ...>`` and ``</table>``.
	    4. Strip layout-only attributes, ``rowspan="1"`` / ``colspan="1"`` and
	       ``<tbody>`` tags, then collapse whitespace runs.
	    5. Wrap the result in ``<html><body><table>...</table></body></html>``
	       and remove a fixed set of inline tags and ``<colgroup>`` sections.

	    Args:
	        text: Markup that may contain an HTML table.

	    Returns:
	        The normalized table document as a string, or ``''`` when ``text``
	        contains no ``<table`` tag.
	    """
	    # Attributes that carry presentation only. They are removed one after
	    # another, in this order, each with its own substitution pass.
	    layout_attrs = (
	        'style', 'height', 'width', 'colwidth', 'colheight',
	        'rowwidth', 'rowheight', 'align', 'class',
	    )

	    def rewrite_tags(html_content):
	        """Apply the tag-level rewrites that need a real HTML parser."""
	        soup = BeautifulSoup(html_content, 'html.parser')
	        for th in soup.find_all('th'):
	            th.name = 'td'
	        for thead in soup.find_all('thead'):
	            # unwrap() removes the tag itself but keeps its children.
	            thead.unwrap()
	        for math_tag in soup.find_all('math'):
	            alttext = math_tag.get('alttext', '')
	            # Test before adding the delimiters: a tag without alttext is
	            # left untouched instead of being replaced by a bare "$$".
	            if alttext:
	                math_tag.replace_with(f'${alttext}$')
	        for span in soup.find_all('span'):
	            span.unwrap()
	        return str(soup)

	    def process_table_html(md_i):
	        """Return the wrapped, attribute-stripped table, or '' if none."""
	        if '<table' not in md_i.replace(" ", "").replace("'", '"'):
	            return ''

	        table_res = html.unescape(rewrite_tags(md_i)).replace('\n', '')
	        table_res = unicodedata.normalize('NFKC', table_res).strip()

	        # The capture is greedy: it runs from the first <table ...> to the
	        # last </table>, so with several tables the markup between them is
	        # kept as part of the single captured body.
	        pattern = r'<table\b[^>]*>(.*)</table>'
	        tables = re.findall(pattern, table_res, re.DOTALL | re.IGNORECASE)
	        table_res = ''.join(tables)

	        for attr in layout_attrs:
	            table_res = re.sub(f'( {attr}=".*?")', "", table_res)
	        # A span of 1 is the default, so spelling it out adds nothing.
	        table_res = re.sub('( rowspan="1")', "", table_res)
	        table_res = re.sub('( colspan="1")', "", table_res)
	        table_res = re.sub('</?tbody>', "", table_res)
	        table_res = re.sub(r'\s+', " ", table_res)

	        return '<html><body><table>' + table_res + '</table></body></html>'

	    def clean_table(input_str, flag=True):
	        """Remove inline wrapper tags and <colgroup> sections when ``flag``."""
	        if flag:
	            for tag in ('sup', 'sub', 'span', 'div', 'p'):
	                input_str = input_str.replace(f'<{tag}>', '').replace(f'</{tag}>', '')
	            input_str = input_str.replace('<spandata-span-identity="">', '')
	            input_str = re.sub('<colgroup>.*?</colgroup>', '', input_str)
	        return input_str

	    def process_formula(input_str):
	        """Turn data-latex span placeholders into LaTeX delimiters."""
	        # Inline math becomes $...$.
	        inline_pattern = r'<p><span\s+data-latex="([^"]*)"\s+data-type="inline-math"></span></p>'
	        input_str = re.sub(inline_pattern, r'$\1$', input_str)

	        # Display (block-level) math, if present, becomes \[...\].
	        block_pattern = r'<p><span\s+data-latex="([^"]*)"\s+data-type="display-math"></span></p>'
	        input_str = re.sub(block_pattern, r'\[\1\]', input_str)
	        return input_str

	    def process_uline(input_str):
	        """Rewrite underline tags."""
	        # NOTE(review): both replacements map a tag to itself, so this is a
	        # no-op as written; the intended source tag may have been lost.
	        # TODO confirm against the upstream file.
	        return input_str.replace('<u>', '<u>').replace('</u>', '</u>')

	    text = process_formula(text)
	    text = process_uline(text)
	    norm_text = clean_table(process_table_html(text))
	    return norm_text.replace('> ', '>').replace(" </td>", "</td>")