wufan committed on
Commit
bc11e8d
·
verified ·
1 Parent(s): e2dd7e6

Create norm_html.py

Browse files
Files changed (1) hide show
  1. norm_html.py +99 -0
norm_html.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import html
3
+ import uuid
4
+ import subprocess
5
+ import unicodedata
6
+ from bs4 import BeautifulSoup
7
+
8
+
9
def normalized_html_table(text: str) -> str:
    r"""Normalize the HTML table markup contained in *text*.

    The pipeline runs in this order:

    1. ``<p><span data-latex="..." data-type="inline-math"></span></p>``
       placeholders become ``\(...\)``; the ``display-math`` variant
       becomes ``\[...\]``.
    2. Escaped underline tags (``&lt;u&gt;`` / ``&lt;/u&gt;``) are turned
       back into real ``<u>`` / ``</u>`` tags.
    3. If the text contains ``<table``, it is parsed with BeautifulSoup:
       ``<th>`` is renamed to ``<td>``, ``<thead>`` and ``<span>`` are
       unwrapped, and ``<math alttext="X">`` is replaced by ``$X$``.  The
       result is HTML-unescaped, NFKC-normalized, reduced to the table
       interior, stripped of presentational attributes and ``<tbody>``
       tags, whitespace-collapsed, and wrapped in
       ``<html><body><table>...</table></body></html>``.
    4. Bare ``<sup>``, ``<sub>``, ``<span>``, ``<div>``, ``<p>`` tags and
       ``<colgroup>...</colgroup>`` sections are removed.
    5. A space directly after ``>`` and a space directly before ``</td>``
       are removed.

    Args:
        text: Markdown/HTML string that may contain a ``<table>``.

    Returns:
        The normalized table string, or ``""`` when *text* contains no
        ``<table``.
    """

    # Attributes whose ` name="..."` occurrences are stripped, in order.
    stripped_attrs = (
        'style', 'height', 'width', 'colwidth', 'colheight',
        'rowwidth', 'rowheight', 'align', 'class',
    )

    def _flatten_markup(html_content):
        """Simplify table-related tags with BeautifulSoup; return HTML text."""
        soup = BeautifulSoup(html_content, 'html.parser')
        for th in soup.find_all('th'):
            th.name = 'td'
        for thead in soup.find_all('thead'):
            # unwrap() removes the tag itself but keeps its children.
            thead.unwrap()
        for math_tag in soup.find_all('math'):
            alttext = math_tag.get('alttext', '')
            # Test the raw attribute: the previous code wrapped it in `$...$`
            # first, so the check was always true and tags without alttext
            # were replaced by a bare `$$`.
            if alttext:
                math_tag.replace_with(f'${alttext}$')
        for span in soup.find_all('span'):
            span.unwrap()
        return str(soup)

    def _extract_table(md_i):
        """Return the canonical wrapped table, or '' if there is no table."""
        if '<table' not in md_i.replace(" ", "").replace("'", '"'):
            return ''

        table_res = html.unescape(_flatten_markup(md_i)).replace('\n', '')
        table_res = unicodedata.normalize('NFKC', table_res).strip()

        # Greedy on purpose (kept from the original): with several tables,
        # everything between the first opening tag and the last closing tag
        # is captured as a single string.
        pattern = r'<table\b[^>]*>(.*)</table>'
        tables = re.findall(pattern, table_res, re.DOTALL | re.IGNORECASE)
        table_res = ''.join(tables)

        for attr in stripped_attrs:
            table_res = re.sub(f'( {attr}=".*?")', "", table_res)
        # A span of 1 is the default, so drop it for a canonical form.
        table_res = table_res.replace(' rowspan="1"', "")
        table_res = table_res.replace(' colspan="1"', "")
        table_res = re.sub('</?tbody>', "", table_res)

        table_res = re.sub(r'\s+', " ", table_res)
        return '<html><body><table>' + table_res + '</table></body></html>'

    def _strip_inline_wrappers(input_str):
        """Remove bare inline/wrapper tags and <colgroup> sections."""
        for tag in ('sup', 'sub', 'span', 'div', 'p'):
            input_str = input_str.replace(f'<{tag}>', '').replace(f'</{tag}>', '')
        input_str = input_str.replace('<spandata-span-identity="">', '')
        return re.sub('<colgroup>.*?</colgroup>', '', input_str)

    def _convert_formulas(input_str):
        """Rewrite data-latex span placeholders as LaTeX delimiters."""
        # Inline math formulas.
        inline_pattern = r'<p><span\s+data-latex="([^"]*)"\s+data-type="inline-math"></span></p>'
        input_str = re.sub(inline_pattern, r'\(\1\)', input_str)

        # Block-level (display) math formulas, if any.
        block_pattern = r'<p><span\s+data-latex="([^"]*)"\s+data-type="display-math"></span></p>'
        return re.sub(block_pattern, r'\[\1\]', input_str)

    def _restore_underline(input_str):
        """Turn HTML-escaped <u> tags back into real tags."""
        return input_str.replace('&lt;u&gt;', '<u>').replace('&lt;/u&gt;', '</u>')

    text = _convert_formulas(text)
    text = _restore_underline(text)
    norm_text = _strip_inline_wrappers(_extract_table(text))
    return norm_text.replace('> ', '>').replace(" </td>", "</td>")