hbertrand committed on
Commit
49b1fb3
·
1 Parent(s): 21acbea
Files changed (1) hide show
  1. docparser.py +66 -0
docparser.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import os
3
+ import pickle
4
+
5
+ from bs4 import BeautifulSoup
6
+
7
+
8
+ def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]:
9
+ '''Parse all HTML files in `root_dir`, and extract all sections.
10
+
11
+ Sections are broken into subsections if they are longer than `max_section_length`.
12
+ Sections correspond to h2 HTML tags, and move on to h3 then h4 if needed.
13
+ '''
14
+ files = glob.glob('*.html', root_dir=root_dir)
15
+
16
+ selector = 'section > section'
17
+
18
+ # Recurse until sections are small enough
19
+ def get_all_subsections(soup, selector: str) -> list[str]:
20
+ found = soup.select(selector)
21
+ data = [x.text.split(';')[-1].strip() for x in found]
22
+
23
+ sections = []
24
+ for i, section in enumerate(data):
25
+ if len(section) > max_section_length:
26
+ sections.extend(get_all_subsections(found[i], selector + ' > section'))
27
+ else:
28
+ sections.append(section)
29
+
30
+ return sections
31
+
32
+
33
+ sections = []
34
+ for file in files:
35
+ filepath = os.path.join(root_dir, file)
36
+ with open(filepath, 'r') as file:
37
+ source = file.read()
38
+
39
+ soup = BeautifulSoup(source, 'html.parser')
40
+ sections.extend(get_all_subsections(soup, selector))
41
+
42
+ return sections
43
+
44
+
45
def write_sections(filepath: str, sections: list[str]):
    '''Pickle `sections` into the file at `filepath`.

    The file is opened in binary write mode, so existing content is replaced.
    '''
    with open(filepath, mode='wb') as pickle_file:
        pickle.dump(sections, pickle_file)
48
+
49
+
50
def read_sections(filepath: str) -> list[str]:
    '''Load and return the object pickled in the file at `filepath`.

    The file is unpickled as-is, so it must come from a trusted source
    (for example a file produced by `write_sections`).
    '''
    with open(filepath, mode='rb') as pickle_file:
        return pickle.load(pickle_file)
55
+
56
+
57
if __name__ == "__main__":
    import sys

    # Directory holding the HTML files. An optional first command-line
    # argument overrides the original hard-coded default path.
    root_dir = sys.argv[1] if len(sys.argv) > 1 else '/home/hadrien/perso/mila-docs/output/'
    save_filepath = os.path.join(root_dir, 'sections.pkl')

    # How to write
    sections = get_all_sections(root_dir)
    write_sections(save_filepath, sections)

    # How to load
    sections = read_sections(save_filepath)