jerpint committed on
Commit
bed2402
·
unverified ·
2 Parent(s): 21acbea 0ff46a1

Merge pull request #1 from jerpint/parse_docs

Browse files
Files changed (1) hide show
  1. docparser.py +65 -0
docparser.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import os
3
+ import pickle
4
+
5
+ from bs4 import BeautifulSoup
6
+
7
+
8
+ def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]:
9
+ """Parse all HTML files in `root_dir`, and extract all sections.
10
+
11
+ Sections are broken into subsections if they are longer than `max_section_length`.
12
+ Sections correspond to h2 HTML tags, and move on to h3 then h4 if needed.
13
+ """
14
+ files = glob.glob("*.html", root_dir=root_dir)
15
+
16
+ selector = "section > section"
17
+
18
+ # Recurse until sections are small enough
19
+ def get_all_subsections(soup, selector: str) -> list[str]:
20
+ found = soup.select(selector)
21
+ data = [x.text.split(";")[-1].strip() for x in found]
22
+
23
+ sections = []
24
+ for i, section in enumerate(data):
25
+ if len(section) > max_section_length:
26
+ sections.extend(get_all_subsections(found[i], selector + " > section"))
27
+ else:
28
+ sections.append(section)
29
+
30
+ return sections
31
+
32
+ sections = []
33
+ for file in files:
34
+ filepath = os.path.join(root_dir, file)
35
+ with open(filepath, "r") as file:
36
+ source = file.read()
37
+
38
+ soup = BeautifulSoup(source, "html.parser")
39
+ sections.extend(get_all_subsections(soup, selector))
40
+
41
+ return sections
42
+
43
+
44
def write_sections(filepath: str, sections: list[str]) -> None:
    """Serialize `sections` with pickle into the file at `filepath`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(filepath, "wb") as pickle_file:
        pickle.dump(sections, pickle_file)
47
+
48
+
49
def read_sections(filepath: str) -> list[str]:
    """Load and return the pickled object stored in the file at `filepath`.

    NOTE(review): unpickling can execute arbitrary code; only use this on
    files produced by a trusted source such as `write_sections`.
    """
    with open(filepath, "rb") as pickle_file:
        return pickle.load(pickle_file)
54
+
55
+
56
if __name__ == "__main__":
    import sys

    # Directory holding the rendered HTML docs. The original hard-coded path
    # stays as the default; pass a directory as the first command-line
    # argument to run the script on another machine or docs build.
    default_root_dir = "/home/hadrien/perso/mila-docs/output/"
    root_dir = sys.argv[1] if len(sys.argv) > 1 else default_root_dir
    save_filepath = os.path.join(root_dir, "sections.pkl")

    # How to write
    sections = get_all_sections(root_dir)
    write_sections(save_filepath, sections)

    # How to load
    sections = read_sections(save_filepath)