hbertrand committed on
Commit 0ff46a1 · 1 Parent(s): 49b1fb3
Files changed (1)
  1. docparser.py +16 -17
docparser.py CHANGED
@@ -6,57 +6,56 @@ from bs4 import BeautifulSoup
 
 
 def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]:
-    '''Parse all HTML files in `root_dir`, and extract all sections.
-
+    """Parse all HTML files in `root_dir`, and extract all sections.
+
     Sections are broken into subsections if they are longer than `max_section_length`.
     Sections correspond to h2 HTML tags, and move on to h3 then h4 if needed.
-    '''
-    files = glob.glob('*.html', root_dir=root_dir)
+    """
+    files = glob.glob("*.html", root_dir=root_dir)
 
-    selector = 'section > section'
+    selector = "section > section"
 
     # Recurse until sections are small enough
     def get_all_subsections(soup, selector: str) -> list[str]:
         found = soup.select(selector)
-        data = [x.text.split(';')[-1].strip() for x in found]
+        data = [x.text.split(";")[-1].strip() for x in found]
 
         sections = []
         for i, section in enumerate(data):
             if len(section) > max_section_length:
-                sections.extend(get_all_subsections(found[i], selector + ' > section'))
+                sections.extend(get_all_subsections(found[i], selector + " > section"))
             else:
                 sections.append(section)
-
-        return sections
 
+        return sections
 
     sections = []
     for file in files:
         filepath = os.path.join(root_dir, file)
-        with open(filepath, 'r') as file:
+        with open(filepath, "r") as file:
             source = file.read()
 
-        soup = BeautifulSoup(source, 'html.parser')
+        soup = BeautifulSoup(source, "html.parser")
         sections.extend(get_all_subsections(soup, selector))
-
+
     return sections
 
 
 def write_sections(filepath: str, sections: list[str]):
-    with open(filepath, 'wb') as f:
+    with open(filepath, "wb") as f:
         pickle.dump(sections, f)
 
 
 def read_sections(filepath: str) -> list[str]:
-    with open (filepath, 'rb') as fp:
+    with open(filepath, "rb") as fp:
         sections = pickle.load(fp)
-
+
     return sections
 
 
 if __name__ == "__main__":
-    root_dir = '/home/hadrien/perso/mila-docs/output/'
-    save_filepath = os.path.join(root_dir, 'sections.pkl')
+    root_dir = "/home/hadrien/perso/mila-docs/output/"
+    save_filepath = os.path.join(root_dir, "sections.pkl")
 
     # How to write
     sections = get_all_sections(root_dir)
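
The `__main__` block is truncated at the end of the hunk. For reference, a minimal sketch of how the three helpers chain together into a write/read round trip (the module name `docparser` and the final `assert` are illustrative assumptions, not part of the commit):

# Hypothetical usage sketch, not part of the commit: extract sections once,
# cache them with pickle, and reload them later instead of re-parsing HTML.
import os

from docparser import get_all_sections, read_sections, write_sections  # assumed module name

root_dir = "/home/hadrien/perso/mila-docs/output/"
save_filepath = os.path.join(root_dir, "sections.pkl")

# Parse every *.html file under root_dir; any section longer than
# 3000 characters is split recursively along nested <section> tags.
sections = get_all_sections(root_dir)
write_sections(save_filepath, sections)

# Later: load the cached sections without re-parsing the HTML.
cached = read_sections(save_filepath)
assert cached == sections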