File size: 1,860 Bytes
49b1fb3
 
 
 
 
 
 
 
0ff46a1
 
49b1fb3
 
0ff46a1
 
49b1fb3
0ff46a1
49b1fb3
 
 
 
0ff46a1
49b1fb3
 
 
 
0ff46a1
49b1fb3
 
 
0ff46a1
49b1fb3
 
 
 
0ff46a1
49b1fb3
 
0ff46a1
49b1fb3
0ff46a1
49b1fb3
 
 
 
0ff46a1
49b1fb3
 
 
 
0ff46a1
49b1fb3
0ff46a1
49b1fb3
 
 
 
0ff46a1
 
49b1fb3
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import glob
import os
import pickle

from bs4 import BeautifulSoup


def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]:
    """Parse all HTML files in `root_dir`, and extract all sections.

    Sections are broken into subsections if they are longer than `max_section_length`.
    Sections correspond to h2 HTML tags, and move on to h3 then h4 if needed.
    """
    files = glob.glob("*.html", root_dir=root_dir)

    selector = "section > section"

    # Recurse until sections are small enough
    def get_all_subsections(soup, selector: str) -> list[str]:
        found = soup.select(selector)
        data = [x.text.split(";")[-1].strip() for x in found]

        sections = []
        for i, section in enumerate(data):
            if len(section) > max_section_length:
                sections.extend(get_all_subsections(found[i], selector + " > section"))
            else:
                sections.append(section)

        return sections

    sections = []
    for file in files:
        filepath = os.path.join(root_dir, file)
        with open(filepath, "r") as file:
            source = file.read()

        soup = BeautifulSoup(source, "html.parser")
        sections.extend(get_all_subsections(soup, selector))

    return sections


def write_sections(filepath: str, sections: list[str]):
    """Pickle `sections` into the file at `filepath`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(filepath, "wb") as out_file:
        pickle.Pickler(out_file).dump(sections)


def read_sections(filepath: str) -> list[str]:
    """Load and return the object pickled in the file at `filepath`.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(filepath, "rb") as in_file:
        return pickle.Unpickler(in_file).load()


if __name__ == "__main__":
    import sys

    # The directory holding the generated HTML docs can be passed as the first
    # command-line argument; without one, the original default is used.
    root_dir = sys.argv[1] if len(sys.argv) > 1 else "/home/hadrien/perso/mila-docs/output/"
    save_filepath = os.path.join(root_dir, "sections.pkl")

    # How to write
    sections = get_all_sections(root_dir)
    write_sections(save_filepath, sections)

    # How to load
    sections = read_sections(save_filepath)