Spaces:

jerpint
/

buster-dev

Runtime error

File size: 1,860 Bytes

49b1fb3
 
 
 
 
 
 
 
0ff46a1
 
49b1fb3
 
0ff46a1
 
49b1fb3
0ff46a1
49b1fb3
 
 
 
0ff46a1
49b1fb3
 
 
 
0ff46a1
49b1fb3
 
 
0ff46a1
49b1fb3
 
 
 
0ff46a1
49b1fb3
 
0ff46a1
49b1fb3
0ff46a1
49b1fb3
 
 
 
0ff46a1
49b1fb3
 
 
 
0ff46a1
49b1fb3
0ff46a1
49b1fb3
 
 
 
0ff46a1
 
49b1fb3

import glob
import os
import pickle

from bs4 import BeautifulSoup


def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]:
    """Parse all HTML files in `root_dir`, and extract all sections.

    Sections are broken into subsections if they are longer than `max_section_length`.
    Sections correspond to h2 HTML tags, and move on to h3 then h4 if needed.
    """
    files = glob.glob("*.html", root_dir=root_dir)

    selector = "section > section"

    # Recurse until sections are small enough
    def get_all_subsections(soup, selector: str) -> list[str]:
        found = soup.select(selector)
        data = [x.text.split(";")[-1].strip() for x in found]

        sections = []
        for i, section in enumerate(data):
            if len(section) > max_section_length:
                sections.extend(get_all_subsections(found[i], selector + " > section"))
            else:
                sections.append(section)

        return sections

    sections = []
    for file in files:
        filepath = os.path.join(root_dir, file)
        with open(filepath, "r") as file:
            source = file.read()

        soup = BeautifulSoup(source, "html.parser")
        sections.extend(get_all_subsections(soup, selector))

    return sections


def write_sections(filepath: str, sections: list[str]) -> None:
    """Serialize `sections` with pickle and store them at `filepath`.

    The file is opened in binary write mode, so any existing file at that
    location is overwritten.
    """
    with open(filepath, "wb") as pickle_file:
        pickle.dump(sections, pickle_file)


def read_sections(filepath: str) -> list[str]:
    """Load and return the pickled sections stored at `filepath`.

    NOTE: unpickling can execute arbitrary code, so this must only be used on
    files that come from a trusted source (e.g. written by `write_sections`).
    """
    with open(filepath, "rb") as pickle_file:
        return pickle.load(pickle_file)


if __name__ == "__main__":
    # Example usage: extract the sections from a local documentation build,
    # store them as a pickle inside that same directory, then load them back.
    root_dir = "/home/hadrien/perso/mila-docs/output/"
    save_filepath = os.path.join(root_dir, "sections.pkl")

    # Writing: parse every HTML file and persist the result
    write_sections(save_filepath, get_all_sections(root_dir))

    # Loading: read the persisted sections back from disk
    sections = read_sections(save_filepath)