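"""Parse the Mila documentation HTML pages into per-heading sections and precompute
OpenAI embeddings for each section.

The parsed sections are written to a CSV file, then re-read to compute token counts
and embeddings, which are saved to a second CSV file.
"""
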
import glob
import math
import os

import pandas as pd
import tiktoken
from bs4 import BeautifulSoup
from openai.embeddings_utils import get_embedding


EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-ada-002


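# Root URL of the documentation site; section anchors are appended to it to build absolute links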
BASE_URL = "https://docs.mila.quebec/"


def get_all_documents(root_dir: str, max_section_length: int = 3000) -> pd.DataFrame:
    """Parse all HTML files in `root_dir`, and extract all sections.

    Sections longer than `max_section_length` are split into equally sized chunks.
    Sections correspond to h2 HTML headings, moving on to h3 and then h4 when needed.
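
    Returns a DataFrame with one row per section and the columns `name`, `url` and `text`.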
    """
    files = glob.glob("*.html", root_dir=root_dir)

    def get_all_subsections(soup: BeautifulSoup) -> tuple[list[str], list[str], list[str]]:
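        # Each "headerlink" anchor is the permalink attached to a heading, so every match
        # corresponds to one section; its grandparent is the enclosing section element.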
        found = soup.find_all('a', href=True, class_="headerlink")

        sections = []
        urls = []
        names = []
        for section_found in found:
            section_soup = section_found.parent.parent
            section_href = section_soup.find_all('a', href=True, class_="headerlink")

            # If the section has subsections, keep only the text before the first subsection
            if len(section_href) > 1:
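                # previous_siblings yields nodes in reverse document order, so re-reverse
                # them before joining, then strip the leading newline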
                section_siblings = section_soup.section.previous_siblings
                section = [sibling.text for sibling in section_siblings]
                section = ''.join(section[::-1])[1:]
            else:
                section = section_soup.text[1:]

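            # The href is the in-page anchor for this section; the heading text ends with
            # a trailing permalink character, which is stripped off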
            url = section_found['href']
            name = section_found.parent.text[:-1]

            # If text is too long, split into chunks of equal sizes
            if len(section) > max_section_length:
                n_chunks = math.ceil(len(section) / max_section_length)
                separator_index = math.ceil(len(section) / n_chunks)  # ceil so the last chunk keeps trailing characters

                section_chunks = [section[separator_index * i: separator_index * (i + 1)] for i in range(n_chunks)]
                url_chunks = [url] * n_chunks
                name_chunks = [name] * n_chunks

                sections.extend(section_chunks)
                urls.extend(url_chunks)
                names.extend(name_chunks)
            else:
                sections.append(section)
                urls.append(url)
                names.append(name)

        return sections, urls, names

    sections = []
    urls = []
    names = []
    for file in files:
        filepath = os.path.join(root_dir, file)
        with open(filepath, "r") as f:
            source = f.read()

        soup = BeautifulSoup(source, "html.parser")
        sections_file, urls_file, names_file = get_all_subsections(soup)
        sections.extend(sections_file)

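        # Prepend the site root and the page's filename to each section anchor to form absolute URLs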
        urls_file = [BASE_URL + os.path.basename(filepath) + url for url in urls_file]
        urls.extend(urls_file)

        names.extend(names_file)

    documents_df = pd.DataFrame.from_dict({
        'name': names,
        'url': urls,
        'text': sections
    })

    return documents_df


def write_documents(filepath: str, documents_df: pd.DataFrame):
    documents_df.to_csv(filepath, index=False)


def read_documents(filepath: str) -> pd.DataFrame:
    return pd.read_csv(filepath)


def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
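    # Count how many tokens each section occupies under the embedding model's tokenizer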
    encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
    df["n_tokens"] = df.text.apply(lambda x: len(encoding.encode(x)))
    return df


def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
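    # Makes one OpenAI API call per section; the OpenAI client must be configured with an API key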
    df["embedding"] = df.text.apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
    return df


def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:
    # Get all documents and precompute their embeddings
    df = read_documents(filepath)
    df = compute_n_tokens(df)
    df = precompute_embeddings(df)
    write_documents(output_csv, df)
    return df


if __name__ == "__main__":
    root_dir = "/home/hadrien/perso/mila-docs/output/"
    save_filepath = "data/documents.csv"

    # Parse the documentation and write the sections to CSV
    documents_df = get_all_documents(root_dir)
    write_documents(save_filepath, documents_df)

    # Reload the sections from disk
    documents_df = read_documents(save_filepath)

    # Precompute token counts and embeddings for every section and save them to a separate CSV
    df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")