import glob
import math
import os

import pandas as pd
import tiktoken
from bs4 import BeautifulSoup
from openai.embeddings_utils import get_embedding
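# Note: `openai.embeddings_utils` only ships with openai<1.0; the helper was
# removed in the 1.0 release (see the sketch after `precompute_embeddings`
# below for the newer client).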
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-ada-002

BASE_URL = "https://docs.mila.quebec/"


def get_all_documents(root_dir: str, max_section_length: int = 3000) -> pd.DataFrame:
    """Parse all HTML files in `root_dir` and extract all sections.

    Sections are broken into subsections if they are longer than `max_section_length`.
    Sections correspond to h2 HTML tags, and move on to h3 then h4 if needed.
    """
    # NOTE: the `root_dir` argument of glob.glob requires Python >= 3.10
    files = glob.glob("*.html", root_dir=root_dir)

    def get_all_subsections(soup: BeautifulSoup) -> tuple[list[str], list[str], list[str]]:
        # Each section heading carries an <a class="headerlink"> anchor pointing at itself
        found = soup.find_all('a', href=True, class_="headerlink")
        sections = []
        urls = []
        names = []
        for section_found in found:
            section_soup = section_found.parent.parent
            section_href = section_soup.find_all('a', href=True, class_="headerlink")

            # If the section has subsections, keep only the part before the first subsection
            if len(section_href) > 1:
                section_siblings = section_soup.section.previous_siblings
                # previous_siblings iterates in reverse document order, so restore it
                section = [sibling.text for sibling in section_siblings]
                section = ''.join(section[::-1])[1:]
            else:
                section = section_soup.text[1:]

            url = section_found['href']
            name = section_found.parent.text[:-1]  # drop the trailing headerlink character

            # If the text is too long, split it into chunks of roughly equal size
            if len(section) > max_section_length:
                n_chunks = math.ceil(len(section) / float(max_section_length))
                # Use ceil rather than floor so no trailing characters are dropped
                # when len(section) is not an exact multiple of n_chunks
                separator_index = math.ceil(len(section) / n_chunks)
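                # Illustrative arithmetic (hypothetical numbers): a 7000-character
                # section with max_section_length=3000 gives n_chunks=3 and a chunk
                # size of ceil(7000 / 3) = 2334, so the slices below cover
                # [0:2334], [2334:4668] and [4668:7000].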
                section_chunks = [section[separator_index * i: separator_index * (i + 1)] for i in range(n_chunks)]
                url_chunks = [url] * n_chunks
                name_chunks = [name] * n_chunks

                sections.extend(section_chunks)
                urls.extend(url_chunks)
                names.extend(name_chunks)
            else:
                sections.append(section)
                urls.append(url)
                names.append(name)

        return sections, urls, names

    sections = []
    urls = []
    names = []
    for file in files:
        filepath = os.path.join(root_dir, file)
        with open(filepath, "r") as f:  # don't shadow the loop variable
            source = f.read()

        soup = BeautifulSoup(source, "html.parser")
        sections_file, urls_file, names_file = get_all_subsections(soup)
        sections.extend(sections_file)

        # Anchor URLs are relative to the page, so prepend the page's full URL
        urls_file = [BASE_URL + file + url for url in urls_file]
        urls.extend(urls_file)

        names.extend(names_file)

    documents_df = pd.DataFrame.from_dict({
        'name': names,
        'url': urls,
        'text': sections
    })

    return documents_df
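
# `get_all_documents` yields one row per (sub)section with columns
# ['name', 'url', 'text']; a hypothetical row (illustrative, not actual output):
#   name: "Quick Start"
#   url:  "https://docs.mila.quebec/Userguide.html#quick-start"
#   text: "This section explains ..."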


def write_documents(filepath: str, documents_df: pd.DataFrame):
    documents_df.to_csv(filepath, index=False)


def read_documents(filepath: str) -> pd.DataFrame:
    return pd.read_csv(filepath)


def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
    encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
    df["n_tokens"] = df.text.apply(lambda x: len(encoding.encode(x)))
    return df
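
# `n_tokens` is useful for filtering: text-embedding-ada-002 accepts at most
# 8191 input tokens, so any longer section would need further splitting before
# `precompute_embeddings` is called.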


def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    df["embedding"] = df.text.apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
    return df
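
# A minimal sketch of the same step for openai>=1.0, where `embeddings_utils`
# no longer exists (kept commented out, since this file targets the pre-1.0 API):
#
#   from openai import OpenAI
#   client = OpenAI()
#
#   def get_embedding_v1(text: str) -> list[float]:
#       response = client.embeddings.create(model=EMBEDDING_MODEL, input=text)
#       return response.data[0].embedding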


def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:
    # Get all documents and precompute their embeddings
    df = read_documents(filepath)
    df = compute_n_tokens(df)
    df = precompute_embeddings(df)
    write_documents(output_csv, df)
    return df
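
# Caveat: `to_csv` serializes the embedding lists as plain strings, so reading
# the output CSV back requires parsing them (e.g. with `ast.literal_eval`)
# before they can be used as vectors.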


if __name__ == "__main__":
    root_dir = "/home/hadrien/perso/mila-docs/output/"
    save_filepath = "data/documents.csv"

    # How to write
    documents_df = get_all_documents(root_dir)
    write_documents(save_filepath, documents_df)

    # How to load
    documents_df = read_documents(save_filepath)

    # Precompute the document embeddings
    df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")