import glob
import os
import pickle

import pandas as pd
import tiktoken
from bs4 import BeautifulSoup
from openai.embeddings_utils import cosine_similarity, get_embedding

EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-ada-002

BASE_URL = "https://docs.mila.quebec/"


def get_all_sections(root_dir: str, max_section_length: int = 3000) -> tuple[list[str], list[str]]:
    """Parse all HTML files in `root_dir`, and extract all sections.

    Sections are broken into subsections if they are longer than `max_section_length`.
    Sections correspond to h2 HTML tags, and move on to h3 then h4 if needed.
    """
    files = glob.glob("*.html", root_dir=root_dir)

    # Recurse until sections are small enough
    def get_all_subsections(soup: BeautifulSoup, level: int) -> tuple[list[str], list[str]]:
        if level >= 5:
            return [], []

        found = soup.find_all("a", href=True, class_="headerlink")

        sections = []
        urls = []
        for section_found in found:
            section_soup = section_found.parent.parent
            section = section_soup.text
            url = section_found["href"]

            if len(section) > max_section_length:
                s, u = get_all_subsections(section_soup, level + 1)
                sections.extend(s)
                urls.extend(u)
            else:
                sections.append(section)
                urls.append(url)

        return sections, urls

    sections = []
    urls = []
    for filename in files:
        filepath = os.path.join(root_dir, filename)
        with open(filepath, "r") as f:
            source = f.read()

        soup = BeautifulSoup(source, "html.parser")
        sections_file, urls_file = get_all_subsections(soup, 2)
        sections.extend(sections_file)

        # Section anchors are relative (e.g. "#section-name"); prepend the page URL.
        urls_file = [BASE_URL + os.path.basename(filename) + url for url in urls_file]
        urls.extend(urls_file)

    return sections, urls


def write_sections(filepath: str, sections: list[str]):
    with open(filepath, "wb") as f:
        pickle.dump(sections, f)


def read_sections(filepath: str) -> list[str]:
    with open(filepath, "rb") as fp:
        sections = pickle.load(fp)
    return sections


def load_documents(fname: str) -> pd.DataFrame:
    df = pd.DataFrame()

    with open(fname, "rb") as fp:
        documents = pickle.load(fp)
    df["documents"] = documents
    return df


def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
    # Count tokens per document with the encoding used by the embedding model.
    encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
    df["n_tokens"] = df.documents.apply(lambda x: len(encoding.encode(x)))
    return df


def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    df["embedding"] = df.documents.apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
    return df


def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:
    # Get all documents and precompute their embeddings
    df = load_documents(filepath)
    df = compute_n_tokens(df)
    df = precompute_embeddings(df)
    df.to_csv(output_csv)
    return df


if __name__ == "__main__":
    root_dir = "/home/hadrien/perso/mila-docs/output/"
    save_filepath = os.path.join(root_dir, "sections.pkl")

    # Parse the docs and save the extracted sections
    sections, urls = get_all_sections(root_dir)
    write_sections(save_filepath, sections)

    # Reload the saved sections
    sections = read_sections(save_filepath)

    # Precompute the document embeddings
    df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")
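

# --- Example: querying the precomputed embeddings (sketch) ---
# A minimal sketch of how the saved embeddings could be used, assuming the CSV
# format written by `generate_embeddings` above. It shows why `cosine_similarity`
# is imported: rank sections by similarity to a query. The `rank_documents` name
# and the `top_k` parameter are illustrative, not part of the original pipeline.
def rank_documents(query: str, embeddings_csv: str = "data/document_embeddings.csv", top_k: int = 3) -> pd.DataFrame:
    from ast import literal_eval

    df = pd.read_csv(embeddings_csv)
    # Embeddings round-trip through CSV as strings; parse them back into lists.
    df["embedding"] = df.embedding.apply(literal_eval)

    # Embed the query with the same model, then score every section against it.
    query_embedding = get_embedding(query, engine=EMBEDDING_MODEL)
    df["similarity"] = df.embedding.apply(lambda e: cosine_similarity(e, query_embedding))
    return df.sort_values("similarity", ascending=False).head(top_k)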