"""Parse HTML documentation into sections and precompute their OpenAI embeddings."""
import glob
import math
import os

import bs4
import pandas as pd
import tiktoken
from bs4 import BeautifulSoup
from openai.embeddings_utils import get_embedding  # requires the pre-1.0 openai package

EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-ada-002

BASE_URL_MILA = "https://docs.mila.quebec/"
BASE_URL_ORION = "https://orion.readthedocs.io/en/stable/"
BASE_URL_PYTORCH = "https://pytorch.org/docs/stable/"

PICKLE_EXTENSIONS = [".gz", ".bz2", ".zip", ".xz", ".zst", ".tar", ".tar.gz", ".tar.xz", ".tar.bz2"]


def parse_section(nodes: list[bs4.element.NavigableString]) -> str:
    """Concatenate the text of `nodes`, rendering tables as GitHub-flavoured markdown."""
    section = []
    for node in nodes:
        if node.name == "table":
            node_text = pd.read_html(node.prettify())[0].to_markdown(index=False, tablefmt="github")
        else:
            node_text = node.text
        section.append(node_text)
    section = "".join(section)[1:]
    return section


def get_all_documents(
    root_dir: str, base_url: str, min_section_length: int = 100, max_section_length: int = 2000
) -> pd.DataFrame:
    """Parse all HTML files in `root_dir`, and extract all sections.

    Sections are broken into subsections if they are longer than `max_section_length`.
    Sections correspond to `section` HTML tags that have a headerlink attached.
    """
    files = glob.glob("**/*.html", root_dir=root_dir, recursive=True)

    def get_all_subsections(soup: BeautifulSoup) -> tuple[list[str], list[str], list[str]]:
        found = soup.find_all("a", href=True, class_="headerlink")

        sections = []
        urls = []
        names = []
        for section_found in found:
            section_soup = section_found.parent.parent
            section_href = section_soup.find_all("a", href=True, class_="headerlink")

            # If the section has subsections, keep only the part before the first subsection
            if len(section_href) > 1 and section_soup.section is not None:
                section_siblings = list(section_soup.section.previous_siblings)[::-1]
                section = parse_section(section_siblings)
            else:
                section = parse_section(section_soup.children)

            # Remove special characters, plus newlines in some urls and section names.
            section = section.strip()
            url = section_found["href"].strip().replace("\n", "")
            name = section_found.parent.text.strip()[:-1].replace("\n", "")

            # If the text is too long, split it into chunks of roughly equal size
            if len(section) > max_section_length:
                n_chunks = math.ceil(len(section) / float(max_section_length))
                separator_index = math.floor(len(section) / n_chunks)

                section_chunks = [section[separator_index * i : separator_index * (i + 1)] for i in range(n_chunks)]
                url_chunks = [url] * n_chunks
                name_chunks = [name] * n_chunks

                sections.extend(section_chunks)
                urls.extend(url_chunks)
                names.extend(name_chunks)
            # If the text is not too short, add it as a single chunk
            elif len(section) > min_section_length:
                sections.append(section)
                urls.append(url)
                names.append(name)

        return sections, urls, names

    sections = []
    urls = []
    names = []
    for file in files:
        filepath = os.path.join(root_dir, file)
        with open(filepath, "r") as f:
            source = f.read()

        soup = BeautifulSoup(source, "html.parser")
        sections_file, urls_file, names_file = get_all_subsections(soup)
        sections.extend(sections_file)

        urls_file = [base_url + file + url for url in urls_file]
        urls.extend(urls_file)

        names.extend(names_file)

    documents_df = pd.DataFrame.from_dict({"name": names, "url": urls, "text": sections})

    return documents_df


def get_file_extension(filepath: str) -> str:
    return os.path.splitext(filepath)[1]


def write_documents(filepath: str, documents_df: pd.DataFrame):
    ext = get_file_extension(filepath)

    if ext == ".csv":
        documents_df.to_csv(filepath, index=False)
    elif ext in PICKLE_EXTENSIONS:
        documents_df.to_pickle(filepath)
    else:
        raise ValueError(f"Unsupported format: {ext}.")


def read_documents(filepath: str) -> pd.DataFrame:
    ext = get_file_extension(filepath)

    if ext == ".csv":
        return pd.read_csv(filepath)
    elif ext in PICKLE_EXTENSIONS:
        return pd.read_pickle(filepath)
    else:
        raise ValueError(f"Unsupported format: {ext}.")


def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
    """Add an `n_tokens` column with the token count of each document's text."""
    encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
    df["n_tokens"] = df.text.apply(lambda x: len(encoding.encode(x)))
    return df


def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    """Add an `embedding` column with the OpenAI embedding of each document's text."""
    df["embedding"] = df.text.apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
    return df


def generate_embeddings(filepath: str, output_file: str) -> pd.DataFrame:
    # Get all documents and precompute their embeddings
    df = read_documents(filepath)
    df = compute_n_tokens(df)
    df = precompute_embeddings(df)
    write_documents(output_file, df)
    return df


if __name__ == "__main__":
    root_dir = "/home/hadrien/perso/mila-docs/output/"
    save_filepath = "data/documents.tar.gz"

    # How to write
    documents_df = get_all_documents(root_dir, BASE_URL_MILA)
    write_documents(save_filepath, documents_df)

    # How to load
    documents_df = read_documents(save_filepath)

    # Precompute the document embeddings
    df = generate_embeddings(filepath=save_filepath, output_file="data/document_embeddings.tar.gz")