import glob
import os
import pickle

import pandas as pd
import tiktoken
from bs4 import BeautifulSoup
from openai.embeddings_utils import cosine_similarity, get_embedding

EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-ada-002
BASE_URL = "https://docs.mila.quebec/"
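# Note: `openai.embeddings_utils` is the pre-1.0 OpenAI Python API (the module
# was removed in openai>=1.0), so this script assumes openai<1.0 is installed.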

def get_all_sections(root_dir: str, max_section_length: int = 3000) -> tuple[list[str], list[str]]:
    """Parse all HTML files in `root_dir`, and extract all sections.

    Sections are broken into subsections if they are longer than `max_section_length`.
    Sections correspond to h2 HTML tags, and move on to h3 then h4 if needed.
    """
    # NOTE: glob's `root_dir` argument requires Python 3.10+.
    files = glob.glob("*.html", root_dir=root_dir)

    # Recurse until sections are small enough.
    def get_all_subsections(soup: BeautifulSoup, level: int) -> tuple[list[str], list[str]]:
        if level >= 5:
            return [], []
        found = soup.find_all("a", href=True, class_="headerlink")
        sections = []
        urls = []
        for section_found in found:
            section_soup = section_found.parent.parent
            section = section_soup.text
            url = section_found["href"]
            if len(section) > max_section_length:
                s, u = get_all_subsections(section_soup, level + 1)
                sections.extend(s)
                urls.extend(u)
            else:
                sections.append(section)
                urls.append(url)
        return sections, urls

    sections = []
    urls = []
    for filename in files:
        filepath = os.path.join(root_dir, filename)
        with open(filepath, "r") as f:
            source = f.read()
        soup = BeautifulSoup(source, "html.parser")
        sections_file, urls_file = get_all_subsections(soup, 2)
        sections.extend(sections_file)
        # `filename` is relative to `root_dir`, so it maps directly onto the site URL.
        urls_file = [BASE_URL + filename + url for url in urls_file]
        urls.extend(urls_file)
    return sections, urls
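
# Illustrative usage (the path is hypothetical):
#   sections, urls = get_all_sections("output/")
# pairs each extracted section with the URL of its anchor on https://docs.mila.quebec/.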

def write_sections(filepath: str, sections: list[str]):
    with open(filepath, "wb") as f:
        pickle.dump(sections, f)


def read_sections(filepath: str) -> list[str]:
    with open(filepath, "rb") as fp:
        sections = pickle.load(fp)
    return sections

def load_documents(fname: str) -> pd.DataFrame:
    df = pd.DataFrame()
    with open(fname, "rb") as fp:
        documents = pickle.load(fp)
    df["documents"] = documents
    return df

def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
    encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
    df["n_tokens"] = df.documents.apply(lambda x: len(encoding.encode(x)))
    return df
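
# `n_tokens` makes it easy to check that each section fits the embedding
# model's context window (text-embedding-ada-002 accepts up to 8191 tokens).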

def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    df["embedding"] = df.documents.apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
    return df
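
# `cosine_similarity` is imported above but never used in this file. A minimal
# sketch of how the precomputed embeddings could be queried with it; the name
# `rank_documents` and the `top_k` parameter are illustrative, not part of the
# original pipeline:
def rank_documents(df: pd.DataFrame, query: str, top_k: int = 5) -> pd.DataFrame:
    # Embed the query with the same model used for the documents.
    query_embedding = get_embedding(query, engine=EMBEDDING_MODEL)
    df = df.copy()
    df["similarity"] = df.embedding.apply(lambda e: cosine_similarity(e, query_embedding))
    return df.sort_values("similarity", ascending=False).head(top_k)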

def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:
    # Get all documents and precompute their embeddings
    df = load_documents(filepath)
    df = compute_n_tokens(df)
    df = precompute_embeddings(df)
    df.to_csv(output_csv)
    return df
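
# `to_csv` above stringifies the embedding lists. A hedged sketch of loading
# them back into Python lists (`read_embeddings` is illustrative and assumes
# the CSV produced by `generate_embeddings`):
def read_embeddings(csv_path: str) -> pd.DataFrame:
    import ast

    df = pd.read_csv(csv_path, index_col=0)
    df["embedding"] = df.embedding.apply(ast.literal_eval)
    return df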

if __name__ == "__main__":
    root_dir = "/home/hadrien/perso/mila-docs/output/"
    save_filepath = os.path.join(root_dir, "sections.pkl")

    # How to write. `get_all_sections` returns (sections, urls); only the
    # sections are pickled, matching the `write_sections` signature.
    sections, urls = get_all_sections(root_dir)
    write_sections(save_filepath, sections)

    # How to load
    sections = read_sections(save_filepath)

    # Precompute the document embeddings
    df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")