import glob
import os
from typing import Type

import numpy as np
import pandas as pd
import tiktoken
from bs4 import BeautifulSoup

# Note: openai.embeddings_utils was removed in openai>=1.0; this import
# requires the pre-1.0 openai package.
from openai.embeddings_utils import get_embedding

from buster.documents import get_documents_manager_from_extension
from buster.parser import HuggingfaceParser, Parser, SphinxParser

EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-ada-002

# Maps each supported documentation source to its base URL, output filename,
# and the parser class used to split its HTML into sections.
supported_docs = {
    "mila": {
        "base_url": "https://docs.mila.quebec/",
        "filename": "documents_mila.csv",
        "parser": SphinxParser,
    },
    "orion": {
        "base_url": "https://orion.readthedocs.io/en/stable/",
        "filename": "documents_orion.csv",
        "parser": SphinxParser,
    },
    "pytorch": {
        "base_url": "https://pytorch.org/docs/stable/",
        "filename": "documents_pytorch.csv",
        "parser": SphinxParser,
    },
    "huggingface": {
        "base_url": "https://huggingface.co/docs/transformers/",
        "filename": "documents_huggingface.csv",
        "parser": HuggingfaceParser,
    },
}
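
# Illustrative sketch (not part of the original config): any other Sphinx-built
# site whose sections carry headerlinks could be registered the same way. The
# "numpy" key, URL, and filename below are hypothetical examples.
# supported_docs["numpy"] = {
#     "base_url": "https://numpy.org/doc/stable/",
#     "filename": "documents_numpy.csv",
#     "parser": SphinxParser,
# }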


def get_all_documents(
    root_dir: str,
    base_url: str,
    parser_cls: Type[Parser],
    min_section_length: int = 100,
    max_section_length: int = 2000,
) -> pd.DataFrame:
    """Parse all HTML files in `root_dir` and extract all sections.

    Sections correspond to `section` HTML tags that have a headerlink attached,
    and are broken into subsections if they are longer than `max_section_length`.
    """
    # glob's root_dir parameter requires Python 3.10+.
    files = glob.glob("**/*.html", root_dir=root_dir, recursive=True)

    sections = []
    urls = []
    names = []
    for file in files:
        filepath = os.path.join(root_dir, file)
        with open(filepath, "r") as f:
            source = f.read()

        soup = BeautifulSoup(source, "html.parser")
        parser = parser_cls(soup, base_url, file, min_section_length, max_section_length)
        for section in parser.parse():
            sections.append(section.text)
            urls.append(section.url)
            names.append(section.name)

    documents_df = pd.DataFrame.from_dict({"title": names, "url": urls, "content": sections})

    return documents_df
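
# Example usage (hypothetical local path): parse a pre-downloaded copy of the
# Orion docs into a DataFrame of (title, url, content) rows. The HTML must
# already be on disk, e.g. fetched with `wget --recursive`.
# df = get_all_documents(
#     root_dir="/tmp/orion_docs",
#     base_url=supported_docs["orion"]["base_url"],
#     parser_cls=supported_docs["orion"]["parser"],
# )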


def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
    """Add an `n_tokens` column counting each document's tokens with tiktoken."""
    encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
    # TODO are there unexpected consequences of allowing endoftext?
    df["n_tokens"] = df.content.apply(lambda x: len(encoding.encode(x, allowed_special={"<|endoftext|>"})))
    return df
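
# Quick sanity check (illustrative): cl100k_base token counts differ from
# whitespace word counts, which is why they are computed explicitly here.
# encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
# encoding.encode("Hello world")  # -> 2 token ids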


def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    """Embed each document's content with the OpenAI API, stored as float32 arrays."""
    df["embedding"] = df.content.apply(lambda x: np.asarray(get_embedding(x, engine=EMBEDDING_MODEL), dtype=np.float32))
    return df
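

# Downstream sketch (not part of the original script): the stored float32
# vectors are typically ranked by cosine similarity at query time. ada-002
# embeddings come back unit-normalized, so a plain dot product gives the same
# ranking, but the general form is:
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two 1-D embedding vectors."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))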


def generate_embeddings(root_dir: str, output_filepath: str, source: str) -> pd.DataFrame:
    """Parse the documents for `source`, compute token counts and embeddings, and persist them."""
    # Get all documents and precompute their embeddings
    documents = get_all_documents(root_dir, supported_docs[source]["base_url"], supported_docs[source]["parser"])
    documents = compute_n_tokens(documents)
    documents = precompute_embeddings(documents)

    # The manager class is selected from the output file's extension, then
    # instantiated on that same path.
    documents_manager = get_documents_manager_from_extension(output_filepath)(output_filepath)
    documents_manager.add(source, documents)

    return documents
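

# Example invocation (hypothetical paths): build and persist embeddings for the
# Mila docs. The documents manager implementation is chosen from the output
# file's extension by get_documents_manager_from_extension.
if __name__ == "__main__":
    generate_embeddings(
        root_dir="/tmp/mila_docs",
        output_filepath="documents_mila.csv",
        source="mila",
    )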