hbertrand and jerpint committed
Commit 1f22b14 · unverified · 1 parent: 8b5fed9

Create SQLite db for documents (#46)


* sqlite db

* isort

* tests

* PR

* tests

* change names

* put default empty string for source

* type warning

* change paths

* Fix tests

* add kwargs

---------

Co-authored-by: Jeremy Pinto <[email protected]>

.github/workflows/tests.yaml CHANGED
@@ -20,4 +20,4 @@ jobs:
       run: |
         python3 -m pip install --upgrade pip
         pip install -e .
-        # pytest
+        pytest
buster/apps/gradio_app.ipynb CHANGED
@@ -14,7 +14,7 @@
     "from buster.chatbot import Chatbot, ChatbotConfig\n",
     "\n",
     "hf_transformers_cfg = ChatbotConfig(\n",
-    "    documents_file=\"../data/document_embeddings_hf_transformers.tar.gz\",\n",
+    "    documents_file=\"../data/document_embeddings_huggingface.tar.gz\",\n",
     "    unknown_prompt=\"This doesn't seem to be related to the huggingface library. I am not sure how to answer.\",\n",
     "    embedding_model=\"text-embedding-ada-002\",\n",
     "    top_k=3,\n",
@@ -123,7 +123,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.12"
+   "version": "3.9.12 (main, Apr 5 2022, 01:52:34) \n[Clang 12.0.0 ]"
   },
   "vscode": {
    "interpreter": {
buster/apps/slackbot.py CHANGED
@@ -15,7 +15,7 @@ PYTORCH_CHANNEL = "C04MEK6N882"
 HF_TRANSFORMERS_CHANNEL = "C04NJNCJWHE"
 
 mila_doc_cfg = ChatbotConfig(
-    documents_file="../data/document_embeddings.csv",
+    documents_file="../data/document_embeddings_mila.tar.gz",
     unknown_prompt="This doesn't seem to be related to cluster usage.",
     embedding_model="text-embedding-ada-002",
     top_k=3,
@@ -51,7 +51,7 @@ mila_doc_cfg = ChatbotConfig(
 mila_doc_chatbot = Chatbot(mila_doc_cfg)
 
 orion_cfg = ChatbotConfig(
-    documents_file="../data/document_embeddings_orion.csv",
+    documents_file="../data/document_embeddings_orion.tar.gz",
     unknown_prompt="This doesn't seem to be related to the orion library. I am not sure how to answer.",
     embedding_model="text-embedding-ada-002",
     top_k=3,
@@ -117,7 +117,7 @@ pytorch_cfg = ChatbotConfig(
 pytorch_chatbot = Chatbot(pytorch_cfg)
 
 hf_transformers_cfg = ChatbotConfig(
-    documents_file="../data/document_embeddings_hf_transformers.tar.gz",
+    documents_file="../data/document_embeddings_huggingface.tar.gz",
     unknown_prompt="This doesn't seem to be related to the huggingface library. I am not sure how to answer.",
     embedding_model="text-embedding-ada-002",
     top_k=3,
buster/chatbot.py CHANGED
@@ -123,7 +123,7 @@ class Chatbot:
 
     def prepare_documents(self, matched_documents: pd.DataFrame, max_words: int) -> str:
         # gather the documents in one large plaintext variable
-        documents_list = matched_documents.text.to_list()
+        documents_list = matched_documents.content.to_list()
         documents_str = " ".join(documents_list)
 
         # truncate the documents to fit
@@ -181,17 +181,17 @@ class Chatbot:
         """
 
         urls = matched_documents.url.to_list()
-        names = matched_documents.name.to_list()
+        titles = matched_documents.title.to_list()
         similarities = matched_documents.similarity.to_list()
 
         response += f"{sep}{sep}📝 Here are the sources I used to answer your question:{sep}{sep}"
-        for url, name, similarity in zip(urls, names, similarities):
+        for url, title, similarity in zip(urls, titles, similarities):
             if format == "markdown":
-                response += f"[🔗 {name}]({url}), relevance: {similarity:2.3f}{sep}"
+                response += f"[🔗 {title}]({url}), relevance: {similarity:2.3f}{sep}"
             elif format == "html":
-                response += f"<a href='{url}'>🔗 {name}</a>{sep}"
+                response += f"<a href='{url}'>🔗 {title}</a>{sep}"
             elif format == "slack":
-                response += f"<{url}|🔗 {name}>, relevance: {similarity:2.3f}{sep}"
+                response += f"<{url}|🔗 {title}>, relevance: {similarity:2.3f}{sep}"
             else:
                 raise ValueError(f"{format} is not a valid URL format.")
 
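For illustration only, a minimal sketch of the renamed columns in use; the frame below is made-up data and the loop mirrors the markdown branch shown above:

import pandas as pd

# Made-up example data; `title` and `content` replace the old `name` and `text` columns
matched_documents = pd.DataFrame(
    {
        "title": ["Installation"],
        "url": ["https://docs.mila.quebec/install.html"],  # illustrative page under the real base_url
        "content": ["How to install ..."],
        "similarity": [0.871],
    }
)

sep = "\n"
response = ""
for url, title, similarity in zip(
    matched_documents.url.to_list(),
    matched_documents.title.to_list(),
    matched_documents.similarity.to_list(),
):
    response += f"[🔗 {title}]({url}), relevance: {similarity:2.3f}{sep}"
print(response)  # [🔗 Installation](https://docs.mila.quebec/install.html), relevance: 0.871
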
buster/db.py ADDED
@@ -0,0 +1,117 @@
+import sqlite3
+import warnings
+import zlib
+
+import numpy as np
+import pandas as pd
+
+documents_table = """CREATE TABLE IF NOT EXISTS documents (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    source TEXT NOT NULL,
+    title TEXT NOT NULL,
+    url TEXT NOT NULL,
+    content TEXT NOT NULL,
+    n_tokens INTEGER,
+    embedding BLOB,
+    current INTEGER
+)"""
+
+qa_table = """CREATE TABLE IF NOT EXISTS qa (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    source TEXT NOT NULL,
+    prompt TEXT NOT NULL,
+    answer TEXT NOT NULL,
+    document_id_1 INTEGER,
+    document_id_2 INTEGER,
+    document_id_3 INTEGER,
+    label_question INTEGER,
+    label_answer INTEGER,
+    testset INTEGER,
+    FOREIGN KEY (document_id_1) REFERENCES documents (id),
+    FOREIGN KEY (document_id_2) REFERENCES documents (id),
+    FOREIGN KEY (document_id_3) REFERENCES documents (id)
+)"""
+
+
+class DocumentsDB:
+    """Simple SQLite database for storing documents and questions/answers.
+
+    The database is just a file on disk. It can store documents from different sources, and it can store multiple versions of the same document (e.g. if the document is updated).
+    Questions/answers refer to the version of the document that was used at the time.
+
+    Example:
+        >>> db = DocumentsDB("/path/to/the/db.db")
+        >>> db.write_documents("source", df)  # df is a DataFrame containing the documents from a given source, obtained e.g. by using buster.docparser.generate_embeddings
+        >>> df = db.get_documents("source")
+    """
+
+    def __init__(self, db_path):
+        self.db_path = db_path
+        self.conn = sqlite3.connect(db_path)
+        self.cursor = self.conn.cursor()
+
+        self.__initialize()
+
+    def __del__(self):
+        self.conn.close()
+
+    def __initialize(self):
+        """Initialize the database."""
+        self.cursor.execute(documents_table)
+        self.cursor.execute(qa_table)
+        self.conn.commit()
+
+    def write_documents(self, source: str, df: pd.DataFrame):
+        """Write all documents from the dataframe into the db. All previous documents from that source will be set to `current = 0`."""
+        df = df.copy()
+
+        # Prepare the rows
+        df["source"] = source
+        df["current"] = 1
+        columns = ["source", "title", "url", "content", "current"]
+        if "embedding" in df.columns:
+            columns.extend(
+                [
+                    "n_tokens",
+                    "embedding",
+                ]
+            )
+
+        # Check that the embeddings are float32
+        if not df["embedding"].iloc[0].dtype == np.float32:
+            warnings.warn(
+                f"Embeddings are not float32, converting them to float32 from {df['embedding'].iloc[0].dtype}.",
+                RuntimeWarning,
+            )
+            df["embedding"] = df["embedding"].apply(lambda x: x.astype(np.float32))
+
+        # ZLIB compress the embeddings
+        df["embedding"] = df["embedding"].apply(lambda x: sqlite3.Binary(zlib.compress(x.tobytes())))
+
+        data = df[columns].values.tolist()
+
+        # Set `current` to 0 for all previous documents from that source
+        self.cursor.execute("UPDATE documents SET current = 0 WHERE source = ?", (source,))
+
+        # Insert the new documents
+        insert_statement = f"INSERT INTO documents ({', '.join(columns)}) VALUES ({', '.join(['?']*len(columns))})"
+        self.cursor.executemany(insert_statement, data)
+
+        self.conn.commit()
+
+    def get_documents(self, source: str) -> pd.DataFrame:
+        """Get all current documents from a given source."""
+        # Execute the SQL statement and fetch the results
+        results = self.cursor.execute("SELECT * FROM documents WHERE source = ? AND current = 1", (source,))
+        rows = results.fetchall()
+
+        # Convert the results to a pandas DataFrame
+        df = pd.DataFrame(rows, columns=[description[0] for description in results.description])
+
+        # ZLIB decompress the embeddings
+        df["embedding"] = df["embedding"].apply(lambda x: np.frombuffer(zlib.decompress(x), dtype=np.float32).tolist())
+
+        # Drop the `current` column
+        df.drop(columns=["current"], inplace=True)
+
+        return df
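
For orientation, a minimal usage sketch of the class above. The values are made up, the in-memory path keeps it self-contained, and the frame includes an embedding column since `write_documents` as written compresses float32 embeddings:

import numpy as np
import pandas as pd

from buster.db import DocumentsDB

db = DocumentsDB(":memory:")  # any file path works the same way

docs = pd.DataFrame(
    {
        "title": ["Installation"],
        "url": ["https://docs.mila.quebec/install.html"],  # made-up page for the example
        "embedding": [np.random.rand(1536).astype(np.float32)],  # ada-002-sized vector, random here
        "content": ["How to install ..."],
        "n_tokens": [5],
    }
)

# Older "mila" rows are flagged current = 0; these rows are inserted with current = 1
db.write_documents("mila", docs)

# Only rows with current = 1 come back; embeddings are decompressed into lists
current_docs = db.get_documents("mila")
print(current_docs[["source", "title", "url", "n_tokens"]])
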
buster/docparser.py CHANGED
@@ -7,6 +7,7 @@ import tiktoken
 from bs4 import BeautifulSoup
 from openai.embeddings_utils import get_embedding
 
+from buster.db import DocumentsDB
 from buster.parser import HuggingfaceParser, Parser, SphinxParser
 
 EMBEDDING_MODEL = "text-embedding-ada-002"
@@ -19,22 +20,22 @@ PICKLE_EXTENSIONS = [".gz", ".bz2", ".zip", ".xz", ".zst", ".tar", ".tar.gz", ".
 supported_docs = {
     "mila": {
         "base_url": "https://docs.mila.quebec/",
-        "filename": "documents_mila.tar.gz",
+        "filename": "documents_mila.csv",
         "parser": SphinxParser,
     },
     "orion": {
         "base_url": "https://orion.readthedocs.io/en/stable/",
-        "filename": "documents_orion.tar.gz",
+        "filename": "documents_orion.csv",
         "parser": SphinxParser,
     },
     "pytorch": {
         "base_url": "https://pytorch.org/docs/stable/",
-        "filename": "documents_pytorch.tar.gz",
+        "filename": "documents_pytorch.csv",
         "parser": SphinxParser,
     },
     "huggingface": {
         "base_url": "https://huggingface.co/docs/transformers/",
-        "filename": "documents_huggingface.tar.gz",
+        "filename": "documents_huggingface.csv",
         "parser": HuggingfaceParser,
     },
 }
@@ -66,7 +67,7 @@ def get_all_documents(
     urls.extend(urls_file)
     names.extend(names_file)
 
-    documents_df = pd.DataFrame.from_dict({"name": names, "url": urls, "text": sections})
+    documents_df = pd.DataFrame.from_dict({"title": names, "url": urls, "content": sections})
 
     return documents_df
 
@@ -75,46 +76,58 @@ def get_file_extension(filepath: str) -> str:
     return os.path.splitext(filepath)[1]
 
 
-def write_documents(filepath: str, documents_df: pd.DataFrame):
+def write_documents(filepath: str, documents_df: pd.DataFrame, source: str = ""):
     ext = get_file_extension(filepath)
 
     if ext == ".csv":
         documents_df.to_csv(filepath, index=False)
     elif ext in PICKLE_EXTENSIONS:
         documents_df.to_pickle(filepath)
+    elif ext == ".db":
+        db = DocumentsDB(filepath)
+        db.write_documents(source, documents_df)
     else:
         raise ValueError(f"Unsupported format: {ext}.")
 
 
-def read_documents(filepath: str) -> pd.DataFrame:
+def read_documents(filepath: str, source: str = "") -> pd.DataFrame:
     ext = get_file_extension(filepath)
 
     if ext == ".csv":
         df = pd.read_csv(filepath)
-        df["embedding"] = df.embedding.apply(eval).apply(np.array)
-        return df
+
+        if "embedding" in df.columns:
+            df["embedding"] = df.embedding.apply(eval).apply(np.array)
     elif ext in PICKLE_EXTENSIONS:
-        return pd.read_pickle(filepath)
+        df = pd.read_pickle(filepath)
+
+        if "embedding" in df.columns:
+            df["embedding"] = df.embedding.apply(np.array)
+    elif ext == ".db":
+        db = DocumentsDB(filepath)
+        df = db.get_documents(source)
     else:
         raise ValueError(f"Unsupported format: {ext}.")
 
+    return df
+
 
 def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
     encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
     # TODO are there unexpected consequences of allowing endoftext?
-    df["n_tokens"] = df.text.apply(lambda x: len(encoding.encode(x, allowed_special={"<|endoftext|>"})))
+    df["n_tokens"] = df.content.apply(lambda x: len(encoding.encode(x, allowed_special={"<|endoftext|>"})))
    return df
 
 
 def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
-    df["embedding"] = df.text.apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
+    df["embedding"] = df.content.apply(lambda x: np.asarray(get_embedding(x, engine=EMBEDDING_MODEL), dtype=np.float32))
     return df
 
 
-def generate_embeddings(filepath: str, output_file: str) -> pd.DataFrame:
+def generate_embeddings(filepath: str, output_file: str, source: str) -> pd.DataFrame:
     # Get all documents and precompute their embeddings
-    df = read_documents(filepath)
+    df = read_documents(filepath, source)
     df = compute_n_tokens(df)
     df = precompute_embeddings(df)
-    write_documents(output_file, df)
+    write_documents(filepath=output_file, documents_df=df, source=source)
     return df
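
A short round-trip sketch of the reworked helpers with the new `.db` extension (toy values; the `.csv` and pickle branches behave as before):

import numpy as np
import pandas as pd

from buster.docparser import read_documents, write_documents

# Toy frame using the new column names (title / url / content);
# writing to a .db currently expects float32 embeddings alongside n_tokens
docs = pd.DataFrame(
    {
        "title": ["test"],
        "url": ["http://url.com"],
        "content": ["cool text"],
        "embedding": [np.arange(4, dtype=np.float32)],
        "n_tokens": [3],
    }
)

# A ".db" path dispatches to DocumentsDB; `source` tags the rows
write_documents(filepath="documents.db", documents_df=docs, source="test")

# Reading the same source back returns the rows currently marked as current
df = read_documents("documents.db", source="test")
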
db_to_csv.ipynb ADDED
@@ -0,0 +1,60 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Example notebook on how to extract a source from the database and save it in another format"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "from buster.docparser import read_documents, write_documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Path to the database\n",
+    "db_path = \"documents.db\"\n",
+    "\n",
+    "# Source to extract\n",
+    "target = \"pytorch\"\n",
+    "df = read_documents(db_path, target)\n",
+    "\n",
+    "# If you want to save it as tar.gz\n",
+    "filepath = os.path.join('buster/data/', f'document_embeddings_{target}.tar.gz')\n",
+    "write_documents(filepath, df, target)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "milabot",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.9"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "9db6f4b791ef587fd310257e87896b12053c9010399595f881592a25a8a29679"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
requirements.txt CHANGED
@@ -6,6 +6,7 @@ tabulate
 tenacity
 tiktoken
 promptlayer
+pytest
 openai
 
 # all openai[embeddings] deps, their list breaks our CI, see: https://github.com/openai/openai-python/issues/210
tests/test_db.py ADDED
@@ -0,0 +1,62 @@
+import numpy as np
+import pandas as pd
+
+from buster.db import DocumentsDB
+
+
+def test_write_read():
+    db = DocumentsDB(":memory:")
+
+    data = pd.DataFrame.from_dict(
+        {
+            "title": ["test"],
+            "url": ["http://url.com"],
+            "content": ["cool text"],
+            "embedding": [np.arange(10, dtype=np.float32) - 0.3],
+            "n_tokens": [10],
+        }
+    )
+    db.write_documents(source="test", df=data)
+
+    db_data = db.get_documents("test")
+
+    assert db_data["title"].iloc[0] == data["title"].iloc[0]
+    assert db_data["url"].iloc[0] == data["url"].iloc[0]
+    assert db_data["content"].iloc[0] == data["content"].iloc[0]
+    assert np.allclose(db_data["embedding"].iloc[0], data["embedding"].iloc[0])
+    assert db_data["n_tokens"].iloc[0] == data["n_tokens"].iloc[0]
+
+
+def test_write_write_read():
+    db = DocumentsDB(":memory:")
+
+    data_1 = pd.DataFrame.from_dict(
+        {
+            "title": ["test"],
+            "url": ["http://url.com"],
+            "content": ["cool text"],
+            "embedding": [np.arange(10, dtype=np.float32) - 0.3],
+            "n_tokens": [10],
+        }
+    )
+    db.write_documents(source="test", df=data_1)
+
+    data_2 = pd.DataFrame.from_dict(
+        {
+            "title": ["other"],
+            "url": ["http://url.com/page.html"],
+            "content": ["lorem ipsum"],
+            "embedding": [np.arange(20, dtype=np.float32) / 10 - 2.3],
+            "n_tokens": [20],
+        }
+    )
+    db.write_documents(source="test", df=data_2)
+
+    db_data = db.get_documents("test")
+
+    assert len(db_data) == len(data_2)
+    assert db_data["title"].iloc[0] == data_2["title"].iloc[0]
+    assert db_data["url"].iloc[0] == data_2["url"].iloc[0]
+    assert db_data["content"].iloc[0] == data_2["content"].iloc[0]
+    assert np.allclose(db_data["embedding"].iloc[0], data_2["embedding"].iloc[0])
+    assert db_data["n_tokens"].iloc[0] == data_2["n_tokens"].iloc[0]
tests/test_docparser.py ADDED
@@ -0,0 +1,30 @@
+import numpy as np
+import pandas as pd
+
+from buster.docparser import generate_embeddings, read_documents, write_documents
+
+
+def test_generate_embeddings(tmp_path, monkeypatch):
+    # Patch the get_embedding function to return a fixed embedding
+    monkeypatch.setattr("buster.docparser.get_embedding", lambda x, engine: [-0.005, 0.0018])
+
+    # Create fake data
+    data = pd.DataFrame.from_dict({"title": ["test"], "url": ["http://url.com"], "content": ["cool text"]})
+
+    # Write the data to a file
+    filepath = tmp_path / "test_document.csv"
+    write_documents(filepath=filepath, documents_df=data, source="test")
+
+    # Generate embeddings, store in a file
+    output_file = tmp_path / "test_document_embeddings.tar.gz"
+    df = generate_embeddings(filepath=filepath, output_file=output_file, source="test")
+
+    # Read the embeddings from the file
+    read_df = read_documents(output_file, "test")
+
+    # Check all the values are correct across the files
+    assert df["title"].iloc[0] == data["title"].iloc[0] == read_df["title"].iloc[0]
+    assert df["url"].iloc[0] == data["url"].iloc[0] == read_df["url"].iloc[0]
+    assert df["content"].iloc[0] == data["content"].iloc[0] == read_df["content"].iloc[0]
+    assert np.allclose(df["embedding"].iloc[0], read_df["embedding"].iloc[0])
+    assert df["n_tokens"].iloc[0] == read_df["n_tokens"].iloc[0]