Spaces:

jerpint
/

buster-dev

Runtime error

App Files Community

Marc-Antoine Rondeau committed on Feb 28, 2023

Commit

fb83544

1 Parent(s): 97aefb5

Moved schema to replace previous implementation

Browse files

Files changed (5) hide show

buster/documents/sqlite.py +0 -122
buster/{db → documents/sqlite}/__init__.py +0 -0
buster/{db → documents/sqlite}/backward.py +6 -9
buster/{db → documents/sqlite}/documents.py +39 -51
buster/{db → documents/sqlite}/schema.py +2 -4

buster/documents/sqlite.py DELETED Viewed

@@ -1,122 +0,0 @@
-import sqlite3
-import warnings
-import zlib
-import numpy as np
-import pandas as pd
-from buster.documents.base import DocumentsManager
-documents_table = """CREATE TABLE IF NOT EXISTS documents (
-    id INTEGER PRIMARY KEY AUTOINCREMENT,
-    source TEXT NOT NULL,
-    title TEXT NOT NULL,
-    url TEXT NOT NULL,
-    content TEXT NOT NULL,
-    n_tokens INTEGER,
-    embedding BLOB,
-    current INTEGER
-)"""
-qa_table = """CREATE TABLE IF NOT EXISTS qa (
-    id INTEGER PRIMARY KEY AUTOINCREMENT,
-    source TEXT NOT NULL,
-    prompt TEXT NOT NULL,
-    answer TEXT NOT NULL,
-    document_id_1 INTEGER,
-    document_id_2 INTEGER,
-    document_id_3 INTEGER,
-    label_question INTEGER,
-    label_answer INTEGER,
-    testset INTEGER,
-    FOREIGN KEY (document_id_1) REFERENCES documents (id),
-    FOREIGN KEY (document_id_2) REFERENCES documents (id),
-    FOREIGN KEY (document_id_3) REFERENCES documents (id)
-)"""
-class DocumentsDB(DocumentsManager):
-    """Simple SQLite database for storing documents and questions/answers.
-    The database is just a file on disk. It can store documents from different sources, and it can store multiple versions of the same document (e.g. if the document is updated).
-    Questions/answers refer to the version of the document that was used at the time.
-    Example:
-        >>> db = DocumentsDB("/path/to/the/db.db")
-        >>> db.add("source", df)  # df is a DataFrame containing the documents from a given source, obtained e.g. by using buster.docparser.generate_embeddings
-        >>> df = db.get_documents("source")
-    """
-    def __init__(self, filepath: str):
-        self.db_path = filepath
-        self.conn = sqlite3.connect(filepath)
-        self.cursor = self.conn.cursor()
-        self.__initialize()
-    def __del__(self):
-        self.conn.close()
-    def __initialize(self):
-        """Initialize the database."""
-        self.cursor.execute(documents_table)
-        self.cursor.execute(qa_table)
-        self.conn.commit()
-    def add(self, source: str, df: pd.DataFrame):
-        """Write all documents from the dataframe into the db. All previous documents from that source will be set to `current = 0`."""
-        df = df.copy()
-        # Prepare the rows
-        df["source"] = source
-        df["current"] = 1
-        columns = ["source", "title", "url", "content", "current"]
-        if "embedding" in df.columns:
-            columns.extend(
-                [
-                    "n_tokens",
-                    "embedding",
-                ]
-            )
-            # Check that the embeddings are float32
-            if not df["embedding"].iloc[0].dtype == np.float32:
-                warnings.warn(
-                    f"Embeddings are not float32, converting them to float32 from {df['embedding'].iloc[0].dtype}.",
-                    RuntimeWarning,
-                )
-                df["embedding"] = df["embedding"].apply(lambda x: x.astype(np.float32))
-            # ZLIB compress the embeddings
-            df["embedding"] = df["embedding"].apply(lambda x: sqlite3.Binary(zlib.compress(x.tobytes())))
-        data = df[columns].values.tolist()
-        # Set `current` to 0 for all previous documents from that source
-        self.cursor.execute("UPDATE documents SET current = 0 WHERE source = ?", (source,))
-        # Insert the new documents
-        insert_statement = f"INSERT INTO documents ({', '.join(columns)}) VALUES ({', '.join(['?']*len(columns))})"
-        self.cursor.executemany(insert_statement, data)
-        self.conn.commit()
-    def get_documents(self, source: str) -> pd.DataFrame:
-        """Get all current documents from a given source."""
-        # Execute the SQL statement and fetch the results
-        if source is not None:
-            results = self.cursor.execute("SELECT * FROM documents WHERE source = ? AND current = 1", (source,))
-        else:
-            results = self.cursor.execute("SELECT * FROM documents WHERE current = 1")
-        rows = results.fetchall()
-        # Convert the results to a pandas DataFrame
-        df = pd.DataFrame(rows, columns=[description[0] for description in results.description])
-        # ZLIB decompress the embeddings
-        df["embedding"] = df["embedding"].apply(lambda x: np.frombuffer(zlib.decompress(x), dtype=np.float32).tolist())
-        # Drop the `current` column
-        df.drop(columns=["current"], inplace=True)
-        return df

buster/{db → documents/sqlite}/__init__.py RENAMED Viewed

File without changes

buster/{db → documents/sqlite}/backward.py RENAMED Viewed

@@ -2,15 +2,13 @@
 import argparse
 import itertools
 from typing import Iterable, NamedTuple
 import numpy as np
-import sqlite3
-from buster.db import DocumentsDB
-import buster.db.documents as dest
 IMPORT_QUERY = (
     r"""SELECT source, url, title, content FROM documents WHERE current = 1 ORDER BY source, url, title, id"""
@@ -90,15 +88,14 @@ def main():
     db = DocumentsDB(args.destination)
     for source, content in get_documents(org):
-        sid, vid = db.start_version(source)
         sections = (dest.Section(section.title, section.url, section.content) for section in content)
-        db.add_sections(sid, vid, sections)
     size = max(args.size, get_max_size(org))
     for source, chunks in get_chunks(org):
         sid, vid = db.get_current_version(source)
-        cid = db.add_chunking(sid, vid, size)
-        db.add_chunks(sid, vid, cid, chunks)
     db.conn.commit()
     return

 import argparse
 import itertools
+import sqlite3
 from typing import Iterable, NamedTuple
 import numpy as np
+import buster.documents.sqlite.documents as dest
+from buster.documents.sqlite import DocumentsDB
 IMPORT_QUERY = (
     r"""SELECT source, url, title, content FROM documents WHERE current = 1 ORDER BY source, url, title, id"""
     db = DocumentsDB(args.destination)
     for source, content in get_documents(org):
+        # sid, vid = db.start_version(source)
         sections = (dest.Section(section.title, section.url, section.content) for section in content)
+        db.add_parse(source, sections)
     size = max(args.size, get_max_size(org))
     for source, chunks in get_chunks(org):
         sid, vid = db.get_current_version(source)
+        db.add_chunking(sid, vid, size, chunks)
     db.conn.commit()
     return

buster/{db → documents/sqlite}/documents.py RENAMED Viewed

@@ -1,12 +1,15 @@
 import sqlite3
-from typing import Iterable, NamedTuple
 import warnings
 import zlib
 import numpy as np
 import pandas as pd
-import buster.db.schema as schema
 class Section(NamedTuple):
@@ -23,7 +26,7 @@ class Chunk(NamedTuple):
     emb: np.ndarray
-class DocumentsDB:
     """Simple SQLite database for storing documents and questions/answers.
     The database is just a file on disk. It can store documents from different sources, and it can store multiple versions of the same document (e.g. if the document is updated).
@@ -36,13 +39,12 @@ class DocumentsDB:
     """
     def __init__(self, db_path: sqlite3.Connection | str):
-        if isinstance(db_path, str):
             self.db_path = db_path
-            self.conn = sqlite3.connect(db_path)
         else:
             self.db_path = None
             self.conn = db_path
-        self.cursor = self.conn.cursor()
         schema.initialize_db(self.conn)
         schema.setup_db(self.conn)
@@ -51,6 +53,7 @@ class DocumentsDB:
             self.conn.close()
     def get_current_version(self, source: str) -> tuple[int, int]:
         cur = self.conn.execute("SELECT source, version FROM latest_version WHERE name = ?", (source,))
         row = cur.fetchone()
         if row is None:
@@ -59,6 +62,7 @@ class DocumentsDB:
         return sid, vid
     def get_source(self, source: str) -> int:
         cur = self.conn.execute("SELECT id FROM sources WHERE name = ?", (source,))
         row = cur.fetchone()
         if row is not None:
@@ -71,7 +75,8 @@ class DocumentsDB:
         return sid
-    def start_version(self, source: str) -> tuple[int, int]:
         cur = self.conn.execute("SELECT source, version FROM latest_version WHERE name = ?", (source,))
         row = cur.fetchone()
         if row is None:
@@ -83,7 +88,9 @@ class DocumentsDB:
         self.conn.execute("INSERT INTO versions (source, version) VALUES (?, ?)", (sid, vid))
         return sid, vid
-    def add_sections(self, sid: int, vid: int, sections: Iterable[Section]):
         values = (
             (sid, vid, ind, section.title, section.url, section.content, section.parent, section.type)
             for ind, section in enumerate(sections)
@@ -94,9 +101,10 @@ class DocumentsDB:
             "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
             values,
         )
-        return
-    def add_chunking(self, sid: int, vid: int, size: int, overlap: int = 0, strategy: str = "simple") -> int:
         self.conn.execute(
             "INSERT INTO chunkings (size, overlap, strategy, source, version) VALUES (?, ?, ?, ?, ?)",
             (size, overlap, strategy, sid, vid),
@@ -109,7 +117,9 @@ class DocumentsDB:
         (id,) = (id for id, in cur)
         return id
-    def add_chunks(self, sid: int, vid: int, cid: int, sections: Iterable[Iterable[Chunk]]):
         chunks = ((ind, jnd, chunk) for ind, section in enumerate(sections) for jnd, chunk in enumerate(section))
         values = ((sid, vid, ind, cid, jnd, chunk.content, chunk.n_tokens, chunk.emb) for ind, jnd, chunk in chunks)
         self.conn.executemany(
@@ -118,51 +128,29 @@ class DocumentsDB:
             "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
             values,
         )
-        return
-    def write_documents(self, source: str, df: pd.DataFrame):
-        """Write all documents from the dataframe into the db. All previous documents from that source will be set to `current = 0`."""
-        df = df.copy()
-        # Prepare the rows
-        df["source"] = source
-        df["current"] = 1
-        columns = ["source", "title", "url", "content", "current"]
-        if "embedding" in df.columns:
-            columns.extend(
-                [
-                    "n_tokens",
-                    "embedding",
-                ]
-            )
-            # Check that the embeddings are float32
-            if not df["embedding"].iloc[0].dtype == np.float32:
-                warnings.warn(
-                    f"Embeddings are not float32, converting them to float32 from {df['embedding'].iloc[0].dtype}.",
-                    RuntimeWarning,
-                )
-                df["embedding"] = df["embedding"].apply(lambda x: x.astype(np.float32))
-            # ZLIB compress the embeddings
-            df["embedding"] = df["embedding"].apply(lambda x: sqlite3.Binary(zlib.compress(x.tobytes())))
-        data = df[columns].values.tolist()
-        # Set `current` to 0 for all previous documents from that source
-        self.cursor.execute("UPDATE documents SET current = 0 WHERE source = ?", (source,))
-        # Insert the new documents
-        insert_statement = f"INSERT INTO documents ({', '.join(columns)}) VALUES ({', '.join(['?']*len(columns))})"
-        self.cursor.executemany(insert_statement, data)
-        self.conn.commit()
     def get_documents(self, source: str) -> pd.DataFrame:
         """Get all current documents from a given source."""
         # Execute the SQL statement and fetch the results
-        results = self.cursor.execute("SELECT * FROM documents WHERE source = ?", (source,))
         rows = results.fetchall()
         # Convert the results to a pandas DataFrame
         df = pd.DataFrame(rows, columns=[description[0] for description in results.description])

+import itertools
 import sqlite3
 import warnings
 import zlib
+from pathlib import Path
+from typing import Iterable, NamedTuple
 import numpy as np
 import pandas as pd
+import buster.documents.sqlite.schema as schema
+from buster.documents.base import DocumentsManager
 class Section(NamedTuple):
     emb: np.ndarray
+class DocumentsDB(DocumentsManager):
     """Simple SQLite database for storing documents and questions/answers.
     The database is just a file on disk. It can store documents from different sources, and it can store multiple versions of the same document (e.g. if the document is updated).
     """
     def __init__(self, db_path: sqlite3.Connection | str):
+        if isinstance(db_path, (str, Path)):
             self.db_path = db_path
+            self.conn = sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES)
         else:
             self.db_path = None
             self.conn = db_path
         schema.initialize_db(self.conn)
         schema.setup_db(self.conn)
             self.conn.close()
     def get_current_version(self, source: str) -> tuple[int, int]:
+        """Get the current version of a source."""
         cur = self.conn.execute("SELECT source, version FROM latest_version WHERE name = ?", (source,))
         row = cur.fetchone()
         if row is None:
         return sid, vid
     def get_source(self, source: str) -> int:
+        """Get the id of a source."""
         cur = self.conn.execute("SELECT id FROM sources WHERE name = ?", (source,))
         row = cur.fetchone()
         if row is not None:
         return sid
+    def new_version(self, source: str) -> tuple[int, int]:
+        """Create a new version for a source."""
         cur = self.conn.execute("SELECT source, version FROM latest_version WHERE name = ?", (source,))
         row = cur.fetchone()
         if row is None:
         self.conn.execute("INSERT INTO versions (source, version) VALUES (?, ?)", (sid, vid))
         return sid, vid
+    def add_parse(self, source: str, sections: Iterable[Section]) -> tuple[int, int]:
+        """Create a new version of a source filled with parsed sections."""
+        sid, vid = self.new_version(source)
         values = (
             (sid, vid, ind, section.title, section.url, section.content, section.parent, section.type)
             for ind, section in enumerate(sections)
             "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
             values,
         )
+        return sid, vid
+    def new_chunking(self, sid: int, vid: int, size: int, overlap: int = 0, strategy: str = "simple") -> int:
+        """Create a new chunking for a source."""
         self.conn.execute(
             "INSERT INTO chunkings (size, overlap, strategy, source, version) VALUES (?, ?, ?, ?, ?)",
             (size, overlap, strategy, sid, vid),
         (id,) = (id for id, in cur)
         return id
+    def add_chunking(self, sid: int, vid: int, size: int, sections: Iterable[Iterable[Chunk]]) -> int:
+        """Create a new chunking for a source, filled with chunks organized by section."""
+        cid = self.new_chunking(sid, vid, size)
         chunks = ((ind, jnd, chunk) for ind, section in enumerate(sections) for jnd, chunk in enumerate(section))
         values = ((sid, vid, ind, cid, jnd, chunk.content, chunk.n_tokens, chunk.emb) for ind, jnd, chunk in chunks)
         self.conn.executemany(
             "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
             values,
         )
+        return cid
+    def add(self, source: str, df: pd.DataFrame):
+        """Write all documents from the dataframe into the db as a new version."""
+        data = sorted(df.itertuples(), key=lambda chunk: (chunk.url, chunk.title))
+        sections = []
+        size = None
+        for (url, title), chunks in itertools.groupby(data, lambda chunk: (chunk.url, chunk.title)):
+            chunks = [Chunk(chunk.content, chunk.n_tokens, chunk.embedding) for chunk in chunks]
+            _size = max(len(chunk.content) for chunk in chunks)
+            size = max(_size, size or 0)
+            content = "".join(chunk.content for chunk in chunks)
+            sections.append((Section(title, url, content), chunks))
+        sid, vid = self.add_parse(source, (section for section, _ in sections))
+        self.add_chunking(sid, vid, size, (chunks for _, chunks in sections))
     def get_documents(self, source: str) -> pd.DataFrame:
         """Get all current documents from a given source."""
         # Execute the SQL statement and fetch the results
+        results = self.conn.execute("SELECT * FROM documents WHERE source = ?", (source,))
         rows = results.fetchall()
+        print(rows[0])
         # Convert the results to a pandas DataFrame
         df = pd.DataFrame(rows, columns=[description[0] for description in results.description])

buster/{db → documents/sqlite}/schema.py RENAMED Viewed

@@ -1,9 +1,7 @@
 import zlib
 import numpy as np
-import sqlite3
 SOURCE_TABLE = r"""CREATE TABLE IF NOT EXISTS sources (
     id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -131,5 +129,5 @@ def cosine_similarity(a: bytes, b: bytes) -> float:
 def setup_db(connection: sqlite3.Connection):
     sqlite3.register_adapter(np.ndarray, adapt_vector)
-    sqlite3.register_converter("VECTOR", convert_vector)
     connection.create_function("sim", 2, cosine_similarity, deterministic=True)

+import sqlite3
 import zlib
 import numpy as np
 SOURCE_TABLE = r"""CREATE TABLE IF NOT EXISTS sources (
     id INTEGER PRIMARY KEY AUTOINCREMENT,
 def setup_db(connection: sqlite3.Connection):
     sqlite3.register_adapter(np.ndarray, adapt_vector)
+    sqlite3.register_converter("vector", convert_vector)
     connection.create_function("sim", 2, cosine_similarity, deterministic=True)