marondeau committed on
Commit
fde3910
·
unverified ·
2 Parent(s): 1696c32 a3c0809

Merge pull request #59 from marondeau/schema

Browse files

New schema for the database, including structure information.

buster/documents/sqlite.py DELETED
@@ -1,122 +0,0 @@
1
- import sqlite3
2
- import warnings
3
- import zlib
4
-
5
- import numpy as np
6
- import pandas as pd
7
-
8
- from buster.documents.base import DocumentsManager
9
-
10
# DDL for the documents table: one row per stored document chunk. `current`
# is a soft flag — 1 marks rows from the latest ingestion of a source, 0 older
# generations (see DocumentsDB.add, which demotes previous rows).
documents_table = """CREATE TABLE IF NOT EXISTS documents (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source TEXT NOT NULL,
    title TEXT NOT NULL,
    url TEXT NOT NULL,
    content TEXT NOT NULL,
    n_tokens INTEGER,
    embedding BLOB,
    current INTEGER
)"""

# DDL for question/answer feedback. Each Q/A row keeps up to three references
# to the document rows used to produce the answer, plus human labels and a
# train/test split flag (`testset`).
qa_table = """CREATE TABLE IF NOT EXISTS qa (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source TEXT NOT NULL,
    prompt TEXT NOT NULL,
    answer TEXT NOT NULL,
    document_id_1 INTEGER,
    document_id_2 INTEGER,
    document_id_3 INTEGER,
    label_question INTEGER,
    label_answer INTEGER,
    testset INTEGER,
    FOREIGN KEY (document_id_1) REFERENCES documents (id),
    FOREIGN KEY (document_id_2) REFERENCES documents (id),
    FOREIGN KEY (document_id_3) REFERENCES documents (id)
)"""
36
-
37
-
38
class DocumentsDB(DocumentsManager):
    """Simple SQLite database for storing documents and questions/answers.

    The database is just a file on disk. It can store documents from different
    sources, and it can store multiple versions of the same document (e.g. if
    the document is updated). Questions/answers refer to the version of the
    document that was used at the time.

    Example:
        >>> db = DocumentsDB("/path/to/the/db.db")
        >>> db.add("source", df)  # df is a DataFrame containing the documents from a given source, obtained e.g. by using buster.docparser.generate_embeddings
        >>> df = db.get_documents("source")
    """

    def __init__(self, filepath: str):
        self.db_path = filepath
        self.conn = sqlite3.connect(filepath)
        self.cursor = self.conn.cursor()

        self.__initialize()

    def __del__(self):
        # Best-effort cleanup when the wrapper is garbage collected.
        self.conn.close()

    def __initialize(self):
        """Create the tables if they do not exist yet."""
        self.cursor.execute(documents_table)
        self.cursor.execute(qa_table)
        self.conn.commit()

    def add(self, source: str, df: pd.DataFrame):
        """Write all documents from the dataframe into the db. All previous documents from that source will be set to `current = 0`."""
        df = df.copy()

        # Prepare the rows
        df["source"] = source
        df["current"] = 1
        columns = ["source", "title", "url", "content", "current"]
        if "embedding" in df.columns:
            columns.extend(
                [
                    "n_tokens",
                    "embedding",
                ]
            )

            # FIX: the dtype check and compression must only run when an
            # embedding column exists — previously they executed
            # unconditionally and raised KeyError for embedding-less frames.
            if not df["embedding"].iloc[0].dtype == np.float32:
                warnings.warn(
                    f"Embeddings are not float32, converting them to float32 from {df['embedding'].iloc[0].dtype}.",
                    RuntimeWarning,
                )
                df["embedding"] = df["embedding"].apply(lambda x: x.astype(np.float32))

            # ZLIB compress the embeddings
            df["embedding"] = df["embedding"].apply(lambda x: sqlite3.Binary(zlib.compress(x.tobytes())))

        data = df[columns].values.tolist()

        # Set `current` to 0 for all previous documents from that source
        self.cursor.execute("UPDATE documents SET current = 0 WHERE source = ?", (source,))

        # Insert the new documents
        insert_statement = f"INSERT INTO documents ({', '.join(columns)}) VALUES ({', '.join(['?']*len(columns))})"
        self.cursor.executemany(insert_statement, data)

        self.conn.commit()

    def get_documents(self, source: str) -> pd.DataFrame:
        """Get all current documents from a given source."""
        # Execute the SQL statement and fetch the results
        if source is not None:
            results = self.cursor.execute("SELECT * FROM documents WHERE source = ? AND current = 1", (source,))
        else:
            results = self.cursor.execute("SELECT * FROM documents WHERE current = 1")
        rows = results.fetchall()

        # Convert the results to a pandas DataFrame
        df = pd.DataFrame(rows, columns=[description[0] for description in results.description])

        # ZLIB decompress the embeddings; rows stored without an embedding
        # have NULL in that column, so skip them instead of crashing.
        df["embedding"] = df["embedding"].apply(
            lambda x: np.frombuffer(zlib.decompress(x), dtype=np.float32).tolist() if x is not None else None
        )

        # Drop the `current` column
        df.drop(columns=["current"], inplace=True)

        return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
buster/documents/sqlite/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
from .documents import DocumentsDB

# Names exported by `from buster.documents.sqlite import *`.
# __all__ must be a list of *strings*, not the objects themselves.
__all__ = ["DocumentsDB"]
buster/documents/sqlite/backward.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Used to import existing DB as a new DB."""
2
+
3
+ import argparse
4
+ import itertools
5
+ import sqlite3
6
+ from typing import Iterable, NamedTuple
7
+
8
+ import numpy as np
9
+
10
+ import buster.documents.sqlite.documents as dest
11
+ from buster.documents.sqlite import DocumentsDB
12
+
13
# Pull the latest ("current") rows from the legacy single-table schema,
# ordered so itertools.groupby can reassemble documents source by source.
IMPORT_QUERY = (
    r"""SELECT source, url, title, content FROM documents WHERE current = 1 ORDER BY source, url, title, id"""
)
# Same ordering idea, but keeping the chunk-level fields (n_tokens, embedding).
CHUNK_QUERY = r"""SELECT source, url, title, content, n_tokens, embedding FROM documents WHERE current = 1 ORDER BY source, url, id"""
17
+
18
+
19
# Row shape of the legacy db, written in the functional NamedTuple form.
Document = NamedTuple(
    "Document",
    [("source", str), ("url", str), ("title", str), ("content", str)],
)
Document.__doc__ = """Document from the original db."""
27
+
28
# A document section reassembled from the legacy db's chunks.
Section = NamedTuple(
    "Section",
    [("url", str), ("title", str), ("content", str)],
)
Section.__doc__ = """Reassemble section from the original db."""
34
+
35
+
36
# One chunk row from the legacy db, including its token count and embedding.
Chunk = NamedTuple(
    "Chunk",
    [
        ("source", str),
        ("url", str),
        ("title", str),
        ("content", str),
        ("n_tokens", int),
        ("embedding", np.ndarray),
    ],
)
Chunk.__doc__ = """Chunk from the original db."""
45
+
46
+
47
def get_documents(conn: sqlite3.Connection) -> Iterable[tuple[str, Iterable[Section]]]:
    """Reassemble full documents out of the per-chunk rows of the source db."""
    rows = (Document(*fields) for fields in conn.execute(IMPORT_QUERY))
    # Rows arrive ordered by (source, url, title), so consecutive grouping
    # is safe; everything stays lazy and must be consumed in order.
    for source, docs in itertools.groupby(rows, lambda doc: doc.source):
        grouped = itertools.groupby(docs, lambda doc: (doc.url, doc.title))
        sections = (
            Section(url, title, "".join(part.content for part in parts))
            for (url, title), parts in grouped
        )
        yield source, sections
57
+
58
+
59
def get_max_size(conn: sqlite3.Connection) -> int:
    """Return the length of the longest chunk content in the source db."""
    # The aggregate query yields exactly one row with one column.
    [(max_size,)] = conn.execute("select max(length(content)) FROM documents")
    return max_size
64
+
65
+
66
def get_chunks(conn: sqlite3.Connection) -> Iterable[tuple[str, Iterable[Iterable[dest.Chunk]]]]:
    """Retrieve chunks from the source db.

    Yields (source, sections) pairs where `sections` lazily yields, per
    (url, title) section, a generator of destination-schema chunks.

    NOTE: everything here is built on itertools.groupby, whose groups share
    one underlying cursor — consumers must exhaust each group in order before
    advancing to the next, or chunks will be silently skipped.
    """
    chunks = (Chunk(*row) for row in conn.execute(CHUNK_QUERY))
    # CHUNK_QUERY orders rows by (source, url, id), so consecutive grouping
    # by source, then by (url, title), reconstructs the original sections.
    by_sources = itertools.groupby(chunks, lambda chunk: chunk.source)
    for source, chunks in by_sources:
        by_section = itertools.groupby(chunks, lambda chunk: (chunk.url, chunk.title))

        # Re-shape each legacy row into the destination Chunk (content,
        # n_tokens, embedding) — lazily, one section at a time.
        sections = (
            (dest.Chunk(chunk.content, chunk.n_tokens, chunk.embedding) for chunk in chunks)
            for _, chunks in by_section
        )

        yield source, sections
78
+
79
+
80
def main():
    """Import the source db into the destination db.

    Usage: backward.py SOURCE DESTINATION [--size N]

    SOURCE is a legacy single-table db; DESTINATION receives the new schema.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source")
    parser.add_argument("destination")
    parser.add_argument("--size", type=int, default=2000)
    args = parser.parse_args()
    org = sqlite3.connect(args.source)
    db = DocumentsDB(args.destination)

    # First pass: recreate one parsed version per source from the
    # reassembled documents.
    for source, content in get_documents(org):
        sections = (dest.Section(section.title, section.url, section.content) for section in content)
        db.add_parse(source, sections)

    # Second pass: attach the original chunks to the freshly created
    # versions. The recorded chunking size is at least --size, but never
    # smaller than the longest existing chunk.
    size = max(args.size, get_max_size(org))
    for source, chunks in get_chunks(org):
        sid, vid = db.get_current_version(source)
        db.add_chunking(sid, vid, size, chunks)
        db.conn.commit()


if __name__ == "__main__":
    main()
buster/documents/sqlite/documents.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import sqlite3
3
+ import warnings
4
+ import zlib
5
+ from pathlib import Path
6
+ from typing import Iterable, NamedTuple
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+ import buster.documents.sqlite.schema as schema
12
+ from buster.documents.base import DocumentsManager
13
+
14
+
15
class Section(NamedTuple):
    """A parsed document section, one row of the `sections` table."""

    title: str
    url: str
    content: str
    # Index of the parent section within the same version, when sections form
    # a hierarchy; None for top-level sections.
    parent: int | None = None
    # Section kind tag, stored in the `type` column.
    type: str = "section"
21
+
22
+
23
# One embeddable chunk of a section, in the functional NamedTuple form.
Chunk = NamedTuple(
    "Chunk",
    [("content", str), ("n_tokens", int), ("emb", np.ndarray)],
)
27
+
28
+
29
class DocumentsDB(DocumentsManager):
    """SQLite-backed store for versioned documents, their sections and chunks.

    The database is just a file on disk (or an externally managed connection).
    It can store documents from different sources, and it can store multiple
    versions of the same document (e.g. if the document is updated).
    Questions/answers refer to the version of the document that was used at
    the time.

    Example:
        >>> db = DocumentsDB("/path/to/the/db.db")
        >>> db.add("source", df)  # df is a DataFrame containing the documents from a given source, obtained e.g. by using buster.docparser.generate_embeddings
        >>> df = db.get_documents("source")
    """

    def __init__(self, db_path: sqlite3.Connection | str | Path):
        # Accept either a path (we own the connection and close it in __del__)
        # or an already-open connection (the caller keeps ownership).
        # PARSE_DECLTYPES lets the VECTOR column converter registered by
        # schema.setup_db kick in.
        if isinstance(db_path, (str, Path)):
            self.db_path = db_path
            self.conn = sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES)
        else:
            self.db_path = None
            self.conn = db_path
        schema.initialize_db(self.conn)
        schema.setup_db(self.conn)

    def __del__(self):
        # Only close connections we created ourselves; getattr guards against
        # a partially-run __init__ (e.g. sqlite3.connect raised).
        if getattr(self, "db_path", None) is not None:
            self.conn.close()

    def get_current_version(self, source: str) -> tuple[int, int]:
        """Return (source id, latest version id) for `source`.

        Raises:
            KeyError: if `source` does not exist in the db.
        """
        cur = self.conn.execute("SELECT source, version FROM latest_version WHERE name = ?", (source,))
        row = cur.fetchone()
        if row is None:
            raise KeyError(f'"{source}" is not a known source')
        sid, vid = row
        return sid, vid

    def get_source(self, source: str) -> int:
        """Return the id of `source`, creating the source row if needed."""
        cur = self.conn.execute("SELECT id FROM sources WHERE name = ?", (source,))
        row = cur.fetchone()
        if row is not None:
            (sid,) = row
        else:
            # `sources.id` is an AUTOINCREMENT primary key, so the cursor's
            # lastrowid is the id of the row just inserted — no need to
            # re-SELECT it.
            cur = self.conn.execute("INSERT INTO sources (name) VALUES (?)", (source,))
            sid = cur.lastrowid

        return sid

    def new_version(self, source: str) -> tuple[int, int]:
        """Create a new version for a source and return (source id, version id)."""
        cur = self.conn.execute("SELECT source, version FROM latest_version WHERE name = ?", (source,))
        row = cur.fetchone()
        if row is None:
            # First version ever recorded for this source.
            sid = self.get_source(source)
            vid = 0
        else:
            sid, vid = row
            vid = vid + 1
        self.conn.execute("INSERT INTO versions (source, version) VALUES (?, ?)", (sid, vid))
        return sid, vid

    def add_parse(self, source: str, sections: Iterable[Section]) -> tuple[int, int]:
        """Create a new version of a source filled with parsed sections."""
        sid, vid = self.new_version(source)
        values = (
            (sid, vid, ind, section.title, section.url, section.content, section.parent, section.type)
            for ind, section in enumerate(sections)
        )
        self.conn.executemany(
            "INSERT INTO sections "
            "(source, version, section, title, url, content, parent, type) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            values,
        )
        return sid, vid

    def new_chunking(self, sid: int, vid: int, size: int, overlap: int = 0, strategy: str = "simple") -> int:
        """Create a new chunking for a version and return its id."""
        self.conn.execute(
            "INSERT INTO chunkings (size, overlap, strategy, source, version) VALUES (?, ?, ?, ?, ?)",
            (size, overlap, strategy, sid, vid),
        )
        cur = self.conn.execute(
            "SELECT chunking FROM chunkings "
            "WHERE size = ? AND overlap = ? AND strategy = ? AND source = ? AND version = ?",
            (size, overlap, strategy, sid, vid),
        )
        # The (size, overlap, strategy, ..., version) tuple is UNIQUE, so
        # exactly one row matches. `cid` avoids shadowing the builtin `id`.
        (cid,) = (chunking for chunking, in cur)
        return cid

    def add_chunking(self, sid: int, vid: int, size: int, sections: Iterable[Iterable[Chunk]]) -> int:
        """Create a new chunking for a source, filled with chunks organized by section."""
        cid = self.new_chunking(sid, vid, size)
        chunks = ((ind, jnd, chunk) for ind, section in enumerate(sections) for jnd, chunk in enumerate(section))
        values = ((sid, vid, ind, cid, jnd, chunk.content, chunk.n_tokens, chunk.emb) for ind, jnd, chunk in chunks)
        self.conn.executemany(
            "INSERT INTO chunks "
            "(source, version, section, chunking, sequence, content, n_tokens, embedding) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            values,
        )
        return cid

    def add(self, source: str, df: pd.DataFrame):
        """Write all documents from the dataframe into the db as a new version."""
        # Group the dataframe's rows into sections keyed by (url, title);
        # groupby requires the rows sorted by that same key.
        data = sorted(df.itertuples(), key=lambda chunk: (chunk.url, chunk.title))
        sections = []
        size = 0
        for (url, title), chunks in itertools.groupby(data, lambda chunk: (chunk.url, chunk.title)):
            chunks = [Chunk(chunk.content, chunk.n_tokens, chunk.embedding) for chunk in chunks]
            size = max(size, max(len(chunk.content) for chunk in chunks))
            content = "".join(chunk.content for chunk in chunks)
            sections.append((Section(title, url, content), chunks))

        sid, vid = self.add_parse(source, (section for section, _ in sections))
        self.add_chunking(sid, vid, size, (chunks for _, chunks in sections))

    def get_documents(self, source: str) -> pd.DataFrame:
        """Get all current documents from a given source as a DataFrame."""
        # The `documents` view already restricts rows to the latest chunking.
        results = self.conn.execute("SELECT * FROM documents WHERE source = ?", (source,))
        rows = results.fetchall()

        # Column names come from the cursor description of the view.
        df = pd.DataFrame(rows, columns=[description[0] for description in results.description])
        return df
buster/documents/sqlite/schema.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import zlib
3
+
4
+ import numpy as np
5
+
6
# Source of documents (e.g. one library's docs); names are unique.
SOURCE_TABLE = r"""CREATE TABLE IF NOT EXISTS sources (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT NOT NULL,
    note TEXT,
    UNIQUE(name)
)"""


# One row per (version, source, parser) ingestion of a source.
# FIX: the comma between the PRIMARY KEY and FOREIGN KEY table constraints
# was missing — it only parsed because SQLite tolerates an omitted comma
# between table constraints, which is undocumented and non-portable.
VERSION_TABLE = r"""CREATE TABLE IF NOT EXISTS versions (
    source INTEGER,
    version INTEGER,
    parser TEXT,
    note TEXT,
    PRIMARY KEY (version, source, parser),
    FOREIGN KEY (source) REFERENCES sources (id)
)"""


# A chunking is one way of splitting a version into chunks (size/overlap/strategy).
CHUNKING_TABLE = r"""CREATE TABLE IF NOT EXISTS chunkings (
    chunking INTEGER PRIMARY KEY AUTOINCREMENT,
    size INTEGER,
    overlap INTEGER,
    strategy TEXT,
    chunker TEXT,
    source INTEGER,
    version INTEGER,
    UNIQUE (size, overlap, strategy, chunker, source, version),
    FOREIGN KEY (source, version) REFERENCES versions (source, version)
)"""


# Parsed sections of one version of a source document.
SECTION_TABLE = r"""CREATE TABLE IF NOT EXISTS sections (
    source INTEGER,
    version INTEGER,
    section INTEGER,
    title TEXT NOT NULL,
    url TEXT NOT NULL,
    content TEXT NOT NULL,
    parent INTEGER,
    type TEXT,
    PRIMARY KEY (version, source, section),
    FOREIGN KEY (source) REFERENCES versions (source),
    FOREIGN KEY (version) REFERENCES versions (version)
)"""


# Chunks of a section under a given chunking. `embedding` uses the custom
# VECTOR declared type handled by the adapter/converter below.
# NOTE(review): the second FOREIGN KEY references chunkings(source, version,
# chunking), which is not the parent table's primary key — confirm intended.
CHUNK_TABLE = r"""CREATE TABLE IF NOT EXISTS chunks (
    source INTEGER,
    version INTEGER,
    section INTEGER,
    chunking INTEGER,
    sequence INTEGER,
    content TEXT NOT NULL,
    n_tokens INTEGER,
    embedding VECTOR,
    PRIMARY KEY (source, version, section, chunking, sequence),
    FOREIGN KEY (source, version, section) REFERENCES sections (source, version, section),
    FOREIGN KEY (source, version, chunking) REFERENCES chunkings (source, version, chunking)
)"""


# Latest version per source.
VERSION_VIEW = r"""CREATE VIEW IF NOT EXISTS latest_version (
    name, source, version) AS
    SELECT sources.name, versions.source, max(versions.version)
    FROM sources INNER JOIN versions on sources.id = versions.source
    GROUP BY sources.id
"""

# Latest chunking of the latest version per source.
CHUNKING_VIEW = r"""CREATE VIEW IF NOT EXISTS latest_chunking (
    name, source, version, chunking) AS
    SELECT name, source, version, max(chunking) FROM
    chunkings INNER JOIN latest_version USING (source, version)
    GROUP by source, version
"""

# Flat, legacy-shaped view of the current chunks of every source.
DOCUMENT_VIEW = r"""CREATE VIEW IF NOT EXISTS documents (
    source, title, url, content, n_tokens, embedding)
    AS SELECT latest_chunking.name, sections.title, sections.url,
    chunks.content, chunks.n_tokens, chunks.embedding
    FROM chunks INNER JOIN sections USING (source, version, section)
    INNER JOIN latest_chunking USING (source, version, chunking)
"""


# Executed in order by initialize_db: tables first, then the views built on them.
INIT_STATEMENTS = [
    SOURCE_TABLE,
    VERSION_TABLE,
    CHUNKING_TABLE,
    SECTION_TABLE,
    CHUNK_TABLE,
    VERSION_VIEW,
    CHUNKING_VIEW,
    DOCUMENT_VIEW,
]
100
+
101
+
102
+ def initialize_db(connection: sqlite3.Connection):
103
+ for statement in INIT_STATEMENTS:
104
+ try:
105
+ connection.execute(statement)
106
+ except sqlite3.Error as error:
107
+ connection.rollback()
108
+ raise
109
+ connection.commit()
110
+ return connection
111
+
112
+
113
def adapt_vector(vector: np.ndarray) -> bytes:
    """Serialize an array as a zlib-compressed float32 buffer for SQLite."""
    as_float32 = vector.astype(np.float32)
    compressed = zlib.compress(as_float32.tobytes())
    return sqlite3.Binary(compressed)
115
+
116
+
117
def convert_vector(buffer: bytes) -> np.ndarray:
    """Inverse of adapt_vector: decompress a buffer into a float32 array."""
    raw = zlib.decompress(buffer)
    return np.frombuffer(raw, dtype=np.float32)
119
+
120
+
121
def cosine_similarity(a: bytes, b: bytes) -> float:
    """Similarity of two stored vectors, rescaled from [-1, 1] onto [0, 1]."""
    va = convert_vector(a)
    vb = convert_vector(b)
    va = va / np.linalg.norm(va)
    vb = vb / np.linalg.norm(vb)
    # 0.5 * cos + 0.5 maps the cosine of the angle onto [0, 1].
    return float(0.5 * np.dot(va, vb) + 0.5)
128
+
129
+
130
def setup_db(connection: sqlite3.Connection):
    # Register the (process-global) adapter/converter pair so np.ndarray
    # round-trips transparently through VECTOR-declared columns — the
    # converter only fires on connections opened with
    # detect_types=sqlite3.PARSE_DECLTYPES — and expose cosine similarity
    # to SQL as the deterministic two-argument function `sim(a, b)`.
    sqlite3.register_adapter(np.ndarray, adapt_vector)
    sqlite3.register_converter("vector", convert_vector)
    connection.create_function("sim", 2, cosine_similarity, deterministic=True)