Marc-Antoine Rondeau committed
Commit 97aefb5 · 1 Parent(s): 71e7dd8

New db schema

buster/db/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .documents import DocumentsDB
+
+ __all__ = ["DocumentsDB"]
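
The re-export above keeps the package's public surface to the single class; downstream code imports it from the package root. A minimal sketch (the file name is illustrative):

    from buster.db import DocumentsDB

    db = DocumentsDB("documents.db")  # opens (or creates) the SQLite file and initializes the schema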
buster/db/backward.py ADDED
@@ -0,0 +1,108 @@
+ """Import an existing db (old schema) into a new db (new schema)."""
+
+ import argparse
+ import itertools
+ import sqlite3
+ from typing import Iterable, NamedTuple
+
+ import numpy as np
+
+ import buster.db.documents as dest
+ from buster.db import DocumentsDB
+
+ IMPORT_QUERY = (
+     r"""SELECT source, url, title, content FROM documents WHERE current = 1 ORDER BY source, url, title, id"""
+ )
+ CHUNK_QUERY = r"""SELECT source, url, title, content, n_tokens, embedding FROM documents WHERE current = 1 ORDER BY source, url, id"""
+
+
+ class Document(NamedTuple):
+     """Document from the original db."""
+
+     source: str
+     url: str
+     title: str
+     content: str
+
+
+ class Section(NamedTuple):
+     """Section reassembled from the original db's chunks."""
+
+     url: str
+     title: str
+     content: str
+
+
+ class Chunk(NamedTuple):
+     """Chunk from the original db."""
+
+     source: str
+     url: str
+     title: str
+     content: str
+     n_tokens: int
+     embedding: np.ndarray
+
+
+ def get_documents(conn: sqlite3.Connection) -> Iterable[tuple[str, Iterable[Section]]]:
+     """Reassemble documents from the source db's chunks."""
+     documents = (Document(*row) for row in conn.execute(IMPORT_QUERY))
+     by_source = itertools.groupby(documents, lambda doc: doc.source)
+     for source, docs in by_source:
+         by_section = itertools.groupby(docs, lambda doc: (doc.url, doc.title))
+         sections = (
+             Section(url, title, "".join(chunk.content for chunk in chunks)) for (url, title), chunks in by_section
+         )
+         yield source, sections
+
+
+ def get_max_size(conn: sqlite3.Connection) -> int:
+     """Get the maximum chunk size from the source db."""
+     sizes = (size for size, in conn.execute("SELECT max(length(content)) FROM documents"))
+     (size,) = sizes
+     return size
+
+
+ def get_chunks(conn: sqlite3.Connection) -> Iterable[tuple[str, Iterable[Iterable[dest.Chunk]]]]:
+     """Retrieve chunks from the source db, grouped by source, then by section."""
+     chunks = (Chunk(*row) for row in conn.execute(CHUNK_QUERY))
+     by_source = itertools.groupby(chunks, lambda chunk: chunk.source)
+     for source, source_chunks in by_source:
+         by_section = itertools.groupby(source_chunks, lambda chunk: (chunk.url, chunk.title))
+
+         sections = (
+             (dest.Chunk(chunk.content, chunk.n_tokens, chunk.embedding) for chunk in section_chunks)
+             for _, section_chunks in by_section
+         )
+
+         yield source, sections
+
+
+ def main():
+     """Import the source db into the destination db."""
+     parser = argparse.ArgumentParser()
+     parser.add_argument("source")
+     parser.add_argument("destination")
+     parser.add_argument("--size", type=int, default=2000)
+     args = parser.parse_args()
+     org = sqlite3.connect(args.source)
+     db = DocumentsDB(args.destination)
+
+     # Rebuild each source's sections under a fresh version.
+     for source, content in get_documents(org):
+         sid, vid = db.start_version(source)
+         sections = (dest.Section(section.title, section.url, section.content) for section in content)
+         db.add_sections(sid, vid, sections)
+
+     # Re-insert the original chunks under a single chunking per source.
+     size = max(args.size, get_max_size(org))
+     for source, chunks in get_chunks(org):
+         sid, vid = db.get_current_version(source)
+         cid = db.add_chunking(sid, vid, size)
+         db.add_chunks(sid, vid, cid, chunks)
+         db.conn.commit()
+
+
+ if __name__ == "__main__":
+     main()
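
For reference, a minimal invocation sketch (file names are illustrative): running `python buster/db/backward.py old_documents.db new_documents.db --size 2000` rebuilds each source's sections from the old db's `current = 1` rows, then re-inserts its chunks under a single chunking whose declared size is at least the largest original chunk.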
buster/db/documents.py ADDED
@@ -0,0 +1,169 @@
+ import sqlite3
+ import warnings
+ import zlib
+ from typing import Iterable, NamedTuple
+
+ import numpy as np
+ import pandas as pd
+
+ import buster.db.schema as schema
+
+
+ class Section(NamedTuple):
+     title: str
+     url: str
+     content: str
+     parent: int | None = None
+     type: str = "section"
+
+
+ class Chunk(NamedTuple):
+     content: str
+     n_tokens: int
+     emb: np.ndarray
+
+
+ class DocumentsDB:
+     """Simple SQLite database for storing documents and questions/answers.
+
+     The database is just a file on disk. It can store documents from different sources, and it can
+     store multiple versions of the same document (e.g. if the document is updated).
+     Questions/answers refer to the version of the document that was used at the time.
+
+     Example:
+     >>> db = DocumentsDB("/path/to/the/db.db")
+     >>> db.write_documents("source", df)  # df contains the documents from a given source, obtained e.g. via buster.docparser.generate_embeddings
+     >>> df = db.get_documents("source")
+     """
+
+     def __init__(self, db_path: sqlite3.Connection | str):
+         if isinstance(db_path, str):
+             self.db_path = db_path
+             # PARSE_DECLTYPES lets the registered VECTOR converter decompress embeddings on read.
+             self.conn = sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES)
+         else:
+             self.db_path = None
+             self.conn = db_path
+         self.cursor = self.conn.cursor()
+         schema.initialize_db(self.conn)
+         schema.setup_db(self.conn)
+
+     def __del__(self):
+         # Only close connections we opened ourselves.
+         if self.db_path is not None:
+             self.conn.close()
+
+     def get_current_version(self, source: str) -> tuple[int, int]:
+         cur = self.conn.execute("SELECT source, version FROM latest_version WHERE name = ?", (source,))
+         row = cur.fetchone()
+         if row is None:
+             raise KeyError(f'"{source}" is not a known source')
+         sid, vid = row
+         return sid, vid
+
+     def get_source(self, source: str) -> int:
+         cur = self.conn.execute("SELECT id FROM sources WHERE name = ?", (source,))
+         row = cur.fetchone()
+         if row is not None:
+             (sid,) = row
+         else:
+             self.conn.execute("INSERT INTO sources (name) VALUES (?)", (source,))
+             cur = self.conn.execute("SELECT id FROM sources WHERE name = ?", (source,))
+             (sid,) = cur.fetchone()
+
+         return sid
+
+     def start_version(self, source: str) -> tuple[int, int]:
+         cur = self.conn.execute("SELECT source, version FROM latest_version WHERE name = ?", (source,))
+         row = cur.fetchone()
+         if row is None:
+             sid = self.get_source(source)
+             vid = 0
+         else:
+             sid, vid = row
+             vid = vid + 1
+         self.conn.execute("INSERT INTO versions (source, version) VALUES (?, ?)", (sid, vid))
+         return sid, vid
+
+     def add_sections(self, sid: int, vid: int, sections: Iterable[Section]):
+         values = (
+             (sid, vid, ind, section.title, section.url, section.content, section.parent, section.type)
+             for ind, section in enumerate(sections)
+         )
+         self.conn.executemany(
+             "INSERT INTO sections "
+             "(source, version, section, title, url, content, parent, type) "
+             "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
+             values,
+         )
+
+     def add_chunking(self, sid: int, vid: int, size: int, overlap: int = 0, strategy: str = "simple") -> int:
+         self.conn.execute(
+             "INSERT INTO chunkings (size, overlap, strategy, source, version) VALUES (?, ?, ?, ?, ?)",
+             (size, overlap, strategy, sid, vid),
+         )
+         cur = self.conn.execute(
+             "SELECT chunking FROM chunkings "
+             "WHERE size = ? AND overlap = ? AND strategy = ? AND source = ? AND version = ?",
+             (size, overlap, strategy, sid, vid),
+         )
+         (cid,) = (cid for cid, in cur)
+         return cid
+
+     def add_chunks(self, sid: int, vid: int, cid: int, sections: Iterable[Iterable[Chunk]]):
+         chunks = ((ind, jnd, chunk) for ind, section in enumerate(sections) for jnd, chunk in enumerate(section))
+         values = ((sid, vid, ind, cid, jnd, chunk.content, chunk.n_tokens, chunk.emb) for ind, jnd, chunk in chunks)
+         self.conn.executemany(
+             "INSERT INTO chunks "
+             "(source, version, section, chunking, sequence, content, n_tokens, embedding) "
+             "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
+             values,
+         )
+
+     def write_documents(self, source: str, df: pd.DataFrame):
+         """Write all documents from the dataframe into the db. All previous documents from that source are set to `current = 0`."""
+         df = df.copy()
+
+         # Prepare the rows
+         df["source"] = source
+         df["current"] = 1
+         columns = ["source", "title", "url", "content", "current"]
+         if "embedding" in df.columns:
+             columns.extend(["n_tokens", "embedding"])
+
+             # Check that the embeddings are float32
+             if not df["embedding"].iloc[0].dtype == np.float32:
+                 warnings.warn(
+                     f"Embeddings are not float32, converting them to float32 from {df['embedding'].iloc[0].dtype}.",
+                     RuntimeWarning,
+                 )
+                 df["embedding"] = df["embedding"].apply(lambda x: x.astype(np.float32))
+
+             # ZLIB compress the embeddings
+             df["embedding"] = df["embedding"].apply(lambda x: sqlite3.Binary(zlib.compress(x.tobytes())))
+
+         data = df[columns].values.tolist()
+
+         # Set `current` to 0 for all previous documents from that source
+         self.cursor.execute("UPDATE documents SET current = 0 WHERE source = ?", (source,))
+
+         # Insert the new documents
+         insert_statement = f"INSERT INTO documents ({', '.join(columns)}) VALUES ({', '.join(['?'] * len(columns))})"
+         self.cursor.executemany(insert_statement, data)
+
+         self.conn.commit()
+
+     def get_documents(self, source: str) -> pd.DataFrame:
+         """Get all documents from a given source."""
+         # Execute the SQL statement and fetch the results
+         results = self.cursor.execute("SELECT * FROM documents WHERE source = ?", (source,))
+         rows = results.fetchall()
+
+         # Convert the results to a pandas DataFrame
+         df = pd.DataFrame(rows, columns=[description[0] for description in results.description])
+         return df
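
Beyond the `write_documents`/`get_documents` pair, the versioned API above can be driven directly. A minimal sketch, assuming an in-memory database and a single one-chunk section (the source name, URL, text, and embedding are all illustrative):

    import numpy as np
    from buster.db import DocumentsDB
    from buster.db.documents import Chunk, Section

    db = DocumentsDB(":memory:")
    sid, vid = db.start_version("docs")  # open a new version for this source
    db.add_sections(sid, vid, [Section("Intro", "https://example.org/intro", "Hello world")])
    cid = db.add_chunking(sid, vid, size=2000)  # register the chunking parameters
    emb = np.zeros(4, dtype=np.float32)  # toy embedding; the adapter compresses it on insert
    db.add_chunks(sid, vid, cid, [[Chunk("Hello world", 2, emb)]])
    db.conn.commit()

Note that `add_chunks` expects one inner iterable of chunks per section, in the same order the sections were added, since the section index is derived by enumeration.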
buster/db/schema.py ADDED
@@ -0,0 +1,135 @@
+ import sqlite3
+ import zlib
+
+ import numpy as np
+
+ SOURCE_TABLE = r"""CREATE TABLE IF NOT EXISTS sources (
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+     name TEXT NOT NULL,
+     note TEXT,
+     UNIQUE(name)
+ )"""
+
+ VERSION_TABLE = r"""CREATE TABLE IF NOT EXISTS versions (
+     source INTEGER,
+     version INTEGER,
+     parser TEXT,
+     note TEXT,
+     PRIMARY KEY (version, source, parser),
+     FOREIGN KEY (source) REFERENCES sources (id)
+ )"""
+
+ CHUNKING_TABLE = r"""CREATE TABLE IF NOT EXISTS chunkings (
+     chunking INTEGER PRIMARY KEY AUTOINCREMENT,
+     size INTEGER,
+     overlap INTEGER,
+     strategy TEXT,
+     chunker TEXT,
+     source INTEGER,
+     version INTEGER,
+     UNIQUE (size, overlap, strategy, chunker, source, version),
+     FOREIGN KEY (source, version) REFERENCES versions (source, version)
+ )"""
+
+ SECTION_TABLE = r"""CREATE TABLE IF NOT EXISTS sections (
+     source INTEGER,
+     version INTEGER,
+     section INTEGER,
+     title TEXT NOT NULL,
+     url TEXT NOT NULL,
+     content TEXT NOT NULL,
+     parent INTEGER,
+     type TEXT,
+     PRIMARY KEY (version, source, section),
+     FOREIGN KEY (source, version) REFERENCES versions (source, version)
+ )"""
+
+ CHUNK_TABLE = r"""CREATE TABLE IF NOT EXISTS chunks (
+     source INTEGER,
+     version INTEGER,
+     section INTEGER,
+     chunking INTEGER,
+     sequence INTEGER,
+     content TEXT NOT NULL,
+     n_tokens INTEGER,
+     embedding VECTOR,
+     PRIMARY KEY (source, version, section, chunking, sequence),
+     FOREIGN KEY (source, version, section) REFERENCES sections (source, version, section),
+     FOREIGN KEY (source, version, chunking) REFERENCES chunkings (source, version, chunking)
+ )"""
+
+ VERSION_VIEW = r"""CREATE VIEW IF NOT EXISTS latest_version (
+     name, source, version) AS
+     SELECT sources.name, versions.source, max(versions.version)
+     FROM sources INNER JOIN versions ON sources.id = versions.source
+     GROUP BY sources.id
+ """
+
+ CHUNKING_VIEW = r"""CREATE VIEW IF NOT EXISTS latest_chunking (
+     name, source, version, chunking) AS
+     SELECT name, source, version, max(chunking) FROM
+     chunkings INNER JOIN latest_version USING (source, version)
+     GROUP BY source, version
+ """
+
+ DOCUMENT_VIEW = r"""CREATE VIEW IF NOT EXISTS documents (
+     source, title, url, content, n_tokens, embedding)
+     AS SELECT latest_chunking.name, sections.title, sections.url,
+     chunks.content, chunks.n_tokens, chunks.embedding
+     FROM chunks INNER JOIN sections USING (source, version, section)
+     INNER JOIN latest_chunking USING (source, version, chunking)
+ """
+
+ INIT_STATEMENTS = [
+     SOURCE_TABLE,
+     VERSION_TABLE,
+     CHUNKING_TABLE,
+     SECTION_TABLE,
+     CHUNK_TABLE,
+     VERSION_VIEW,
+     CHUNKING_VIEW,
+     DOCUMENT_VIEW,
+ ]
+
+
+ def initialize_db(connection: sqlite3.Connection):
+     for statement in INIT_STATEMENTS:
+         try:
+             connection.execute(statement)
+         except sqlite3.Error:
+             connection.rollback()
+             raise
+     connection.commit()
+     return connection
+
+
+ def adapt_vector(vector: np.ndarray) -> bytes:
+     """Adapt a numpy array to a zlib-compressed float32 blob for storage."""
+     return sqlite3.Binary(zlib.compress(vector.astype(np.float32).tobytes()))
+
+
+ def convert_vector(buffer: bytes) -> np.ndarray:
+     """Convert a stored blob back into a float32 numpy array."""
+     return np.frombuffer(zlib.decompress(buffer), dtype=np.float32)
+
+
+ def cosine_similarity(a: bytes, b: bytes) -> float:
+     a = convert_vector(a)
+     b = convert_vector(b)
+     a = a / np.linalg.norm(a)
+     b = b / np.linalg.norm(b)
+     # Rescale the cosine from [-1, 1] to [0, 1].
+     dot = 0.5 * np.dot(a, b) + 0.5
+     return float(dot)
+
+
+ def setup_db(connection: sqlite3.Connection):
+     sqlite3.register_adapter(np.ndarray, adapt_vector)
+     sqlite3.register_converter("VECTOR", convert_vector)
+     connection.create_function("sim", 2, cosine_similarity, deterministic=True)
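
Since `setup_db` registers `sim` as a deterministic SQL function over two stored blobs, similarity search can run inside SQLite itself. Because `cosine_similarity` rescales the cosine into [0, 1], a score of 1.0 means identical direction and 0.5 means orthogonal vectors. A minimal query sketch, assuming a populated `documents.db` whose stored embeddings have the same dimension as the query (the file name and dimension are illustrative):

    import numpy as np
    import sqlite3
    from buster.db.schema import adapt_vector, initialize_db, setup_db

    conn = sqlite3.connect("documents.db")
    initialize_db(conn)
    setup_db(conn)

    # Compress the query vector the same way embeddings are stored.
    query = adapt_vector(np.random.rand(4).astype(np.float32))
    rows = conn.execute(
        "SELECT title, url, sim(embedding, ?) AS score FROM documents ORDER BY score DESC LIMIT 5",
        (query,),
    ).fetchall()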