Spaces:

anubhav77
/

maya-persistence

Runtime error

File size: 11,152 Bytes

ee9dbc0
9a1d7f1
3e17afd
 
9a1d7f1
3e17afd
 
 
ee9dbc0
775521b
9a1d7f1
 
775521b
 
 
 
 
105ad74
775521b
 
 
 
8d7feb0
c824142
a2651c0
8d7feb0
9a1d7f1
 
775521b
a70d161
 
 
9a1d7f1
 
 
 
 
 
 
 
 
 
 
 
 
a70d161
9a1d7f1
a2651c0
9a1d7f1
775521b
 
9a1d7f1
 
 
775521b
ee4103c
9a1d7f1
 
 
775521b
 
9a1d7f1
3125f56
 
9a1d7f1
775521b
a2651c0
 
775521b
 
 
 
 
9a1d7f1
 
 
 
 
 
 
 
775521b
9a1d7f1
 
 
 
 
 
775521b
105ad74
 
 
 
 
9a1d7f1
 
 
 
775521b
 
 
105ad74
775521b
 
105ad74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
775521b
 
 
 
 
105ad74
 
 
 
 
 
 
 
 
9a1d7f1
 
 
 
 
 
 
 
775521b
 
 
 
 
 
2abd82d
9a1d7f1
775521b
 
9a1d7f1
775521b
9a1d7f1
 
a873366
 
c5c98c5
9a1d7f1
c5c98c5
9a1d7f1
a873366
 
01de2b3
71b8e1d
9a1d7f1
 
71b8e1d
9a1d7f1
 
 
01de2b3
98bff2b
01de2b3
775521b
 
2d5f363
9a1d7f1
2d5f363
9a1d7f1
2d5f363
 
9a1d7f1
775521b
01de2b3
9a1d7f1
775521b
01de2b3
775521b
cde6f0b
9a1d7f1
cde6f0b
9a1d7f1
 
cde6f0b
9a1d7f1
775521b
01de2b3
9a1d7f1
01de2b3
775521b
 
9a1d7f1
c824142
 
 
 
 
 
 
9a1d7f1
775521b
9a1d7f1
775521b
9a1d7f1
775521b
 
9a1d7f1
 
c824142
9a1d7f1
c824142
9a1d7f1
c824142
9a1d7f1
 
bb22bf8
9a1d7f1
 
 
 
 
 
 
 
 
 
 
c824142
 
9a1d7f1
 
bc7d2c2
c738eed
c824142
3125f56
 
c824142
9a1d7f1
 
 
 
 
c824142
 
9a1d7f1
 
c824142
 
775521b
9a1d7f1
 
056b42f
 
9a1d7f1
d79f98f
9a1d7f1
775521b
9a1d7f1
 
ee4103c
9a1d7f1
 
 
 
4caf01e
9a1d7f1
 
ee4103c
9e0fdcc
3125f56
 
 
9a1d7f1
 
8d7feb0

import sys

try:
    # Optional shim: when the pysqlite3 package is installed, register it under
    # the name "sqlite3" so that later `import sqlite3` statements resolve to it.
    # NOTE(review): presumably done so chromadb (imported right after) sees a
    # newer SQLite build than the interpreter's bundled one - confirm.
    import pysqlite3

    sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
except ImportError:
    # pysqlite3 is not installed; keep using the standard-library sqlite3.
    pass
import chromadb
from langchain.vectorstores import Chroma

# from chromadb.api.fastapi import requests
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.chroma import ChromaTranslator
from llm.llmFactory import LLMFactory
from datetime import datetime
import baseInfra.dropbox_handler as dbh
from baseInfra.dbInterface import DbInterface
from uuid import UUID
from langchain.text_splitter import RecursiveCharacterTextSplitter
import logging, asyncio

logger = logging.getLogger("root")


class myChromaTranslator(ChromaTranslator):
    """ChromaTranslator subclass that sets its own operator/comparator lists."""

    # Subset of allowed logical operators.
    allowed_operators = ["$and", "$or"]

    # Comparators this translator declares as allowed, grouped by kind.
    allowed_comparators = [
        # equality
        "$eq",
        "$ne",
        # ordering
        "$gt",
        "$gte",
        "$lt",
        "$lte",
        # substring tests
        "$contains",
        "$not_contains",
        # membership tests
        "$in",
        "$nin",
    ]


class ChromaIntf:
    """Persistence facade around a Chroma vector store and a self-query retriever.

    On construction it restores the persisted folders through ``dbh``, opens the
    Chroma store under ``persist_db_directory`` and builds a
    ``SelfQueryRetriever`` driven by the LLM returned from ``LLMFactory``.
    """

    def __init__(self):
        """Restore persisted folders, open the vector store and build the retriever."""
        self.db_interface = DbInterface()

        model_name = "BAAI/bge-large-en-v1.5"
        encode_kwargs = {
            "normalize_embeddings": True
        }  # set True to compute cosine similarity

        self.embedding = HuggingFaceBgeEmbeddings(
            model_name=model_name,
            model_kwargs={"device": "cpu"},
            encode_kwargs=encode_kwargs,
        )

        self.persist_db_directory = "db"
        self.persist_docs_directory = "persistence-docs"
        self.logger_file = "persistence.log"

        # Restore each folder independently so that a failure on the first one
        # (e.g. it does not exist remotely yet) does not skip the second one.
        loop = asyncio.get_event_loop()
        for folder in (self.persist_db_directory, self.persist_docs_directory):
            try:
                loop.run_until_complete(dbh.restoreFolder(folder))
            except Exception:
                logger.warning("Could not restore folder %s", folder, exc_info=True)
                print("Probably folder doesn't exist as it is brand new setup")

        # Seed document used to create/open the collection.
        # NOTE(review): no explicit ids are passed to from_documents, so this
        # document is presumably added again on every start - confirm whether
        # that is intended.
        docs = [
            Document(
                page_content="this is test doc",
                metadata={
                    "timestamp": 1696743148.474055,
                    "ID": "2000-01-01 15:57:11::664165-test",
                    "source": "test",
                },
                id="2000-01-01 15:57:11::664165-test",
            ),
        ]

        self.vectorstore = Chroma.from_documents(
            documents=docs,
            embedding=self.embedding,
            persist_directory=self.persist_db_directory,
        )

        # Metadata stored with every entry (see addText):
        #   timestamp --> time when added
        #   source    --> notes/references/web/youtube/book/conversation,
        #                 default conversation
        #   title     --> of document, "conversation" when source is conversation,
        #                 default blank
        #   author    --> defaults to blank
        #   Year / Month / Day / Hour / Minute --> integers derived from timestamp,
        #                 e.g. 2024 / 1 / 3 / 11 / 29
        self.metadata_field_info = [
            AttributeInfo(
                name="timestamp",
                description="Python datetime.timestamp of the document in isoformat, should not be used for query",
                type="str",
            ),
            AttributeInfo(
                name="Year",
                description="Year from the date when the entry was added in YYYY format",
                type="int",
            ),
            AttributeInfo(
                name="Month",
                description="Month from the date when the entry was added it is from 1-12",
                type="int",
            ),
            AttributeInfo(
                name="Day",
                description="Day of month from the date-time stamp  when the entry was added, it is from 1-31",
                type="int",
            ),
            AttributeInfo(
                name="Hour",
                description="Hour from the timestamp when the entry was added",
                type="int",
            ),
            AttributeInfo(
                name="Minute",
                description="Minute from the timestamp when the entry was added",
                type="int",
            ),
            AttributeInfo(
                name="source",
                description="Type of entry",
                type="string or list[string]",
            ),
            AttributeInfo(
                name="title",
                description="Title or Subject of the entry",
                type="string",
            ),
            AttributeInfo(
                name="author",
                description="Author of the entry",
                type="string",
            ),
        ]
        self.document_content_description = (
            "Information to store for retrival from LLM based chatbot"
        )
        lf = LLMFactory()
        self.llm = lf.get_llm("executor3")

        # NOTE(review): the stock ChromaTranslator is used here; the
        # myChromaTranslator subclass defined in this module is not referenced.
        self.retriever = SelfQueryRetriever.from_llm(
            self.llm,
            self.vectorstore,
            self.document_content_description,
            self.metadata_field_info,
            structured_query_translator=ChromaTranslator(),
            verbose=True,
        )

    @staticmethod
    def _format_item(item, exclude_meta: bool = True) -> str:
        """Render one retrieved item (Document-like object or dict) as text."""
        if isinstance(item, dict):
            content = item["page_content"]
            meta = item.get("metadata") or {}
        else:
            content = item.page_content
            meta = getattr(item, "metadata", None) or {}
        if exclude_meta:
            return content + " \n"
        pieces = ["Info:" + content + " "]
        pieces.extend(f"{key}:{meta[key]} " for key in meta if key != "ID")
        return "".join(pieces)

    @staticmethod
    def _prepare_metadata(metadata) -> dict:
        """Return a metadata dict with defaults and derived date fields filled in.

        Accepts either a plain dict or an object exposing ``.dict()``. Missing,
        ``None`` or empty values are replaced by the defaults; the timestamp may
        be an ISO-format string or a ``datetime``.
        """
        if isinstance(metadata, dict):
            metadata = dict(metadata)  # work on a copy, leave the caller's dict alone
        else:
            metadata = metadata.dict()

        stamp = metadata.get("timestamp")
        if stamp is None or stamp == "":
            stamp = datetime.now()
        elif not isinstance(stamp, datetime):
            stamp = datetime.fromisoformat(stamp)

        if not metadata.get("source"):
            metadata["source"] = "conversation"
        if metadata["source"] == "conversation":
            # Conversations always carry the fixed title "conversation".
            metadata["title"] = "conversation"
        elif not metadata.get("title"):
            metadata["title"] = ""
        if not metadata.get("author"):
            metadata["author"] = ""

        # TODO: If url is present in input or when the splitting need to be done,
        # then we'll need to change how we formulate the ID and may be filename
        # to store information
        metadata["ID"] = stamp.strftime("%Y-%m-%d %H-%M-%S") + "-" + metadata["title"]
        metadata["Year"] = stamp.year
        metadata["Month"] = stamp.month
        metadata["Day"] = stamp.day
        metadata["Hour"] = stamp.hour
        metadata["Minute"] = stamp.minute
        metadata["timestamp"] = stamp.isoformat()
        return metadata

    async def getRelevantDocs(self, query: str, kwargs: dict):
        """Retrieve documents relevant to ``query`` and cache their text.

        Keys of ``kwargs`` containing ``"search_type"`` set the retriever's
        search type; every other key is stored in the retriever's
        ``search_kwargs``. These settings stay on the shared retriever.

        Returns the retrieved items, or an empty list when retrieval fails (the
        failure is logged and nothing is cached).

        This should also post the result to firebase.
        """
        print("retriver state", self.retriever.search_kwargs)
        print("retriver state", self.retriever.search_type)
        try:
            for key, val in (kwargs or {}).items():
                if "search_type" in key:
                    self.retriever.search_type = val
                else:
                    self.retriever.search_kwargs[key] = val
        except Exception:
            print("setting search args failed")

        try:
            retVal = self.retriever.get_relevant_documents(query)
        except Exception:
            logger.exception("Exception occured:")
            return []

        # Items may be Document-like objects or plain dicts; format each one
        # individually so a mixed list neither fails nor yields duplicates.
        value = [self._format_item(item) for item in retVal]
        self.db_interface.add_to_cache(input=query, value=value)
        return retVal

    async def addText(self, inStr: str, metadata):
        """Store ``inStr`` as a text file and as chunked documents in the vector store.

        ``metadata`` is a dict or an object exposing ``.dict()``; it may carry
        some of the following:
          timestamp --> time when added
          source    --> notes/references/web/youtube/book/conversation,
                        default conversation
          title     --> of document, "conversation" when source is conversation,
                        default blank
          author    --> defaults to blank

        Returns whatever ``vectorstore.aadd_documents`` returns, or an empty
        list when the text yields no chunks.
        """
        import os  # local import: only needed for the raw-text copy below

        ##TODO: Preprocess inStr to remove any html, markdown tags etc.
        metadata = self._prepare_metadata(metadata)
        print("Metadata is:")
        print(metadata)

        # NOTE(review): the raw text goes to ./docs/ while persist() backs up
        # self.persist_docs_directory ("persistence-docs") - confirm which
        # directory is intended. The ID (and therefore the file name) contains
        # the title, so titles with path separators will not work here.
        os.makedirs("./docs", exist_ok=True)
        with open("./docs/" + metadata["ID"] + ".txt", "w", encoding="utf-8") as fd:
            fd.write(inStr)
            print("written to file", inStr)

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=50,
            length_function=len,
            is_separator_regex=False,
        )
        docs = text_splitter.create_documents([inStr], [metadata])
        if not docs:
            # Nothing to embed (e.g. empty input).
            return []

        # The first chunk keeps the base ID; later chunks get a "__<n>" suffix.
        for part_number, doc in enumerate(docs):
            if part_number > 0:
                doc.metadata["ID"] += f"__{part_number}"
            print(f"{part_number + 1} follows:")
            print(doc)

        print(metadata["ID"])
        ids = [doc.metadata["ID"] for doc in docs]
        print("ids are:")
        print(ids)
        try:
            return await self.vectorstore.aadd_documents(docs, ids=ids)
        except Exception:
            logger.exception("exception in adding")
            print("inside expect of addText")
            # Retry once with the per-chunk ids (one id per document).
            return await self.vectorstore.aadd_documents(docs, ids=ids)

    async def listDocs(self):
        """Return ``collection.get()`` for the store's default LangChain collection."""
        collection = self.vectorstore._client.get_collection(
            self.vectorstore._LANGCHAIN_DEFAULT_COLLECTION_NAME,
            embedding_function=self.embedding,
        )
        return collection.get()

    async def persist(self):
        """Persist the vector store, then back up the log file and both folders.

        Returns the result of backing up ``persist_docs_directory``.
        """
        self.vectorstore.persist()
        await dbh.backupFile(self.logger_file)
        await dbh.backupFolder(self.persist_db_directory)
        return await dbh.backupFolder(self.persist_docs_directory)

    def _uuid(self, uuid_str: str) -> UUID:
        """Parse ``uuid_str`` into a ``UUID``.

        Raises:
            ValueError: if ``uuid_str`` is not a valid UUID string.
        """
        try:
            return UUID(uuid_str)
        except ValueError as err:
            print("Error generating uuid")
            raise ValueError(f"Could not parse {uuid_str} as a UUID") from err