File size: 5,544 Bytes
775521b
 
 
 
 
 
 
 
 
 
 
8d7feb0
 
775521b
71bfdd5
775521b
 
 
 
 
 
ee4103c
775521b
 
 
 
 
b1c7fc7
 
775521b
 
 
 
 
 
 
 
 
 
 
 
ee4103c
b1c7fc7
ee4103c
775521b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d5f363
775521b
 
2d5f363
 
 
 
 
 
 
775521b
 
 
 
 
 
 
 
 
 
 
 
 
8d7feb0
775521b
 
 
 
4bbfca6
775521b
 
 
 
 
 
 
 
 
4bbfca6
775521b
 
 
 
 
8d7feb0
775521b
 
8d7feb0
775521b
ee4103c
4caf01e
 
 
ee4103c
 
 
 
8d7feb0
 
d8648b8
8d7feb0
 
 
 
 
775521b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from langchain.vectorstores import Chroma
from chromadb.api.fastapi import requests
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from llm.llmFactory import LLMFactory
from datetime import datetime
import baseInfra.dropbox_handler as dbh
from baseInfra.dbInterface import DbInterface
from uuid import UUID


class ChromaIntf():
    """Wrapper around a persisted Chroma vector store and a self-query retriever.

    On construction the instance asks ``baseInfra.dropbox_handler`` to restore
    the ``db`` folder, builds a Chroma store on top of BGE embeddings and wires
    a ``SelfQueryRetriever`` that uses the LLM returned by
    ``LLMFactory().get_llm("executor2")``.
    """

    # Folder used both as Chroma's persist directory and as the folder name
    # handed to the dropbox handler for restore/backup.
    _PERSIST_DIRECTORY = 'db'

    def __init__(self):
        """Restore the persisted store (best effort) and build the retriever."""
        self.db_interface=DbInterface()

        model_name = "BAAI/bge-large-en-v1.5"
        encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

        self.embedding = HuggingFaceBgeEmbeddings(
                model_name=model_name,
                model_kwargs={'device': 'cpu'},
                encode_kwargs=encode_kwargs
        )

        try:
            dbh.restoreFolder(self._PERSIST_DIRECTORY)
        except Exception as exc:
            # Best effort: a failed restore must not stop start-up, but the
            # reason is printed instead of being swallowed silently.
            print("Probably folder doesn't exist as it is brand new setup", exc)

        # Seed document handed to Chroma.from_documents when creating the store.
        # NOTE(review): no ids are passed, so this presumably adds another copy
        # of the seed document on every start-up -- confirm and pass ids if so.
        docs = [
            Document(
                page_content="this is test doc",
                metadata={"timestamp":1696743148.474055,"ID":"test","source":"test"},
                ),
            ]

        self.vectorstore = Chroma.from_documents(documents=docs,
                                  embedding=self.embedding,
                                  persist_directory=self._PERSIST_DIRECTORY)

        # Metadata fields the self-query retriever is told it may filter on.
        self.metadata_field_info = [
            AttributeInfo(
                name="timestamp",
                description="Python datetime.timestamp of the document in isoformat, can be used for getting date, year, month, time etc ",
                type="str",
            ),
            AttributeInfo(
                name="source",
                description="Type of entry",
                type="string or list[string]",
            ),
            ]
        self.document_content_description = "Information to store for retrival from LLM based chatbot"
        lf=LLMFactory()
        self.llm=lf.get_llm("executor2")

        self.retriever = SelfQueryRetriever.from_llm(
            self.llm,
            self.vectorstore,
            self.document_content_description,
            self.metadata_field_info,
            verbose=True
        )

    @staticmethod
    def _formatDoc(item, excludeMeta):
        """Render one retrieved item as the text stored in the cache.

        Accepts objects exposing ``page_content``/``metadata`` attributes as
        well as plain dicts carrying the same keys.
        """
        if isinstance(item, dict):
            content = item['page_content']
            meta = item.get('metadata') or {}
        else:
            content = item.page_content
            meta = item.metadata
        if excludeMeta:
            return content+" \n"
        text = "Info:"+content+" "
        for key in meta:
            # The internal ID is not useful in the rendered text.
            if key != "ID":
                text += key+":"+str(meta[key])+" "
        return text

    def getRelevantDocs(self,query:str,count:int=8,excludeMeta:bool=True):
        """Retrieve documents for ``query`` and cache their rendered text.

        The rendered entries are passed to ``self.db_interface.add_to_cache``
        (the original note on this method says the result should also be
        posted to firebase).

        Args:
            query: Natural-language query handed to the retriever.
            count: Stored as ``search_kwargs["k"]`` on the retriever.
            excludeMeta: When true (default) each cached entry is the page
                content followed by ``" \\n"``; otherwise it is
                ``"Info:<content> "`` followed by ``key:value`` pairs for
                every metadata key except ``ID``.

        Returns:
            The retriever's result, unchanged.
        """
        print("retriver state",self.retriever.search_kwargs)
        print("retriver state",self.retriever.search_type)
        self.retriever.search_kwargs["k"]=count
        retVal=self.retriever.get_relevant_documents(query)
        # Format once and cache once; errors propagate instead of triggering a
        # second formatting pass and a duplicate cache write.
        value=[self._formatDoc(item,excludeMeta) for item in retVal]
        self.db_interface.add_to_cache(input=query,value=value)
        return retVal

    @staticmethod
    def _prepareMetadata(metadata):
        """Return a metadata dict enriched with ID and date-part fields.

        ``metadata`` may be an object exposing ``.dict()`` or a plain mapping;
        the input is never mutated. A missing or ``None`` timestamp falls back
        to ``datetime.now()``; datetimes, epoch numbers and ISO-format strings
        are accepted.
        """
        if hasattr(metadata,"dict"):
            prepared=metadata.dict()
        else:
            prepared=dict(metadata)

        rawTimestamp=prepared.get("timestamp")
        if rawTimestamp is None:
            moment=datetime.now()
        elif isinstance(rawTimestamp,datetime):
            moment=rawTimestamp
        elif isinstance(rawTimestamp,(int,float)):
            moment=datetime.fromtimestamp(rawTimestamp)
        else:
            moment=datetime.fromisoformat(rawTimestamp)

        if prepared.get("source") is None:
            prepared['source']="conversation"
        prepared['ID']=moment.strftime("%Y-%m-%d %H:%M:%S::%f")+"-conversation"
        prepared['Year']=moment.year
        prepared['Month']=moment.month
        prepared['Day']=moment.day
        prepared['Hour']=moment.hour
        prepared['Minute']=moment.minute
        # Stored as an ISO string, matching the "timestamp" attribute description.
        prepared['timestamp']=moment.isoformat()
        return prepared

    @staticmethod
    async def _awaitIfNeeded(result):
        """Await ``result`` only when it is awaitable; otherwise return it as is."""
        if hasattr(result,"__await__"):
            return await result
        return result

    async def addText(self,inStr:str,metadata):
        """Store ``inStr`` in the vector store under a timestamp-derived ID.

        Args:
            inStr: Text used as the document's page content.
            metadata: Object exposing ``.dict()`` (or a mapping) with optional
                ``timestamp`` and ``source`` entries.

        Returns:
            Whatever ``self.vectorstore.add_documents`` returns (awaited when
            it is awaitable).
        """
        metadata=self._prepareMetadata(metadata)
        docs = [
            Document(page_content=inStr, metadata=metadata)]
        return await self._awaitIfNeeded(
            self.vectorstore.add_documents(docs,ids=[metadata['ID']]))

    async def listDocs(self):
        """Return ``collection.get()`` for the store's default collection."""
        collection=self.vectorstore._client.get_collection(self.vectorstore._LANGCHAIN_DEFAULT_COLLECTION_NAME,embedding_function=self.embedding)
        return collection.get()

    async def persist(self):
        """Persist the vector store, then back up the ``db`` folder.

        Returns:
            The result of ``dbh.backupFolder`` (awaited when it is awaitable).
        """
        await self._awaitIfNeeded(self.vectorstore.persist())
        return await self._awaitIfNeeded(dbh.backupFolder(self._PERSIST_DIRECTORY))

    def _uuid(self,uuid_str: str) -> UUID:
        """Parse ``uuid_str`` into a ``UUID``.

        Raises:
            ValueError: If ``uuid_str`` is not a valid UUID string.
        """
        try:
            return UUID(uuid_str)
        except ValueError as err:
            print("Error generating uuid")
            raise ValueError(f"Could not parse {uuid_str} as a UUID") from err