File size: 6,445 Bytes
775521b
 
 
 
 
 
 
 
 
 
 
8d7feb0
 
775521b
71bfdd5
775521b
 
 
 
 
 
ee4103c
775521b
 
 
 
 
b1c7fc7
 
775521b
 
bc7d2c2
775521b
 
 
 
 
89eca8d
 
775521b
 
 
 
ee4103c
b1c7fc7
ee4103c
775521b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ab9cb1
 
775521b
 
 
 
 
 
 
 
 
 
a873366
775521b
 
 
a873366
 
c5c98c5
 
 
 
a873366
 
bb22bf8
775521b
2d5f363
775521b
 
2d5f363
 
 
 
 
 
 
775521b
74ee141
775521b
 
cde6f0b
 
 
 
 
 
 
775521b
74ee141
775521b
 
 
8d7feb0
775521b
 
 
 
4bbfca6
775521b
 
 
bb22bf8
 
 
775521b
 
 
 
 
4bbfca6
775521b
 
 
 
f9ebed3
bc7d2c2
c738eed
775521b
c738eed
775521b
 
c738eed
775521b
ee4103c
4caf01e
 
 
ee4103c
 
 
9e0fdcc
ed9eaab
9e0fdcc
8d7feb0
d8648b8
8d7feb0
 
 
 
 
775521b
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
from langchain.vectorstores import Chroma
from chromadb.api.fastapi import requests
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from llm.llmFactory import LLMFactory
from datetime import datetime
import baseInfra.dropbox_handler as dbh
from baseInfra.dbInterface import DbInterface
from uuid import UUID


class ChromaIntf():
    """Thin wrapper around a persisted Chroma vector store with a self-query retriever.

    On construction it restores the ``db`` and ``docs`` folders through
    ``dropbox_handler``, builds a BGE embedding function, opens/creates the
    Chroma store in ``db`` and wires a ``SelfQueryRetriever`` on top of it.
    """

    def __init__(self):
        """Build the embedding model, the vector store and the retriever."""
        self.db_interface=DbInterface()

        model_name = "BAAI/bge-large-en-v1.5"
        encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

        self.embedding = HuggingFaceBgeEmbeddings(
                model_name=model_name,
                model_kwargs={'device': 'cpu'},
                encode_kwargs=encode_kwargs
        )

        persist_db_directory = 'db'
        persist_docs_directory = "docs"
        # Restore each folder independently so that a missing "db" backup does
        # not prevent the "docs" backup from being restored (best effort).
        for folder in (persist_db_directory, persist_docs_directory):
            try:
                dbh.restoreFolder(folder)
            except Exception as exc:
                print("Probably folder doesn't exist as it is brand new setup", folder, exc)

        # Seed document so the collection is never created empty.
        # NOTE(review): no ids are passed to from_documents, so every start
        # appears to add another copy of this seed document - confirm.
        docs = [
            Document(
                page_content="this is test doc",
                metadata={"timestamp":1696743148.474055,"ID":"2000-01-01 15:57:11::664165-test","source":"test"},
                id="2000-01-01 15:57:11::664165-test"
                ),
            ]

        self.vectorstore = Chroma.from_documents(documents=docs,
                                  embedding=self.embedding,
                                  persist_directory=persist_db_directory)

        # Metadata schema advertised to the LLM that builds the structured query.
        self.metadata_field_info = [
            AttributeInfo(
                name="timestamp",
                description="Python datetime.timestamp of the document in isoformat, can be used for getting date, year, month, time etc ",
                type="str",
            ),
            AttributeInfo(
                name="source",
                description="Type of entry",
                type="string or list[string]",
            ),
            ]
        self.document_content_description = "Information to store for retrival from LLM based chatbot"
        lf=LLMFactory()
        self.llm=lf.get_llm("executor3")

        self.retriever = SelfQueryRetriever.from_llm(
            self.llm,
            self.vectorstore,
            self.document_content_description,
            self.metadata_field_info,
            verbose=True
        )

    @staticmethod
    def _format_doc(item, exclude_meta: bool = True) -> str:
        """Render one retrieved item as the text stored in the cache.

        Accepts either an object exposing ``page_content``/``metadata``
        attributes or a plain dict with the same keys. With ``exclude_meta``
        only the content is returned; otherwise every metadata entry except
        ``ID`` is appended as ``key:value``.
        """
        if isinstance(item, dict):
            content = item['page_content']
            meta = item.get('metadata') or {}
        else:
            content = item.page_content
            meta = item.metadata or {}
        if exclude_meta:
            return content+" \n"
        pairs = "".join(
            key+":"+str(val)+" " for key, val in meta.items() if key != "ID"
        )
        return "Info:"+content+" "+pairs

    @staticmethod
    def _prepare_metadata(metadata: dict) -> dict:
        """Return a copy of ``metadata`` enriched with ID and date-part fields.

        A missing or ``None`` timestamp defaults to the current time; a string
        timestamp is parsed with ``datetime.fromisoformat``. A missing or
        ``None`` source defaults to ``"conversation"``. The returned dict
        stores the timestamp as an ISO-format string and adds ``ID``,
        ``Year``, ``Month``, ``Day``, ``Hour`` and ``Minute``.

        Raises:
            ValueError: if a string timestamp is not valid ISO format.
        """
        prepared = dict(metadata)
        stamp = prepared.get("timestamp")
        if stamp is None:
            # Keep a datetime object here; the original stored the isoformat
            # string and then crashed when calling strftime on it.
            stamp = datetime.now()
        elif not isinstance(stamp, datetime):
            stamp = datetime.fromisoformat(stamp)
        if prepared.get("source") is None:
            prepared['source']="conversation"
        #TODO: If url is present in input or when the splitting need to be done, then we'll need to change how we
        # formulate the ID and may be filename to store information
        prepared['ID']=stamp.strftime("%Y-%m-%d %H-%M-%S")+"-"+prepared['source']
        prepared['Year']=stamp.year
        prepared['Month']=stamp.month
        prepared['Day']=stamp.day
        prepared['Hour']=stamp.hour
        prepared['Minute']=stamp.minute
        prepared['timestamp']=stamp.isoformat()
        return prepared

    async def getRelevantDocs(self,query:str,kwargs:dict):
        """Retrieve documents relevant to ``query`` and cache their text.

        Args:
            query: natural-language query handed to the self-query retriever.
            kwargs: retriever overrides. Any key containing ``"search_type"``
                sets ``retriever.search_type``; every other key is written
                into ``retriever.search_kwargs``. ``None`` means no overrides.
                These overrides are written onto the shared retriever and so
                remain in effect for later calls.

        Returns:
            The list returned by ``retriever.aget_relevant_documents``.

        The formatted page contents are stored through
        ``db_interface.add_to_cache`` (original note: this should also post
        the result to firebase).
        """
        print("retriver state",self.retriever.search_kwargs)
        print("retriver state",self.retriever.search_type)
        try:
            for key, val in (kwargs or {}).items():
                if "search_type" in key:
                    self.retriever.search_type=val
                else:
                    self.retriever.search_kwargs[key]=val
        except Exception as exc:
            # Best effort: a bad override must not prevent the retrieval.
            print("setting search args failed", exc)
        retVal=await self.retriever.aget_relevant_documents(query)
        value=[self._format_doc(item, exclude_meta=True) for item in retVal]
        self.db_interface.add_to_cache(input=query,value=value)
        return retVal


    async def addText(self,inStr:str,metadata):
        """Store ``inStr`` as a text file under ``./docs`` and in the vector store.

        Args:
            inStr: text content of the document.
            metadata: object exposing ``.dict()`` (or a plain mapping) with
                optional ``timestamp`` (ISO string) and ``source`` entries.

        Returns:
            Whatever ``vectorstore.aadd_documents`` returns.

        Raises:
            ValueError: if the supplied timestamp is not valid ISO format.
            Exception: any error raised by the vector store is re-raised.
        """
        import os

        raw = metadata.dict() if hasattr(metadata, "dict") else dict(metadata)
        metadata = self._prepare_metadata(raw)

        docs = [
            Document(page_content=inStr, metadata=metadata)]
        # The folder is absent on a brand new setup where nothing was restored.
        os.makedirs("./docs", exist_ok=True)
        with open("./docs/"+metadata['ID']+".txt","w",encoding="utf-8") as fd:
            fd.write(inStr)
            print("written to file", inStr)
        try:
            return await self.vectorstore.aadd_documents(docs,ids=[metadata['ID']])
        except Exception as exc:
            # Surface the real failure instead of masking it with a retry
            # that could never succeed.
            print("inside expect of addText", exc)
            raise

    async def listDocs(self):
        """Return the result of ``get()`` on the default LangChain collection."""
        collection=self.vectorstore._client.get_collection(self.vectorstore._LANGCHAIN_DEFAULT_COLLECTION_NAME,embedding_function=self.embedding)
        return collection.get()


    async def persist(self):
        """Persist the vector store and back up the ``db`` and ``docs`` folders.

        Returns:
            The result of backing up the ``docs`` folder.
        """
        self.vectorstore.persist()
        await dbh.backupFolder("db")
        return await dbh.backupFolder("docs")

    def _uuid(self,uuid_str: str) -> UUID:
        """Parse ``uuid_str`` into a ``UUID``.

        Raises:
            ValueError: if ``uuid_str`` is not a valid UUID string.
        """
        try:
            return UUID(uuid_str)
        except ValueError as err:
            print("Error generating uuid")
            raise ValueError(f"Could not parse {uuid_str} as a UUID") from err