maya-persistence / src /chromaIntf.py
anubhav77's picture
v0.1.2
4ab9cb1
raw
history blame
6.45 kB
import os
from datetime import datetime
from uuid import UUID

from chromadb.api.fastapi import requests
from langchain.chains import RetrievalQA
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.schema import Document
from langchain.vectorstores import Chroma

import baseInfra.dropbox_handler as dbh
from baseInfra.dbInterface import DbInterface
from llm.llmFactory import LLMFactory
class ChromaIntf():
    """Persistence facade around a Chroma vector store and a self-query retriever.

    The instance owns the embedding model, the vector store, the LLM-backed
    ``SelfQueryRetriever`` and a ``DbInterface`` used to cache query results.
    The ``db`` and ``docs`` folders are restored from / backed up to remote
    storage through ``baseInfra.dropbox_handler``.
    """

    # Folder names are relative to the working directory; the same names are
    # handed to the dropbox handler for restore and backup.
    _DB_DIRECTORY = "db"
    _DOCS_DIRECTORY = "docs"

    def __init__(self):
        """Restore persisted folders and build the embedding, store and retriever."""
        self.db_interface = DbInterface()
        model_name = "BAAI/bge-large-en-v1.5"
        encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity
        self.embedding = HuggingFaceBgeEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'},
            encode_kwargs=encode_kwargs,
        )
        try:
            dbh.restoreFolder(self._DB_DIRECTORY)
            dbh.restoreFolder(self._DOCS_DIRECTORY)
        except Exception:
            # Best effort: a failed restore is treated as a first-time setup.
            print("Probably folder doesn't exist as it is brand new setup")
        # NOTE(review): this seed document is handed to the store on every
        # construction without explicit ids, so it may accumulate duplicates in
        # a restored database -- confirm against the Chroma wrapper in use.
        docs = [
            Document(
                page_content="this is test doc",
                metadata={"timestamp": 1696743148.474055, "ID": "2000-01-01 15:57:11::664165-test", "source": "test"},
                id="2000-01-01 15:57:11::664165-test",
            ),
        ]
        self.vectorstore = Chroma.from_documents(
            documents=docs,
            embedding=self.embedding,
            persist_directory=self._DB_DIRECTORY,
        )
        # Field descriptions are given to the LLM that builds metadata filters.
        self.metadata_field_info = [
            AttributeInfo(
                name="timestamp",
                description="Python datetime.timestamp of the document in isoformat, can be used for getting date, year, month, time etc ",
                type="str",
            ),
            AttributeInfo(
                name="source",
                description="Type of entry",
                type="string or list[string]",
            ),
        ]
        self.document_content_description = "Information to store for retrival from LLM based chatbot"
        lf = LLMFactory()
        self.llm = lf.get_llm("executor3")
        self.retriever = SelfQueryRetriever.from_llm(
            self.llm,
            self.vectorstore,
            self.document_content_description,
            self.metadata_field_info,
            verbose=True,
        )

    async def getRelevantDocs(self, query: str, kwargs: dict):
        """Retrieve documents relevant to ``query`` and cache their text.

        Args:
            query: Natural-language query passed to the retriever.
            kwargs: Retriever settings. A key containing ``"search_type"``
                replaces ``retriever.search_type``; every other key is written
                into ``retriever.search_kwargs``. The settings are stored on
                the shared retriever, so they remain in effect for later calls.

        Returns:
            The documents returned by the retriever, unmodified.

        Raises:
            Whatever the retriever or ``db_interface.add_to_cache`` raises.
        """
        print("retriver state", self.retriever.search_kwargs)
        print("retriver state", self.retriever.search_type)
        try:
            for key, setting in kwargs.items():
                if "search_type" in key:
                    self.retriever.search_type = setting
                else:
                    self.retriever.search_kwargs[key] = setting
        except Exception:
            # Best effort: invalid settings must not prevent the lookup itself.
            print("setting search args failed")
        retVal = await self.retriever.aget_relevant_documents(query)
        # Format every hit exactly once and cache exactly once, so a failure
        # can no longer leave duplicated entries in the cached value.
        value = [self._format_doc(item, exclude_meta=True) for item in retVal]
        self.db_interface.add_to_cache(input=query, value=value)
        return retVal

    @staticmethod
    def _format_doc(item, exclude_meta: bool = True) -> str:
        """Render one retrieved item (mapping or Document-like object) as text."""
        if isinstance(item, dict):
            content = item['page_content']
            meta = item.get('metadata') or {}
        else:
            content = item.page_content
            meta = getattr(item, 'metadata', None) or {}
        if exclude_meta:
            return content + " \n"
        # The ID is an internal key, so it is left out of the rendered text.
        details = "".join(f"{key}:{val} " for key, val in meta.items() if key != "ID")
        return "Info:" + content + " " + details

    async def addText(self, inStr: str, metadata):
        """Store ``inStr`` as a text file and as a vector-store document.

        Args:
            inStr: Text to store.
            metadata: Object exposing ``dict()``; the resulting mapping may
                carry ``timestamp`` (ISO-8601 string or ``datetime``) and
                ``source``. Missing values default to the current time and
                ``"conversation"``.

        Returns:
            The result of ``vectorstore.aadd_documents``.

        Raises:
            ValueError: If ``timestamp`` is a string that is not ISO-8601.
            OSError: If the text file cannot be written.
            Exception: Any vector-store error is logged and re-raised.
        """
        metadata = self._prepare_metadata(metadata.dict())
        docs = [Document(page_content=inStr, metadata=metadata)]
        # On a brand-new setup nothing has created the docs folder yet.
        os.makedirs(self._DOCS_DIRECTORY, exist_ok=True)
        doc_path = os.path.join(self._DOCS_DIRECTORY, metadata['ID'] + ".txt")
        with open(doc_path, "w", encoding="utf-8") as fd:
            fd.write(inStr)
        print("written to file", inStr)
        try:
            return await self.vectorstore.aadd_documents(docs, ids=[metadata['ID']])
        except Exception:
            # Surface the real failure instead of masking it with a second error.
            print("inside expect of addText")
            raise

    @staticmethod
    def _prepare_metadata(metadata: dict) -> dict:
        """Fill in defaults and derived date fields on ``metadata`` in place."""
        stamp = metadata.get('timestamp')
        if stamp is None:
            stamp = datetime.now()
        elif not isinstance(stamp, datetime):
            stamp = datetime.fromisoformat(stamp)
        if metadata.get('source') is None:
            metadata['source'] = "conversation"
        # TODO: If url is present in input or when the splitting need to be done, then we'll need to change how we
        # formulate the ID and may be filename to store information
        # The ID has one-second resolution, so two entries with the same source
        # within the same second share an ID (and therefore a file name).
        metadata['ID'] = stamp.strftime("%Y-%m-%d %H-%M-%S") + "-" + metadata['source']
        metadata['Year'] = stamp.year
        metadata['Month'] = stamp.month
        metadata['Day'] = stamp.day
        metadata['Hour'] = stamp.hour
        metadata['Minute'] = stamp.minute
        # Stored as an ISO string; the datetime object is only used above.
        metadata['timestamp'] = stamp.isoformat()
        return metadata

    async def listDocs(self):
        """Return the result of ``get()`` on the default LangChain collection.

        Relies on the private ``_client`` and
        ``_LANGCHAIN_DEFAULT_COLLECTION_NAME`` attributes of the vector store.
        """
        collection = self.vectorstore._client.get_collection(
            self.vectorstore._LANGCHAIN_DEFAULT_COLLECTION_NAME,
            embedding_function=self.embedding,
        )
        return collection.get()

    async def persist(self):
        """Persist the vector store, then back up the ``db`` and ``docs`` folders.

        Returns:
            The result of backing up the ``docs`` folder.
        """
        self.vectorstore.persist()
        await dbh.backupFolder(self._DB_DIRECTORY)
        return await dbh.backupFolder(self._DOCS_DIRECTORY)

    def _uuid(self, uuid_str: str) -> UUID:
        """Parse ``uuid_str`` into a ``UUID``.

        Raises:
            ValueError: If ``uuid_str`` is not a valid UUID string; the
                original parsing error is chained as the cause.
        """
        try:
            return UUID(uuid_str)
        except ValueError as err:
            print("Error generating uuid")
            raise ValueError(f"Could not parse {uuid_str} as a UUID") from err