# ChromaIntf — Chroma vector-store wrapper with a self-querying retriever,
# DB-backed query caching, and Dropbox-backed persistence of db/docs folders.
from datetime import datetime
from pathlib import Path
from uuid import UUID

from chromadb.api.fastapi import requests
from langchain.chains import RetrievalQA
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.schema import Document
from langchain.vectorstores import Chroma

import baseInfra.dropbox_handler as dbh
from baseInfra.dbInterface import DbInterface
from llm.llmFactory import LLMFactory
class ChromaIntf():
    """Interface to a persistent Chroma vector store.

    Embeds documents with BAAI/bge-large-en-v1.5, retrieves them through a
    langchain SelfQueryRetriever, caches retrieval results via DbInterface,
    and backs up / restores the ``db`` and ``docs`` folders through the
    Dropbox handler.
    """

    def __init__(self):
        self.db_interface = DbInterface()
        model_name = "BAAI/bge-large-en-v1.5"
        # normalize_embeddings=True so inner product == cosine similarity.
        encode_kwargs = {'normalize_embeddings': True}
        self.embedding = HuggingFaceBgeEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'},
            encode_kwargs=encode_kwargs
        )
        persist_db_directory = 'db'
        persist_docs_directory = "docs"
        try:
            # Pull any previously backed-up state from Dropbox.
            dbh.restoreFolder("db")
            dbh.restoreFolder("docs")
        except Exception:
            print("Probably folder doesn't exist as it is brand new setup")
        # Seed document so the collection exists even on a brand-new setup.
        docs = [
            Document(
                page_content="this is test doc",
                metadata={"timestamp": 1696743148.474055,
                          "ID": "2000-01-01 15:57:11::664165-test",
                          "source": "test"},
                id="2000-01-01 15:57:11::664165-test"
            ),
        ]
        self.vectorstore = Chroma.from_documents(documents=docs,
                                                 embedding=self.embedding,
                                                 persist_directory=persist_db_directory)
        # Metadata schema the self-query retriever is allowed to filter on.
        self.metadata_field_info = [
            AttributeInfo(
                name="timestamp",
                description="Python datetime.timestamp of the document in isoformat, can be used for getting date, year, month, time etc ",
                type="str",
            ),
            AttributeInfo(
                name="source",
                description="Type of entry",
                type="string or list[string]",
            ),
        ]
        self.document_content_description = "Information to store for retrival from LLM based chatbot"
        lf = LLMFactory()
        self.llm = lf.get_llm("executor3")
        self.retriever = SelfQueryRetriever.from_llm(
            self.llm,
            self.vectorstore,
            self.document_content_description,
            self.metadata_field_info,
            verbose=True
        )

    async def getRelevantDocs(self, query: str, kwargs: dict):
        """Retrieve documents relevant to *query*, cache the flattened text.

        ``kwargs`` may override the retriever's ``search_type`` (any key
        containing "search_type") or any ``search_kwargs`` entry.
        Returns the raw list of Documents from the retriever.
        """
        print("retriver state", self.retriever.search_kwargs)
        print("retriver state", self.retriever.search_type)
        try:
            for key in kwargs.keys():
                if "search_type" in key:
                    self.retriever.search_type = kwargs[key]
                else:
                    self.retriever.search_kwargs[key] = kwargs[key]
        except Exception:
            print("setting search args failed")
        retVal = await self.retriever.aget_relevant_documents(query)
        value = []
        excludeMeta = True
        # BUG FIX: langchain Documents are not subscriptable, so the old
        # item['page_content'] branch always raised and fell back to a
        # duplicate attribute-access loop; use attribute access directly.
        for item in retVal:
            if excludeMeta:
                v = item.page_content + " \n"
            else:
                v = "Info:" + item.page_content + " "
                # NOTE(review): metadata flattening applies only when metadata
                # is not excluded — confirm intended placement of this loop.
                for key in item.metadata.keys():
                    if key != "ID":
                        v += key + ":" + str(item.metadata[key]) + " "
            value.append(v)
        try:
            self.db_interface.add_to_cache(input=query, value=value)
        except Exception:
            # Caching is best-effort; never fail the retrieval because of it.
            print("adding to cache failed")
        return retVal

    async def addText(self, inStr: str, metadata):
        """Store *inStr* in the vector store and mirror it to ``./docs``.

        *metadata* is a pydantic model; ``timestamp`` (isoformat string)
        defaults to now, ``source`` defaults to "conversation".  Derived
        fields (ID/Year/Month/Day/Hour/Minute) are computed from the
        timestamp before it is serialized back to isoformat.
        """
        metadata = metadata.dict()
        if "timestamp" not in metadata.keys():
            # BUG FIX: keep a datetime object here (was .isoformat(), a str) —
            # the derived fields below call strftime()/.year on it.
            metadata['timestamp'] = datetime.now()
        else:
            metadata['timestamp'] = datetime.fromisoformat(metadata['timestamp'])
        if "source" not in metadata.keys():
            metadata['source'] = "conversation"
        # TODO: If url is present in input or when the splitting need to be
        # done, then we'll need to change how we formulate the ID and may be
        # filename to store information
        metadata['ID'] = metadata['timestamp'].strftime("%Y-%m-%d %H-%M-%S") + "-" + metadata['source']
        metadata['Year'] = metadata['timestamp'].year
        metadata['Month'] = metadata['timestamp'].month
        metadata['Day'] = int(metadata['timestamp'].strftime("%d"))
        metadata['Hour'] = metadata['timestamp'].hour
        metadata['Minute'] = metadata['timestamp'].minute
        metadata['timestamp'] = metadata['timestamp'].isoformat()
        docs = [Document(page_content=inStr, metadata=metadata)]
        # Mirror the raw text so the docs folder can be backed up to Dropbox;
        # create the folder on a fresh setup where restoreFolder() failed.
        Path("./docs").mkdir(parents=True, exist_ok=True)
        with open("./docs/" + metadata['ID'] + ".txt", "w") as fd:
            fd.write(inStr)
        print("written to file", inStr)
        try:
            return await self.vectorstore.aadd_documents(docs, ids=[metadata['ID']])
        except Exception:
            # BUG FIX: metadata is a plain dict here, so the old fallback
            # ``metadata.ID`` always raised AttributeError and masked the
            # real failure; log and re-raise the original exception instead.
            print("inside expect of addText")
            raise

    async def listDocs(self):
        """Return the full contents of the default Chroma collection."""
        collection = self.vectorstore._client.get_collection(
            self.vectorstore._LANGCHAIN_DEFAULT_COLLECTION_NAME,
            embedding_function=self.embedding)
        return collection.get()

    async def persist(self):
        """Flush the vector store to disk and back up db/docs to Dropbox."""
        self.vectorstore.persist()
        await dbh.backupFolder("db")
        return await dbh.backupFolder("docs")

    def _uuid(self, uuid_str: str) -> UUID:
        """Parse *uuid_str* into a UUID.

        Raises:
            ValueError: with a contextual message when parsing fails.
        """
        try:
            return UUID(uuid_str)
        except ValueError:
            print("Error generating uuid")
            raise ValueError(f"Could not parse {uuid_str} as a UUID")