# Pdf_mugger / document_chat.py
# Anirudh1993's picture
# Update document_chat.py
# e56fbf0 verified
# raw
# history blame
# 1.79 kB
import os
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.llms import HuggingFaceHub
# --- Constants ---
# On-disk directory where the Chroma vector store is persisted.
CHROMA_DB_PATH = "chroma_db"
# Embedding model repo id. Fixed typo: "sentence-ransformers" ->
# "sentence-transformers" (the misspelled id does not exist on the Hub and
# would fail at model-download time).
SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Free hosted chat LLM used to answer questions.
LLM_Model = "HuggingFaceH4/zephyr-7b-beta"
#Initialize vector store
def initialize_vector_store():
    """Create (or reopen) the persistent Chroma vector store.

    Returns:
        Chroma: store backed by CHROMA_DB_PATH, embedding text with the
        HuggingFace sentence-transformer model.
    """
    embeddings = HuggingFaceEmbeddings(model_name = SENTENCE_TRANSFORMER_MODEL)
    # Fixed keyword: Chroma's parameter is `embedding_function`; the original
    # `embedding_fnction` typo raised a TypeError (unexpected keyword argument).
    vector_store = Chroma(persist_directory = CHROMA_DB_PATH, embedding_function = embeddings)
    return vector_store

# Module-level singleton shared by ingest_pdf and process_query_with_memory.
vector_store = initialize_vector_store()
def ingest_pdf(pdf_path):
    """Load a PDF, chunk its text, and persist the chunks in the vector DB.

    Args:
        pdf_path: Filesystem path to the PDF to ingest.
    """
    # Read every page of the PDF into LangChain documents.
    docs = PyMuPDFLoader(pdf_path).load()
    # Break long pages into overlapping 1000-character chunks so retrieval
    # returns focused passages.
    splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)
    chunks = splitter.split_documents(docs)
    # Embed and index the chunks, then flush the store to disk.
    vector_store.add_documents(chunks)
    vector_store.persist()
def process_query_with_memory(query, chat_history=None):
    """Answer `query` over the ingested documents with conversational memory.

    Args:
        query: The user's question.
        chat_history: Optional list of prior (question, answer) pairs.
            Defaults to an empty history. (The original mutable default
            `[]` was replaced with None so state cannot leak across calls.)

    Returns:
        The chain's answer text.
    """
    if chat_history is None:
        chat_history = []
    retriever = vector_store.as_retriever()
    # Initialize chat memory. Fixed class name: the import (and LangChain
    # class) is ConversationBufferMemory; the original
    # `ConversationalBufferMemory` raised a NameError.
    memory = ConversationBufferMemory(memory_key = "chat_history", return_messages = True)
    # Load a free Hugging Face hosted model.
    llm = HuggingFaceHub(repo_id = LLM_Model, model_kwargs = {"max_new_tokens": 500})
    # Build the chain via the from_llm factory: the bare constructor does not
    # accept an `llm` kwarg (it expects pre-built combine_docs/question
    # generator chains), so the original call failed validation.
    qa_chain = ConversationalRetrievalChain.from_llm(
        llm = llm,
        retriever = retriever,
        memory = memory)
    return qa_chain.run({"question": query, "chat_history": chat_history})