Pdf_mugger / document_chat.py
Anirudh1993's picture
Update document_chat.py
dca455a verified
raw
history blame
2.06 kB
import os
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.llms import HuggingFaceHub
# Constants
# Directory passed to Chroma as its ``persist_directory``.
CHROMA_DB_PATH = "chroma_db"
# Model name handed to HuggingFaceEmbeddings when the vector store is built.
SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # Corrected model name
# Hugging Face Hub repo id of the LLM used to answer queries.
LLM_MODEL = "HuggingFaceH4/zephyr-7b-beta" # Free chatbot model from Hugging Face
# Initialize vector store
def initialize_vector_store():
    """Initialize or load the ChromaDB vector store.

    Returns:
        A ``Chroma`` instance whose ``persist_directory`` is
        ``CHROMA_DB_PATH`` and whose embedding function is a
        ``HuggingFaceEmbeddings`` built from ``SENTENCE_TRANSFORMER_MODEL``.
    """
    return Chroma(
        persist_directory=CHROMA_DB_PATH,
        embedding_function=HuggingFaceEmbeddings(
            model_name=SENTENCE_TRANSFORMER_MODEL
        ),
    )
# Module-level store shared by ingest_pdf() and process_query_with_memory().
# NOTE: it is built as a side effect of importing this module.
vector_store = initialize_vector_store()
def ingest_pdf(pdf_path):
    """Load a PDF, split its text into chunks, and index them in ChromaDB.

    Args:
        pdf_path: Location of the PDF file, passed to ``PyMuPDFLoader``.
    """
    loaded_docs = PyMuPDFLoader(pdf_path).load()

    # Break the loaded text into chunks (chunk_size=1000, chunk_overlap=100).
    chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = chunker.split_documents(loaded_docs)

    # Add the chunks to the shared vector store, then persist it.
    vector_store.add_documents(chunks)
    vector_store.persist()
def process_query_with_memory(query, chat_history=None):
    """Retrieve relevant document chunks and generate a conversational response.

    Args:
        query: The user's question.
        chat_history: Earlier turns of the conversation, forwarded to the
            chain unchanged. Presumably a list of ``(question, answer)``
            tuples or LangChain messages -- confirm against the caller.
            Defaults to an empty history.

    Returns:
        The value returned by the chain's ``run`` call for this question.
    """
    # A literal ``[]`` default is a single list shared by every call;
    # use None as the sentinel and create a fresh list per call instead.
    if chat_history is None:
        chat_history = []

    retriever = vector_store.as_retriever()

    # Load a free Hugging Face model
    llm = HuggingFaceHub(repo_id=LLM_MODEL, model_kwargs={"max_new_tokens": 500})

    # Build the chain with from_llm(): the class constructor takes no ``llm``
    # argument and requires prebuilt ``combine_docs_chain`` and
    # ``question_generator`` components, which from_llm() creates from the LLM.
    #
    # No ConversationBufferMemory is attached: a memory created fresh on every
    # call holds no turns, and the chain would read its (empty) history in
    # place of the caller-supplied ``chat_history``.
    qa_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)

    return qa_chain.run({"question": query, "chat_history": chat_history})