Anirudh1993 committed on
Commit dd2b131 · verified · 1 Parent(s): 3e1a55c

Update document_chat.py

Files changed (1)
  1. document_chat.py +13 -35
document_chat.py CHANGED
@@ -6,62 +6,40 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
 from langchain.llms import HuggingFaceHub
-from langchain.prompts import PromptTemplate
-from langchain.chains import LLMChain
-from langchain.chains.combine_documents import StuffDocumentsChain  # Corrected import

 # Constants
 CHROMA_DB_PATH = "chroma_db"
-SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Sentence Transformers model
-LLM_Model = "HuggingFaceH4/zephyr-7b-beta"  # Hugging Face model for conversation
+SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+LLM_MODEL = "HuggingFaceH4/zephyr-7b-beta"

-# Initialize vector store with Hugging Face embeddings
+# Initialize vector store
 def initialize_vector_store():
     embeddings = HuggingFaceEmbeddings(model_name=SENTENCE_TRANSFORMER_MODEL)
-    vector_store = Chroma(persist_directory=CHROMA_DB_PATH, embedding_function=embeddings)  # Fixed the typo here
-    return vector_store
+    return Chroma(persist_directory=CHROMA_DB_PATH, embedding_function=embeddings)

 vector_store = initialize_vector_store()

-# Function to ingest and store the PDF content into the vector store
 def ingest_pdf(pdf_path):
+    """Loads, splits, and stores PDF content in a vector database."""
     loader = PyMuPDFLoader(pdf_path)
     documents = loader.load()

-    # Split text into smaller chunks for processing
+    # Split text into smaller chunks
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     split_docs = text_splitter.split_documents(documents)

-    # Store the split documents in the vector store
+    # Re-initialize vector store to ensure persistence
     vector_store.add_documents(split_docs)
     vector_store.persist()

-# Function to process queries with memory and a retrieval chain
-def process_query_with_memory(query, chat_history=[]):
+def process_query_with_memory(query, chat_history):
+    """Processes user queries while maintaining conversational memory."""
     retriever = vector_store.as_retriever()

-    # Initialize conversation memory to keep track of the chat history
-    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
-
-    # Load the LLM model from Hugging Face
-    llm = HuggingFaceHub(repo_id=LLM_Model, model_kwargs={"max_new_tokens": 500})
-
-    # Create a PromptTemplate for the question generator
-    question_generator_template = "Generate a question based on the user's request: {query}"
-    question_generator = LLMChain(llm=llm, prompt=PromptTemplate(template=question_generator_template, input_variables=["query"]))
-
-    # Use StuffDocumentsChain to combine the retrieved documents
-    combine_docs_chain = StuffDocumentsChain(llm=llm)  # Corrected use of StuffDocumentsChain
-
-    # Create a ConversationalRetrievalChain with the loaded model and retriever
+    # Use session memory (should be handled in Streamlit app)
     qa_chain = ConversationalRetrievalChain(
-        llm=llm,
+        llm=HuggingFaceHub(repo_id=LLM_MODEL, model_kwargs={"max_new_tokens": 500}),
         retriever=retriever,
-        memory=memory,
-        question_generator=question_generator,
-        combine_docs_chain=combine_docs_chain
+        memory=chat_history
     )
-
-    # Run the query with the current chat history and return the response
-    response = qa_chain.run({"question": query, "chat_history": chat_history})
-    return response
+    return qa_chain.run({"question": query, "chat_history": chat_history.memory if chat_history else []})
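
Note on the new construction: in this LangChain version, `ConversationalRetrievalChain` does not take an `llm` keyword when instantiated directly; it expects a `question_generator` and a `combine_docs_chain`, which is why the usual entry point is the `ConversationalRetrievalChain.from_llm` factory that builds both internally. A minimal sketch of that construction, assuming the same `LLM_MODEL` constant and retriever as in document_chat.py (the `build_qa_chain` helper name is illustrative, not part of the commit):

from langchain.chains import ConversationalRetrievalChain
from langchain.llms import HuggingFaceHub

def build_qa_chain(retriever):
    # from_llm wires up the condense-question chain and the "stuff"
    # documents chain internally, so neither needs to be passed by hand.
    llm = HuggingFaceHub(repo_id=LLM_MODEL, model_kwargs={"max_new_tokens": 500})
    return ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)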
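
The "# Use session memory" comment points at the caller. A hypothetical caller-side sketch (not part of this commit), assuming a Streamlit front end that keeps one `ConversationBufferMemory` per session in `st.session_state` and passes it in as `chat_history`:

import streamlit as st
from langchain.memory import ConversationBufferMemory
from document_chat import process_query_with_memory

# One memory object per browser session, surviving Streamlit reruns.
if "chat_memory" not in st.session_state:
    st.session_state.chat_memory = ConversationBufferMemory(
        memory_key="chat_history", return_messages=True
    )

query = st.text_input("Ask a question about the document")
if query:
    answer = process_query_with_memory(query, st.session_state.chat_memory)
    st.write(answer)

One caveat: `ConversationBufferMemory` exposes its messages via `.chat_memory` and `.buffer` rather than a `.memory` attribute, so the `chat_history.memory` access in the committed code is worth double-checking against whatever object the app actually passes in.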