Anirudh1993 committed on
Commit
4b5f52f
·
verified ·
1 Parent(s): 6e4cda5

Update document_chat.py

Browse files
Files changed (1) hide show
  1. document_chat.py +28 -11
document_chat.py CHANGED
@@ -6,10 +6,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from langchain.chains import ConversationalRetrievalChain
7
  from langchain.memory import ConversationBufferMemory
8
  from langchain.llms import HuggingFaceHub
9
- from langchain.chains import ConversationalRetrievalChain
10
- from langchain.chains.question_answering import load_qa_chain
11
- from langchain.llms import HuggingFaceHub
12
- from langchain.memory import ConversationBufferMemory
13
  # Constants
14
  CHROMA_DB_PATH = "chroma_db"
15
  SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
@@ -27,29 +24,49 @@ def ingest_pdf(pdf_path):
27
  loader = PyMuPDFLoader(pdf_path)
28
  documents = loader.load()
29
 
30
- # Split text into smaller chunks
31
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
32
  split_docs = text_splitter.split_documents(documents)
33
 
34
- # Re-initialize vector store to ensure persistence
35
  vector_store.add_documents(split_docs)
36
  vector_store.persist()
37
 
38
  def process_query_with_memory(query, chat_memory):
39
  """Processes user queries while maintaining conversational memory."""
40
- retriever = vector_store.as_retriever(search_kwargs={"k": 3}) # Limit retrieved chunks
 
 
 
 
41
 
42
  # Initialize LLM
43
  llm = HuggingFaceHub(repo_id=LLM_MODEL, model_kwargs={"max_new_tokens": 500})
44
 
45
- # Create Conversational Retrieval Chain correctly
46
  conversation_chain = ConversationalRetrievalChain.from_llm(
47
  llm=llm,
48
  retriever=retriever,
49
  memory=chat_memory
50
  )
51
 
52
- # Fix: Properly filter chat history to avoid repetition
53
- chat_history = list(set(chat_memory.load_memory_variables({}).get("chat_history", [])))
 
 
 
 
54
 
55
  return conversation_chain.run({"question": query, "chat_history": chat_history})
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  from langchain.chains import ConversationalRetrievalChain
7
  from langchain.memory import ConversationBufferMemory
8
  from langchain.llms import HuggingFaceHub
9
+
 
 
 
10
  # Constants
11
  CHROMA_DB_PATH = "chroma_db"
12
  SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 
24
  loader = PyMuPDFLoader(pdf_path)
25
  documents = loader.load()
26
 
27
+ # Optimized text splitting: Smaller chunks, no overlap to prevent redundancy
28
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=0)
29
  split_docs = text_splitter.split_documents(documents)
30
 
31
+ # Add documents to vector store and persist
32
  vector_store.add_documents(split_docs)
33
  vector_store.persist()
34
 
35
def process_query_with_memory(query, chat_memory):
    """Answer a user query against the vector store while maintaining memory.

    Args:
        query: The user's question as a plain string.
        chat_memory: A ConversationBufferMemory holding prior turns under the
            "chat_history" key (return_messages=True, so items are message
            objects).

    Returns:
        The answer string produced by the conversational retrieval chain.
    """
    # `score_threshold` only takes effect when the matching search_type is
    # set; without it the kwarg is silently ignored by the retriever.
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": 3, "score_threshold": 0.5},
    )

    # Debug: show which chunks were retrieved for this query.
    retrieved_docs = retriever.get_relevant_documents(query)
    print("Retrieved Docs:\n", [doc.page_content for doc in retrieved_docs])

    # Initialize the LLM backing the chain.
    llm = HuggingFaceHub(repo_id=LLM_MODEL, model_kwargs={"max_new_tokens": 500})

    # The chain reads and writes history through `memory` automatically.
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=chat_memory,
    )

    # Debug: show the accumulated history to detect repetition.
    chat_history = chat_memory.load_memory_variables({}).get("chat_history", [])
    print("Chat History:\n", chat_history)

    # NOTE: do NOT dedupe with list(set(...)): message objects are unhashable
    # (TypeError at runtime) and a set would destroy turn order anyway. With
    # `memory` attached, the chain supplies chat_history itself, so passing it
    # explicitly is redundant — send only the question.
    return conversation_chain.run({"question": query})
61
+
62
# Conversation memory shared across queries in this session; the chain stores
# each turn under the "chat_history" key as message objects.
chat_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Example usage: ingest a PDF, then ask one question about its contents.
if __name__ == "__main__":
    ingest_pdf("CV_Data_Science.pdf")

    response = process_query_with_memory("What are my skills in CV?", chat_memory)
    print("\nChatbot Response:", response)