Spaces:
Sleeping
Sleeping
Update document_chat.py
Browse files — document_chat.py (+28 −11)
document_chat.py
CHANGED
@@ -6,10 +6,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
6 |
from langchain.chains import ConversationalRetrievalChain
|
7 |
from langchain.memory import ConversationBufferMemory
|
8 |
from langchain.llms import HuggingFaceHub
|
9 |
-
|
10 |
-
from langchain.chains.question_answering import load_qa_chain
|
11 |
-
from langchain.llms import HuggingFaceHub
|
12 |
-
from langchain.memory import ConversationBufferMemory
|
13 |
# Constants
|
14 |
CHROMA_DB_PATH = "chroma_db"
|
15 |
SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
@@ -27,29 +24,49 @@ def ingest_pdf(pdf_path):
|
|
27 |
loader = PyMuPDFLoader(pdf_path)
|
28 |
documents = loader.load()
|
29 |
|
30 |
-
#
|
31 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=
|
32 |
split_docs = text_splitter.split_documents(documents)
|
33 |
|
34 |
-
#
|
35 |
vector_store.add_documents(split_docs)
|
36 |
vector_store.persist()
|
37 |
|
38 |
def process_query_with_memory(query, chat_memory):
|
39 |
"""Processes user queries while maintaining conversational memory."""
|
40 |
-
retriever = vector_store.as_retriever(search_kwargs={"k": 3}) #
|
|
|
|
|
|
|
|
|
41 |
|
42 |
# Initialize LLM
|
43 |
llm = HuggingFaceHub(repo_id=LLM_MODEL, model_kwargs={"max_new_tokens": 500})
|
44 |
|
45 |
-
# Create
|
46 |
conversation_chain = ConversationalRetrievalChain.from_llm(
|
47 |
llm=llm,
|
48 |
retriever=retriever,
|
49 |
memory=chat_memory
|
50 |
)
|
51 |
|
52 |
-
#
|
53 |
-
chat_history =
|
|
|
|
|
|
|
|
|
54 |
|
55 |
return conversation_chain.run({"question": query, "chat_history": chat_history})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
from langchain.chains import ConversationalRetrievalChain
|
7 |
from langchain.memory import ConversationBufferMemory
|
8 |
from langchain.llms import HuggingFaceHub
|
9 |
+
|
|
|
|
|
|
|
10 |
# Constants
|
11 |
CHROMA_DB_PATH = "chroma_db"
|
12 |
SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
|
|
24 |
loader = PyMuPDFLoader(pdf_path)
|
25 |
documents = loader.load()
|
26 |
|
27 |
+
# Optimized text splitting: Smaller chunks, no overlap to prevent redundancy
|
28 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=0)
|
29 |
split_docs = text_splitter.split_documents(documents)
|
30 |
|
31 |
+
# Add documents to vector store and persist
|
32 |
vector_store.add_documents(split_docs)
|
33 |
vector_store.persist()
|
34 |
|
35 |
def process_query_with_memory(query, chat_memory):
    """Answer *query* against the persisted vector store, keeping conversational context.

    Args:
        query: The user's question as a string.
        chat_memory: A ConversationBufferMemory shared across calls; the chain
            reads and writes the conversation history through it.

    Returns:
        The LLM's answer string produced by the ConversationalRetrievalChain.
    """
    # NOTE(review): "score_threshold" only takes effect when the retriever is
    # created with search_type="similarity_score_threshold"; with the default
    # search type it is silently ignored — confirm which behavior is intended.
    retriever = vector_store.as_retriever(search_kwargs={"k": 3, "score_threshold": 0.5})

    # Debug: show which chunks were retrieved for this query.
    retrieved_docs = retriever.get_relevant_documents(query)
    print("Retrieved Docs:\n", [doc.page_content for doc in retrieved_docs])

    # Initialize the hosted LLM (LLM_MODEL is a module-level constant).
    llm = HuggingFaceHub(repo_id=LLM_MODEL, model_kwargs={"max_new_tokens": 500})

    # Build the retrieval chain; attaching `memory` makes the chain load and
    # persist the chat history itself on every call.
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=chat_memory,
    )

    # Debug: print the history currently held in memory to detect repetition.
    chat_history = chat_memory.load_memory_variables({}).get("chat_history", [])

    # Bug fix: the original used list(set(chat_history)), which destroys
    # message order (sets are unordered) and raises TypeError for unhashable
    # message objects (return_messages=True yields message objects).
    # Deduplicate while preserving order instead, keying on repr() so both
    # plain strings and message objects work.
    seen = set()
    deduped_history = []
    for message in chat_history:
        key = repr(message)
        if key not in seen:
            seen.add(key)
            deduped_history.append(message)
    print("Chat History:\n", deduped_history)

    # Bug fix: because the chain owns `memory`, it injects chat_history itself
    # and overwrites any value supplied in the inputs — passing it explicitly
    # was dead code at best and an input-key conflict at worst. Supply only
    # the question; the return value (the answer string) is unchanged.
    return conversation_chain.run({"question": query})
|
# Shared conversation memory. return_messages=True makes the buffer yield
# message objects (not one flat string), which is what
# ConversationalRetrievalChain expects when it formats the history.
chat_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Example usage: index a PDF, then ask a question about it.
if __name__ == "__main__":
    import sys

    # Generalization: accept the PDF path as an optional command-line
    # argument, keeping the original hard-coded file as the default so
    # existing invocations behave identically.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "CV_Data_Science.pdf"
    ingest_pdf(pdf_path)

    user_query = "What are my skills in CV?"
    response = process_query_with_memory(user_query, chat_memory)
    print("\nChatbot Response:", response)