Update document_chat.py

document_chat.py  CHANGED  (+26 -20)
@@ -7,42 +7,48 @@ from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferMemory
 from langchain.llms import HuggingFaceHub

-#Constants
+# Constants
 CHROMA_DB_PATH = "chroma_db"
-SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
-…
+SENTENCE_TRANSFORMER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Corrected model name
+LLM_MODEL = "HuggingFaceH4/zephyr-7b-beta"  # Free chatbot model from Hugging Face

-#Initialize vector store
+# Initialize vector store
 def initialize_vector_store():
-    …
-    …
+    """Initialize or load ChromaDB vector store"""
+    embeddings = HuggingFaceEmbeddings(model_name=SENTENCE_TRANSFORMER_MODEL)
+    vector_store = Chroma(persist_directory=CHROMA_DB_PATH, embedding_function=embeddings)
     return vector_store
+
 vector_store = initialize_vector_store()
+
 def ingest_pdf(pdf_path):
+    """Processes a PDF, splits text, and stores embeddings in ChromaDB."""
     loader = PyMuPDFLoader(pdf_path)
     documents = loader.load()

-    #…
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size…
-    …
+    # Split text into smaller chunks
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+    split_docs = text_splitter.split_documents(documents)

-    #…
-    vector_store.add_documents(…
+    # Store in vector database
+    vector_store.add_documents(split_docs)
     vector_store.persist()

 def process_query_with_memory(query, chat_history=[]):
+    """Retrieves relevant document chunks and generates a conversational response."""
     retriever = vector_store.as_retriever()

-    #Initialize chat memory
-    memory = …
+    # Initialize chat memory
+    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

-    #Load a free…
-    llm = HuggingFaceHub(repo_id…
+    # Load a free Hugging Face model
+    llm = HuggingFaceHub(repo_id=LLM_MODEL, model_kwargs={"max_new_tokens": 500})

-    #Create a conversational retrieval chain
+    # Create a conversational retrieval chain
     qa_chain = ConversationalRetrievalChain(
-        llm…
-        retriever…
-        memory…
-        …
+        llm=llm,
+        retriever=retriever,
+        memory=memory
+    )

+    return qa_chain.run({"question": query, "chat_history": chat_history})
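The splitter settings added in ingest_pdf (chunk_size=1000, chunk_overlap=100) cap each chunk at 1000 characters and carry roughly 100 characters of overlap between consecutive chunks, so text cut at a chunk boundary still appears whole in at least one chunk. A standalone sketch of the same splitter on plain text:

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Same settings as the diff: 1000-character chunks, 100 characters of overlap.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

sample_text = "Retrieval works over small chunks, not whole files. " * 80
chunks = splitter.split_text(sample_text)

print(len(chunks))     # a handful of chunks, each at most 1000 characters
print(chunks[1][:53])  # starts with text repeated from the end of chunk 0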
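A note on the new chain construction: in the langchain releases that expose these import paths, ConversationalRetrievalChain does not accept an llm keyword when instantiated directly; the chain is normally built with the from_llm classmethod. A minimal sketch under that assumption, reusing the llm, retriever, and memory objects the diff defines:

from langchain.chains import ConversationalRetrievalChain

# from_llm builds the question-condensing step and the answer chain
# internally from a single LLM, then wires in the retriever and memory.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,              # HuggingFaceHub instance from the diff
    retriever=retriever,  # Chroma retriever from the diff
    memory=memory,        # ConversationBufferMemory from the diff
)

# With memory attached, the chain supplies chat_history itself, so only
# the question is passed in.
result = qa_chain({"question": "What does the document say about pricing?"})
print(result["answer"])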
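End to end, the updated module would be exercised roughly as follows. This is a sketch, not part of the commit: sample.pdf is a hypothetical path, and HuggingFaceHub expects a HUGGINGFACEHUB_API_TOKEN in the environment:

import os

os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "hf_...")  # placeholder token

from document_chat import ingest_pdf, process_query_with_memory

# Chunk, embed, and persist a PDF into the chroma_db directory.
ingest_pdf("sample.pdf")  # hypothetical file

# Ask a question against the ingested document; the chain retrieves
# relevant chunks and answers with the zephyr-7b-beta model.
print(process_query_with_memory("What is this document about?"))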