Anirudh1993 committed on
Commit
4f0811d
·
verified ·
1 Parent(s): ac6d4b3

Update document_chat.py

Browse files
Files changed (1) hide show
  1. document_chat.py +48 -48
document_chat.py CHANGED
@@ -1,48 +1,48 @@
1
- import os
2
- from langchain.vectorstores import Chroma
3
- from langchain.embeddings import HuggingFaceEmbeddings
4
- from langchain.document_loaders import PyMUPDFLoader
5
- from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from langchain.chains import ConversationalRetrievalChain
7
- from langchain.memory import ConversationalBufferMemory
8
- from langchain.llms import HuggingFaceHub
9
-
10
- #Constants
11
- CHROMA_DB_PATH = "chroma_db"
12
- SENTENCE_TRANSFORMER_MODEL = "sentence-ransformers/all-MiniLM-L6=v2"
13
- LLM_Model = "HuggingFaceH4/zephyr-7b-beta"
14
-
15
- #Initialize vector store
16
- def initialize_vector_store():
17
- embeddings = HuggingFaceEmbeddings(model_name = SENTENCE_TRANSFORMER_MODEL)
18
- vector_store = Chroma(persist_directory = CHROMA_DB_PATH, embedding_fnction = embeddings)
19
- return vector_store
20
- vector_store = initialize_vector_store()
21
- def ingest_pdf(pdf_path):
22
- loader = PyMUPDFLoader(pdf_path)
23
- documents = loader.load()
24
-
25
- #split text into smaller chunks
26
- text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)
27
- splitdocs = text_splitter.split_documents(documents)
28
-
29
- #store in vector db
30
- vector_store.add_documents(splitdocs)
31
- vector_store.persist()
32
-
33
- def process_query_with_memory(query, chat_history=[]):
34
- retriever = vector_store.as_retriever()
35
-
36
- #Initialize chat memory
37
- memory = ConversationalBufferMemory(memory_key = "chat_history", return_messages = True)
38
-
39
- #Load a free hugging face model
40
- llm = HuggingFaceHub(repo_id = LLM_Model, model_kwargs = {"max_new_tokens": 500})
41
-
42
- #Create a conversational retrieval chain
43
- qa_chain = ConversationalRetrievalChain(
44
- llm = llm,
45
- retriever = retriever,
46
- memory = memory)
47
- return qa_chain.run({"question":query, "chat_history": chat_history})
48
-
 
1
+ import os
2
+ from langchain.vectorstores import Chroma
3
+ from langchain.embeddings import HuggingFaceEmbeddings
4
+ from langchain.document_loaders import PyMuPDFLoader
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from langchain.chains import ConversationalRetrievalChain
7
+ from langchain.memory import ConversationalBufferMemory
8
+ from langchain.llms import HuggingFaceHub
9
+
10
+ #Constants
11
+ CHROMA_DB_PATH = "chroma_db"
12
+ SENTENCE_TRANSFORMER_MODEL = "sentence-ransformers/all-MiniLM-L6=v2"
13
+ LLM_Model = "HuggingFaceH4/zephyr-7b-beta"
14
+
15
+ #Initialize vector store
16
+ def initialize_vector_store():
17
+ embeddings = HuggingFaceEmbeddings(model_name = SENTENCE_TRANSFORMER_MODEL)
18
+ vector_store = Chroma(persist_directory = CHROMA_DB_PATH, embedding_fnction = embeddings)
19
+ return vector_store
20
+ vector_store = initialize_vector_store()
21
+ def ingest_pdf(pdf_path):
22
+ loader = PyMUPDFLoader(pdf_path)
23
+ documents = loader.load()
24
+
25
+ #split text into smaller chunks
26
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)
27
+ splitdocs = text_splitter.split_documents(documents)
28
+
29
+ #store in vector db
30
+ vector_store.add_documents(splitdocs)
31
+ vector_store.persist()
32
+
33
+ def process_query_with_memory(query, chat_history=[]):
34
+ retriever = vector_store.as_retriever()
35
+
36
+ #Initialize chat memory
37
+ memory = ConversationalBufferMemory(memory_key = "chat_history", return_messages = True)
38
+
39
+ #Load a free hugging face model
40
+ llm = HuggingFaceHub(repo_id = LLM_Model, model_kwargs = {"max_new_tokens": 500})
41
+
42
+ #Create a conversational retrieval chain
43
+ qa_chain = ConversationalRetrievalChain(
44
+ llm = llm,
45
+ retriever = retriever,
46
+ memory = memory)
47
+ return qa_chain.run({"question":query, "chat_history": chat_history})
48
+