DebabrataHalder committed on
Commit
7b850f8
·
verified ·
1 Parent(s): 9caf4f3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -0
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import logging
4
+ from dotenv import load_dotenv
5
+ import streamlit as st
6
+ from PyPDF2 import PdfReader
7
+ from langchain.text_splitter import CharacterTextSplitter
8
+ # from langchain.embeddings import HuggingFaceInstructEmbeddings
9
+ from langchain_cohere import CohereEmbeddings
10
+ from langchain.vectorstores import FAISS
11
+ from langchain.memory import ConversationBufferMemory
12
+ from langchain.chains import ConversationalRetrievalChain
13
+ # from langchain.llms import Ollama
14
+ from langchain_groq import ChatGroq
15
+
16
# Load variables from a local .env file into the process environment.
load_dotenv()

# Configure root logging once, at import time: timestamped INFO-level records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
24
+
25
# Function to extract text from PDF files
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        pdf_docs: Iterable of file-like objects accepted by PyPDF2's PdfReader
            (e.g. Streamlit UploadedFile objects).

    Returns:
        A single string containing the text of all pages, in upload order.
    """
    parts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None for pages with no extractable
            # text (e.g. scanned images); guard so concatenation never raises.
            parts.append(page.extract_text() or "")
    # "".join avoids the quadratic cost of repeated string +=.
    return "".join(parts)
33
+
34
# Function to split the extracted text into chunks
def get_text_chunks(text):
    """Split *text* into overlapping chunks suitable for embedding.

    Chunks are newline-separated, up to 1000 characters each, with a
    200-character overlap so context carries across chunk boundaries.

    Args:
        text: The full document text to split.

    Returns:
        A list of text chunks.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
44
+
45
# Function to create a FAISS vectorstore
def get_vectorstore(text_chunks):
    """Embed the text chunks with Cohere and index them in a FAISS store.

    Args:
        text_chunks: List of strings to embed.

    Returns:
        A FAISS vectorstore built from the embedded chunks.

    Raises:
        ValueError: If the COHERE_API_KEY environment variable is not set.
    """
    cohere_api_key = os.getenv("COHERE_API_KEY")
    if not cohere_api_key:
        # Fail fast with a clear message instead of an opaque auth error
        # deep inside the Cohere client later on.
        raise ValueError("COHERE_API_KEY environment variable is not set.")
    embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
56
+
57
# Function to set up the conversational retrieval chain
def get_conversation_chain(vectorstore):
    """Build a ConversationalRetrievalChain backed by a Groq-hosted LLM.

    Args:
        vectorstore: Any vectorstore exposing ``.as_retriever()``.

    Returns:
        The configured chain, or ``None`` if construction failed (the error
        is logged with its traceback and surfaced in the Streamlit UI).
    """
    try:
        llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.5)
        memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

        conversation_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(),
            memory=memory
        )

        logging.info("Conversation chain created successfully.")
        return conversation_chain
    except Exception:
        # logging.exception records the full traceback; the original
        # f-string only captured the exception text. Return None explicitly
        # so callers (handle_userinput checks for None) see the failure.
        logging.exception("Error creating conversation chain")
        st.error("An error occurred while setting up the conversation chain.")
        return None
75
+
76
# Handle user input
def handle_userinput(user_question):
    """Send the user's question to the chain and render the chat history.

    Warns instead of answering when no conversation chain has been set up
    (i.e. documents have not been processed yet).
    """
    if st.session_state.conversation is None:
        st.warning("Please process the documents first.")
        return

    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for idx, message in enumerate(st.session_state.chat_history):
        # History alternates turns: even positions are the user, odd the bot.
        speaker = "*User:*" if idx % 2 == 0 else "*Bot:*"
        st.write(f"{speaker} {message.content}")
89
+
90
# Main function to run the Streamlit app
def main():
    """Run the Streamlit UI: a question box plus a sidebar PDF uploader.

    Session state keys:
        conversation: The retrieval chain, or None until documents are processed.
        chat_history: The running list of chat messages, or None.
    """
    # Note: load_dotenv() already runs at module import; the second call the
    # original made here was redundant and has been removed.
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")

    # Initialise session state on first run.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
        )
        if st.button("Process"):
            if not pdf_docs:
                # Guard: processing zero files would index an empty corpus.
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing..."):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    vectorstore = get_vectorstore(text_chunks)
                    st.session_state.conversation = get_conversation_chain(vectorstore)

if __name__ == '__main__':
    main()