DebabrataHalder committed on
Commit
49cf98b
·
verified ·
1 Parent(s): d947851

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +130 -0
app.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import time
4
+ from dotenv import load_dotenv
5
+ import streamlit as st
6
+ from PyPDF2 import PdfReader
7
+ from langchain.text_splitter import CharacterTextSplitter
8
+ from langchain_cohere import CohereEmbeddings
9
+ from langchain.vectorstores import FAISS
10
+ from langchain.memory import ConversationBufferMemory
11
+ from langchain.chains import ConversationalRetrievalChain
12
+ from langchain_groq import ChatGroq
13
+
14
# Pull configuration (e.g. COHERE_API_KEY) from a local .env file, if present
load_dotenv()

# Timestamped INFO-level logging for the whole app
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
22
+
23
+ # Function to extract text from PDF files
24
def get_pdf_text(pdf_docs):
    """Return the concatenated text of every page of every PDF in *pdf_docs*.

    Args:
        pdf_docs: iterable of file-like objects readable by PyPDF2.PdfReader
            (e.g. Streamlit UploadedFile objects).

    Returns:
        A single string with all extracted page text joined together;
        empty string when *pdf_docs* is empty.
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() returns None for pages with no extractable
            # text (e.g. scanned images); the original `+=` would raise
            # TypeError on such pages.
            text += page.extract_text() or ""
    return text
31
+
32
+ # Function to split the extracted text into chunks
33
def get_text_chunks(text):
    """Split *text* into chunks suitable for embedding.

    Chunks break on newlines, are at most 1000 characters long, and
    overlap by 200 characters so context is preserved across boundaries.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
42
+
43
+ # Function to create a FAISS vectorstore with rate-limiting and retry logic
44
def get_vectorstore(text_chunks):
    """Embed *text_chunks* with Cohere and index them in a FAISS store.

    Chunks are embedded in small batches to stay under the Cohere API
    rate limit; a rate-limited batch is retried with exponential backoff.

    Args:
        text_chunks: list of text strings to embed.

    Returns:
        The populated FAISS vectorstore, or None when *text_chunks* is empty.

    Raises:
        RuntimeError: if a batch is still rate-limited after all retries
            (the original code silently dropped such batches).
        Exception: any non-rate-limit error from the embedding API is
            propagated unchanged.
    """
    cohere_api_key = os.getenv("COHERE_API_KEY")
    embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)

    vectorstore = None
    batch_size = 10  # Process chunks in batches of 10 to limit request size
    max_retries = 5
    for i in range(0, len(text_chunks), batch_size):
        batch = text_chunks[i:i + batch_size]
        for retry_count in range(max_retries):
            try:
                if vectorstore is None:
                    vectorstore = FAISS.from_texts(texts=batch, embedding=embeddings)
                else:
                    # The store already holds its embedding function; the
                    # original passed an unsupported `embedding=` kwarg here.
                    vectorstore.add_texts(batch)
                break  # Batch indexed successfully
            except Exception as e:
                if "rate limit" in str(e).lower():
                    wait = 2 ** retry_count  # Exponential backoff
                    logging.warning(f"Rate limit exceeded. Retrying batch {i//batch_size + 1} in {wait} seconds...")
                    time.sleep(wait)
                else:
                    raise  # Propagate non-rate-limit errors unchanged
        else:
            # All retries exhausted: fail loudly instead of silently
            # dropping the batch from the index.
            raise RuntimeError(
                f"Batch {i//batch_size + 1} failed after {max_retries} retries due to rate limiting."
            )
    return vectorstore
69
+
70
+ # Function to set up the conversational retrieval chain
71
def get_conversation_chain(vectorstore):
    """Build a ConversationalRetrievalChain over *vectorstore*.

    Pairs a Groq-hosted Llama model with buffered chat memory so the
    chain can answer follow-up questions in context.

    Returns the chain on success; on failure logs the error, shows a
    Streamlit error message, and returns None.
    """
    try:
        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.5)

        chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(),
            memory=memory,
        )
        logging.info("Conversation chain created successfully.")
        return chain
    except Exception as e:
        logging.error(f"Error creating conversation chain: {e}")
        st.error("An error occurred while setting up the conversation chain.")
87
+
88
+ # Handle user input
89
def handle_userinput(user_question):
    """Send *user_question* to the active conversation chain and render the chat.

    The stored history alternates user/bot messages; even indices are
    rendered as User turns, odd indices as Bot turns. Warns instead when
    no documents have been processed yet (no chain in session state).
    """
    if st.session_state.conversation is None:
        st.warning("Please process the documents first.")
        return

    response = st.session_state.conversation({"question": user_question})
    st.session_state.chat_history = response["chat_history"]

    for idx, msg in enumerate(st.session_state.chat_history):
        if idx % 2 == 0:
            st.write(f"*User:* {msg.content}")
        else:
            st.write(f"*Bot:* {msg.content}")
101
+
102
# Main function to run the Streamlit app
def main():
    """Entry point: configure the Streamlit page and wire up the chat UI."""
    # NOTE: the module-level load_dotenv() already ran at import time;
    # the redundant second call the original made here was removed.
    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")

    # Initialise session state on first run
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
        )
        if st.button("Process"):
            if not pdf_docs:
                # Guard: processing an empty upload list yielded no chunks,
                # a None vectorstore, and a crash in chain creation.
                st.warning("Please upload at least one PDF before processing.")
            else:
                with st.spinner("Processing..."):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    vectorstore = get_vectorstore(text_chunks)
                    st.session_state.conversation = get_conversation_chain(vectorstore)


if __name__ == "__main__":
    main()