Spaces:

DebabrataHalder
/

chatWithMultiplePDF1

Sleeping

App Files Files Community

DebabrataHalder committed on May 1

Commit

1a40686

verified ·

1 Parent(s): f27bb1d

Update app.py

Browse files

Files changed (1) hide show

app.py +232 -103

app.py CHANGED Viewed

@@ -1,117 +1,246 @@
 import os
-import logging
 from dotenv import load_dotenv
-import streamlit as st
-from PyPDF2 import PdfReader
-from langchain.text_splitter import CharacterTextSplitter
-# from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain_cohere import CohereEmbeddings
-from langchain.vectorstores import FAISS
-from langchain.memory import ConversationBufferMemory
-from langchain.chains import ConversationalRetrievalChain
-# from langchain.llms import Ollama
-from langchain_groq import ChatGroq
 # Load environment variables
 load_dotenv()
-# Set up logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
-# Function to extract text from PDF files
-def get_pdf_text(pdf_docs):
     text = ""
-    for pdf in pdf_docs:
-        pdf_reader = PdfReader(pdf)
-        for page in pdf_reader.pages:
-            text += page.extract_text()
-    return text
-# Function to split the extracted text into chunks
-def get_text_chunks(text):
-    text_splitter = CharacterTextSplitter(
-        separator="\n",
         chunk_size=1000,
-        chunk_overlap=200,
-        length_function=len
     )
     chunks = text_splitter.split_text(text)
-    return chunks
-# Function to create a FAISS vectorstore
-# def get_vectorstore(text_chunks):
-#     embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
-#     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
-#     return vectorstore
-def get_vectorstore(text_chunks):
-    cohere_api_key = os.getenv("COHERE_API_KEY")
-    embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)
-    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
-    return vectorstore
-# Function to set up the conversational retrieval chain
-def get_conversation_chain(vectorstore):
-    try:
-        # llm = Ollama(model="llama3.2:1b")
-        llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.5)
-        memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
-        conversation_chain = ConversationalRetrievalChain.from_llm(
-            llm=llm,
-            retriever=vectorstore.as_retriever(),
-            memory=memory
-        )
-        logging.info("Conversation chain created successfully.")
-        return conversation_chain
-    except Exception as e:
-        logging.error(f"Error creating conversation chain: {e}")
-        st.error("An error occurred while setting up the conversation chain.")
-# Handle user input
-def handle_userinput(user_question):
-    if st.session_state.conversation is not None:
-        response = st.session_state.conversation({'question': user_question})
-        st.session_state.chat_history = response['chat_history']
-        for i, message in enumerate(st.session_state.chat_history):
-            if i % 2 == 0:
-                st.write(f"*User:* {message.content}")
-            else:
-                st.write(f"*Bot:* {message.content}")
-    else:
-        st.warning("Please process the documents first.")
-# Main function to run the Streamlit app
-def main():
-    load_dotenv()
-    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
-    if "conversation" not in st.session_state:
-        st.session_state.conversation = None
-    if "chat_history" not in st.session_state:
-        st.session_state.chat_history = None
-    st.header("Chat with multiple PDFs :books:")
-    user_question = st.text_input("Ask a question about your documents:")
-    if user_question:
-        handle_userinput(user_question)
-    with st.sidebar:
-        st.subheader("Your documents")
-        pdf_docs = st.file_uploader(
-            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
-        )
-        if st.button("Process"):
-            with st.spinner("Processing..."):
-                raw_text = get_pdf_text(pdf_docs)
-                text_chunks = get_text_chunks(raw_text)
-                vectorstore = get_vectorstore(text_chunks)
-                st.session_state.conversation = get_conversation_chain(vectorstore)
-if __name__ == '__main__':
-    main()

+# import os
+# import logging
+# from dotenv import load_dotenv
+# import streamlit as st
+# from PyPDF2 import PdfReader
+# from langchain.text_splitter import CharacterTextSplitter
+# # from langchain.embeddings import HuggingFaceInstructEmbeddings
+# from langchain_cohere import CohereEmbeddings
+# from langchain.vectorstores import FAISS
+# from langchain.memory import ConversationBufferMemory
+# from langchain.chains import ConversationalRetrievalChain
+# # from langchain.llms import Ollama
+# from langchain_groq import ChatGroq
+# # Load environment variables
+# load_dotenv()
+# # Set up logging
+# logging.basicConfig(
+#     level=logging.INFO,
+#     format='%(asctime)s - %(levelname)s - %(message)s'
+# )
+# # Function to extract text from PDF files
+# def get_pdf_text(pdf_docs):
+#     text = ""
+#     for pdf in pdf_docs:
+#         pdf_reader = PdfReader(pdf)
+#         for page in pdf_reader.pages:
+#             text += page.extract_text()
+#     return text
+# # Function to split the extracted text into chunks
+# def get_text_chunks(text):
+#     text_splitter = CharacterTextSplitter(
+#         separator="\n",
+#         chunk_size=1000,
+#         chunk_overlap=200,
+#         length_function=len
+#     )
+#     chunks = text_splitter.split_text(text)
+#     return chunks
+# # Function to create a FAISS vectorstore
+# # def get_vectorstore(text_chunks):
+# #     embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+# #     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+# #     return vectorstore
+# def get_vectorstore(text_chunks):
+#     cohere_api_key = os.getenv("COHERE_API_KEY")
+#     embeddings = CohereEmbeddings(model="embed-english-v3.0", cohere_api_key=cohere_api_key)
+#     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+#     return vectorstore
+# # Function to set up the conversational retrieval chain
+# def get_conversation_chain(vectorstore):
+#     try:
+#         # llm = Ollama(model="llama3.2:1b")
+#         llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.5)
+#         memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
+#         conversation_chain = ConversationalRetrievalChain.from_llm(
+#             llm=llm,
+#             retriever=vectorstore.as_retriever(),
+#             memory=memory
+#         )
+#         logging.info("Conversation chain created successfully.")
+#         return conversation_chain
+#     except Exception as e:
+#         logging.error(f"Error creating conversation chain: {e}")
+#         st.error("An error occurred while setting up the conversation chain.")
+# # Handle user input
+# def handle_userinput(user_question):
+#     if st.session_state.conversation is not None:
+#         response = st.session_state.conversation({'question': user_question})
+#         st.session_state.chat_history = response['chat_history']
+#         for i, message in enumerate(st.session_state.chat_history):
+#             if i % 2 == 0:
+#                 st.write(f"*User:* {message.content}")
+#             else:
+#                 st.write(f"*Bot:* {message.content}")
+#     else:
+#         st.warning("Please process the documents first.")
+# # Main function to run the Streamlit app
+# def main():
+#     load_dotenv()
+#     st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
+#     if "conversation" not in st.session_state:
+#         st.session_state.conversation = None
+#     if "chat_history" not in st.session_state:
+#         st.session_state.chat_history = None
+#     st.header("Chat with multiple PDFs :books:")
+#     user_question = st.text_input("Ask a question about your documents:")
+#     if user_question:
+#         handle_userinput(user_question)
+#     with st.sidebar:
+#         st.subheader("Your documents")
+#         pdf_docs = st.file_uploader(
+#             "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
+#         )
+#         if st.button("Process"):
+#             with st.spinner("Processing..."):
+#                 raw_text = get_pdf_text(pdf_docs)
+#                 text_chunks = get_text_chunks(raw_text)
+#                 vectorstore = get_vectorstore(text_chunks)
+#                 st.session_state.conversation = get_conversation_chain(vectorstore)
+# if __name__ == '__main__':
+#     main()
+import streamlit as st
 import os
 from dotenv import load_dotenv
+import PyPDF2
+import requests
+import cohere
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import FAISS
 from langchain_cohere import CohereEmbeddings
 # Load environment variables
 load_dotenv()
+# API keys are read from the environment; os.getenv returns None when a
+# variable is not set, so either constant may be None at this point.
+GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+COHERE_API_KEY = os.getenv("COHERE_API_KEY")
+# Initialize Cohere client
+# (module-level client; expand_query below calls co.generate on it)
+co = cohere.Client(COHERE_API_KEY)
+# Configure Streamlit
+st.set_page_config(page_title="RAG Chatbot with Gemini & Cohere")
+st.title("🤖 Multi-Model RAG Chatbot")
+# Initialize session state
+# "messages" holds the chat turns as {"role", "content"} dicts;
+# "vector_store" stays None until a PDF has been indexed.
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+if "vector_store" not in st.session_state:
+    st.session_state.vector_store = None
+# File upload and processing
+uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")
+# Index the PDF only while no vector store exists yet.
+# NOTE(review): because of this condition, uploading a different PDF later
+# does not rebuild the index — confirm whether that is intended.
+if uploaded_file and not st.session_state.vector_store:
+    # Process PDF
+    pdf_reader = PyPDF2.PdfReader(uploaded_file)
     text = ""
+    for page in pdf_reader.pages:
+        text += page.extract_text()
+    # Split text
+    text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000,
+        chunk_overlap=200
     )
     chunks = text_splitter.split_text(text)
+    if not chunks:
+        # A PDF without extractable text (e.g. scanned images) produces no
+        # chunks; there is nothing to embed, so tell the user instead of
+        # handing an empty list to the vector store builder.
+        st.warning("No extractable text was found in this PDF.")
+    else:
+        # Create embeddings and vector store
+        embeddings = CohereEmbeddings(
+            cohere_api_key=COHERE_API_KEY,
+            model="embed-english-v3.0",
+            user_agent="rag-chatbot-v1"
+        )
+        st.session_state.vector_store = FAISS.from_texts(
+            texts=chunks,
+            embedding=embeddings
+        )
+# Display chat messages
+# Each stored turn is rendered inside a chat bubble for its own role,
+# in the order the turns were appended to st.session_state.messages.
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+# Query expansion function
+def expand_query(query):
+    """Return the original query followed by model-generated variations.
+
+    Sends a prompt to the module-level Cohere client ``co`` asking for
+    query variations formatted as a numbered list, then parses the reply.
+
+    Args:
+        query: The user's question.
+
+    Returns:
+        A list whose first element is ``query`` itself, followed by every
+        variation found on a line shaped like ``"<number>. <text>"``.
+        Lines that do not match that shape are ignored.
+    """
+    prompt = f"""Generate 3 query variations that help answer: {query}
+    Format as numbered bullet points:"""
+    response = co.generate(
+        prompt=prompt,
+        max_tokens=100,
+        temperature=0.7
+    )
+    expanded_queries = [query]
+    for line in response.generations[0].text.split("\n"):
+        # partition() never raises: a line without ". " simply yields an
+        # empty separator. The previous `split(". ")[1]` raised IndexError
+        # on such lines (preamble, whitespace-only) and also cut off any
+        # variation that itself contained ". ".
+        marker, sep, variation = line.strip().partition(". ")
+        variation = variation.strip()
+        if sep and marker.isdigit() and variation:
+            expanded_queries.append(variation)
+    return expanded_queries
+# Gemini API call
+def generate_with_gemini(context, query):
+    """Ask Gemini to answer ``query`` using ``context`` as source material.
+
+    Builds a single-turn ``generateContent`` request whose text is the
+    instruction prompt (with ``context`` embedded) followed by the question.
+
+    Args:
+        context: Retrieved document text to ground the answer in.
+        query: The user's question.
+
+    Returns:
+        The text of the first part of the first candidate in the response.
+
+    Raises:
+        RuntimeError: If ``GEMINI_API_KEY`` is not set, or the response
+            body does not contain any answer text.
+        requests.RequestException: On network failure, timeout, or a
+            non-success HTTP status.
+    """
+    if not GEMINI_API_KEY:
+        raise RuntimeError("GEMINI_API_KEY is not set")
+    # The key travels in a header rather than the query string so that it
+    # does not appear in URLs echoed by exception messages or logs.
+    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
+    system_prompt = f"""You're an expert assistant. Use this context to answer:
+    {context}
+    Apply Chain of Abstraction and Grounding (CAG):
+    1. Identify key concepts
+    2. Create abstract relationships
+    3. Ground in specific examples
+    4. Synthesize final answer"""
+    headers = {
+        "Content-Type": "application/json",
+        "x-goog-api-key": GEMINI_API_KEY,
+    }
+    data = {
+        "contents": [{
+            "parts": [{
+                "text": f"{system_prompt}\n\nQuestion: {query}"
+            }]
+        }]
+    }
+    # Without a timeout a stalled connection would block the app forever.
+    response = requests.post(url, json=data, headers=headers, timeout=60)
+    # Surface HTTP errors as such instead of as a KeyError further down.
+    response.raise_for_status()
+    try:
+        return response.json()["candidates"][0]["content"]["parts"][0]["text"]
+    except (KeyError, IndexError, TypeError) as exc:
+        # A successful status can still carry a body without candidate text.
+        raise RuntimeError("Gemini response contained no answer text") from exc
+# Chat input
+# Handles one user turn: record and show the question, retrieve context for
+# it (and its expanded variations), generate the answer, then record and
+# show the answer.
+if prompt := st.chat_input("Ask about the document"):
+    st.session_state.messages.append({"role": "user", "content": prompt})
+    with st.chat_message("user"):
+        st.markdown(prompt)
+    if st.session_state.vector_store is None:
+        # Nothing has been indexed yet; calling similarity_search on None
+        # would raise AttributeError, so answer with guidance instead.
+        response = "Please upload a PDF document before asking questions."
+    else:
+        # Query expansion
+        expanded_queries = expand_query(prompt)
+        # Retrieve documents
+        docs = []
+        for query in expanded_queries:
+            docs.extend(st.session_state.vector_store.similarity_search(query, k=2))
+        # Several queries can return the same chunk; dict.fromkeys keeps the
+        # first occurrence of each text and preserves retrieval order, so
+        # the context is not padded with duplicates.
+        unique_chunks = dict.fromkeys(doc.page_content for doc in docs)
+        # Generate response
+        context = "\n\n".join(unique_chunks)
+        response = generate_with_gemini(context, prompt)
+    with st.chat_message("assistant"):
+        st.markdown(response)
+    st.session_state.messages.append({"role": "assistant", "content": response})