Spaces:

MathWizard1729
/

PDF_Chatbot_Gradio_ChromaDB

Sleeping

App Files Files Community

MathWizard1729 committed on May 27

Commit

6c67e85

verified ·

1 Parent(s): 9ce288c

Update app.py

Browse files

Files changed (1) hide show

app.py +232 -210

app.py CHANGED Viewed

@@ -1,217 +1,239 @@
-import os
-import logging
-import gradio as gr
-from dotenv import load_dotenv
-from langchain_aws import ChatBedrock
 from langchain_chroma import Chroma
-from langchain_community.embeddings import BedrockEmbeddings
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.runnables import RunnablePassthrough
-from langchain_core.output_parsers import StrOutputParser
-from botocore.exceptions import ClientError
-from indexer import index_uploaded_pdfs
-# Set up logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-# Global variables to store state
-vector_store = None
-indexing_status = None
-mode = "General Chat"
-chat_history = []
-def load_environment():
-    """Load environment variables from .env file or system environment."""
-    load_dotenv()
-    required_vars = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_REGION']
-    for var in required_vars:
-        if not os.getenv(var):
-            logger.error(f"Missing environment variable: {var}")
-            raise ValueError(f"Missing environment variable: {var}")
-    logger.info("Environment variables loaded successfully")
-def initialize_embeddings():
-    """Initialize Amazon Bedrock embeddings."""
-    try:
-        embeddings = BedrockEmbeddings(
-            model_id="amazon.titan-embed-text-v1",
-            region_name=os.getenv("AWS_REGION")
-        )
-        logger.info("Initialized Bedrock embeddings")
-        return embeddings
-    except ClientError as e:
-        logger.error(f"Error initializing Bedrock embeddings: {str(e)}")
-        raise
-def initialize_vector_store(db_directory="./chroma_db", collection_name="pdf_rag"):
-    """Initialize Chroma vector store."""
     try:
-        embeddings = initialize_embeddings()
-        vector_store = Chroma(
-            collection_name=collection_name,
-            embedding_function=embeddings,
-            persist_directory=db_directory
         )
-        logger.info(f"Initialized Chroma vector store from {db_directory}")
-        return vector_store
     except Exception as e:
-        logger.error(f"Error initializing Chroma vector store: {str(e)}")
-        raise
-def initialize_llm():
-    """Initialize Anthropic Claude model via Bedrock."""
-    try:
-        llm = ChatBedrock(
-            model_id="anthropic.claude-3-5-sonnet-20240620-v1:0",
-            region_name=os.getenv("AWS_REGION"),
-            model_kwargs={"max_tokens": 1000}
-        )
-        logger.info("Initialized Claude 3.5 Sonnet model")
-        return llm
-    except ClientError as e:
-        logger.error(f"Error initializing Claude model: {str(e)}")
-        raise
-def create_rag_chain(vector_store, llm):
-    """Create RAG chain with vector store and LLM."""
-    try:
-        retriever = vector_store.as_retriever(search_kwargs={"k": 3})
-        prompt_template = """You are a helpful assistant. Use the following context to answer the user's question, focusing on extracting relevant skills or information if applicable.
-        If you don't know the answer, say so, but try to provide a helpful response based on the context.
-        Context:
-        {context}
-        Question: {question}
-        Answer:
-        """
-        prompt = ChatPromptTemplate.from_template(prompt_template)
-        rag_chain = (
-            {"context": retriever | (lambda docs: "\n\n".join(doc.page_content for doc in docs)), "question": RunnablePassthrough()}
-            | prompt
-            | llm
-            | StrOutputParser()
         )
-        logger.info("Initialized RAG chain")
-        return rag_chain
-    except Exception as e:
-        logger.error(f"Error creating RAG chain: {str(e)}")
-        raise
-def create_general_chat_chain(llm):
-    """Create a general chat chain without RAG."""
-    try:
-        prompt_template = """You are a helpful assistant. Answer the user's question to the best of your knowledge.
-        Question: {question}
-        Answer:
-        """
-        prompt = ChatPromptTemplate.from_template(prompt_template)
-        chat_chain = (
-            {"question": RunnablePassthrough()}
-            | prompt
-            | llm
-            | StrOutputParser()
         )
-        logger.info("Initialized general chat chain")
-        return chat_chain
-    except Exception as e:
-        logger.error(f"Error creating general chat chain: {str(e)}")
-        raise
-def handle_pdf_upload(uploaded_files):
-    """Handle PDF uploads and index them."""
-    global vector_store, indexing_status, mode
-    if uploaded_files:
-        try:
-            vector_store, indexing_status = index_uploaded_pdfs(uploaded_files)
-            if indexing_status["pdf_count"] > 0:
-                mode = "PDF RAG"
-                return (
-                    f"Indexed {indexing_status['pdf_count']} PDFs, "
-                    f"{indexing_status['page_count']} pages, "
-                    f"{indexing_status['chunk_count']} chunks. "
-                    f"Database stored at {indexing_status['db_location']}.\n\nMode switched to: {mode}"
-                )
-            else:
-                mode = "General Chat"
-                return "No PDFs were indexed. Please upload valid PDF files.\n\nMode remains: General Chat"
-        except Exception as e:
-            logger.error(f"Error indexing PDFs: {str(e)}")
-            return f"Error indexing PDFs: {str(e)}\n\nMode remains: General Chat"
-    return "No PDFs uploaded.\n\nMode remains: General Chat"
-def chat(message, history):
-    """Handle chat interactions."""
-    global vector_store, mode, chat_history
-    try:
-        # Initialize LLM
-        llm = initialize_llm()
-        # Select appropriate chain
-        if vector_store and mode == "PDF RAG":
-            chain = create_rag_chain(vector_store, llm)
-        else:
-            chain = create_general_chat_chain(llm)
-        # Update chat history
-        chat_history = history or []
-        chat_history.append(("user", message))
-        # Get response
-        response = chain.invoke(message)
-        # Update chat history
-        chat_history.append(("assistant", response))
-        # Format history for Gradio
-        formatted_history = []
-        for role, content in chat_history:
-            if role == "user":
-                formatted_history.append((content, None))
-            else:
-                formatted_history.append((None, content))
-        return formatted_history, response
-    except Exception as e:
-        logger.error(f"Error generating response: {str(e)}")
-        return chat_history, f"Error generating response: {str(e)}"
-def main():
-    """Main function to create Gradio interface."""
-    try:
-        # Load environment
-        load_environment()
-        # Gradio interface
-        with gr.Blocks(title="Chatbot with Optional PDF Upload") as demo:
-            gr.Markdown("# Chatbot with Optional PDF Upload")
-            gr.Markdown("Chat with the bot directly or upload PDFs to enable RAG-based queries (e.g., extracting skills).")
-            # PDF uploader
-            pdf_input = gr.Files(label="Upload PDF files (optional)", file_types=[".pdf"])
-            # Indexing status display
-            indexing_output = gr.Textbox(label="Indexing Status", value=f"Current Mode: {mode}")
-            # Chat interface
-            chatbot = gr.Chatbot(label="Chat")
-            msg = gr.Textbox(label="Your Question", placeholder="Ask a question...")
-            clear = gr.Button("Clear Chat")
-            # Event handlers
-            pdf_input.upload(handle_pdf_upload, inputs=pdf_input, outputs=indexing_output)
-            msg.submit(chat, inputs=[msg, chatbot], outputs=[chatbot, msg])
-            clear.click(lambda: ([], "Chat cleared.\n\nCurrent Mode: " + mode), None, [chatbot, msg])
-        return demo
-    except Exception as e:
-        logger.error(f"Gradio interface initialization failed: {str(e)}")
-        raise
-if __name__ == "__main__":
-    demo = main()
-    demo.launch()

+import streamlit as st
+import boto3
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_aws import BedrockEmbeddings
 from langchain_chroma import Chroma
+from langchain_aws import ChatBedrock
+from langchain.prompts import ChatPromptTemplate
+from langchain.schema import StrOutputParser
+from langchain.schema.runnable import RunnablePassthrough
+import os
+from dotenv import load_dotenv # Import load_dotenv
+# --- Load Environment Variables ---
+# Runs at import time so the os.getenv() lookup below can see values from .env.
+load_dotenv() # This loads variables from .env file
+# --- Streamlit UI Setup (MUST BE THE FIRST STREAMLIT COMMAND) ---
+# Keep this call ahead of every other st.* call in the script.
+st.set_page_config(
+    page_title="Math Research Paper RAG Bot",
+    page_icon="📚",
+    layout="wide"
+)
+# Page header and a short description of what the app does and what it needs.
+st.title("📚 Math Research Paper RAG Chatbot")
+st.markdown(
+    """
+    Upload a mathematical research paper (PDF) and ask questions about its content.
+    This bot uses Amazon Bedrock (Claude 3 Sonnet for reasoning, Titan Embeddings for vectors)
+    and ChromaDB for Retrieval-Augmented Generation.
+    **Note:** This application requires AWS credentials (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`)
+    and region (`AWS_REGION`) to be set up in a `.env` file or environment variables.
+    """
+)
+# --- Configuration ---
+# AWS region comes from the environment (.env or real env var); there is no
+# hard-coded fallback, so the script run is halted when it is missing or empty.
+AWS_REGION = os.getenv("AWS_REGION")
+if not AWS_REGION:
+    st.error("AWS_REGION not found in environment variables or .env file. Please set it.")
+    st.stop()
+# Bedrock model IDs
+# Embedding model used to vectorize PDF chunks.
+EMBEDDING_MODEL_ID = "amazon.titan-embed-text-v1"
+# Chat model used to answer questions (Claude 3 Sonnet).
+# NOTE(review): an earlier comment here said Claude 4 was not generally
+# available via Bedrock - re-check model availability before changing this ID.
+LLM_MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0"
+# --- Initialize Bedrock Client (once) ---
+@st.cache_resource
+def _create_bedrock_client():
+    """Create the boto3 ``bedrock-runtime`` client, raising on failure.
+
+    Only this helper is cached. ``st.cache_resource`` stores return values,
+    not raised exceptions, so a failed attempt is retried on the next script
+    rerun instead of being replayed from the cache.
+    """
+    return boto3.client(
+        service_name="bedrock-runtime",
+        region_name=AWS_REGION
+    )
+def get_bedrock_client():
+    """Return the shared Bedrock runtime client together with a status.
+
+    Returns:
+        Tuple ``(client, success, error_message)``: ``(client, True, None)``
+        on success, ``(None, False, str(exc))`` when client creation raised.
+
+    Note:
+        Creating the client does not call AWS, so wrong credentials or
+        missing model access may only surface on the first model invocation.
+    """
     try:
+        return _create_bedrock_client(), True, None # Success: client, True, no error message
     except Exception as e:
+        # Deliberately broad: any construction failure is reported to the
+        # caller as a message rather than crashing the script run.
+        return None, False, str(e) # Failure: None, False, error message
+# Fetch the shared Bedrock client; confirm the connection or halt this run.
+bedrock_client, bedrock_success, bedrock_error_msg = get_bedrock_client()
+if bedrock_success:
+    st.success(f"Successfully connected to AWS Bedrock in {AWS_REGION}!")
+else:
+    st.error(f"Error connecting to AWS Bedrock. Please check your AWS credentials (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) and region (AWS_REGION) in your .env file or environment variables. Error: {bedrock_error_msg}")
+    # Nothing below can work without a client, so end the script run here.
+    st.stop()
+# --- LangChain Components ---
+@st.cache_resource
+def get_embeddings_model(_client):
+    """Build the ``BedrockEmbeddings`` wrapper for ``EMBEDDING_MODEL_ID``.
+
+    The leading underscore on ``_client`` tells Streamlit not to hash the
+    argument when computing the cache key.
+    """
+    embedder = BedrockEmbeddings(model_id=EMBEDDING_MODEL_ID, client=_client)
+    return embedder
+@st.cache_resource
+def get_llm_model(_client):
+    """Build the ``ChatBedrock`` chat model for ``LLM_MODEL_ID``.
+
+    The leading underscore on ``_client`` tells Streamlit not to hash the
+    argument when computing the cache key.
+    """
+    # Upper bound on the length of a generated answer.
+    generation_limits = {"max_tokens": 4000}
+    chat_model = ChatBedrock(
+        client=_client,
+        model_id=LLM_MODEL_ID,
+        streaming=False,  # Answers are fetched whole, not streamed
+        temperature=0.1,  # Low temperature for factual accuracy
+        model_kwargs=generation_limits,
+    )
+    return chat_model
+# --- PDF Processing and Vector Store Creation ---
+def create_vector_store(pdf_file_path):
+    """
+    Load a PDF, chunk it, embed the chunks, and index them in ChromaDB.
+
+    Args:
+        pdf_file_path: Filesystem path of the PDF to index.
+
+    Returns:
+        The ``Chroma`` vector store holding only this PDF's chunks.
+
+    Progress is reported through Streamlit status widgets. Exceptions from
+    loading, embedding, or indexing propagate to the caller.
+    """
+    import uuid  # Local import: only needed to name the per-upload collection
+    with st.spinner("Loading PDF and creating vector store..."):
+        # 1. Load PDF
+        # Use load(), not load_and_split(): the latter pre-splits with a
+        # default splitter, so the count below would not be pages and the
+        # text would be split twice.
+        loader = PyPDFLoader(pdf_file_path)
+        pages = loader.load()
+        st.info(f"Loaded {len(pages)} pages from the PDF.")
+        # 2. Contextual Chunking for Mathematical Papers
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1500,  # Increased chunk size for math papers
+            chunk_overlap=150, # Generous overlap to maintain context
+            separators=[
+                "\n\n",  # Prefer splitting by paragraphs
+                "\n",    # Then by newlines (might break equations but less likely than fixed char)
+                " ",     # Then by spaces
+                "",      # Fallback
+            ],
+            length_function=len,
+            is_separator_regex=False,
         )
+        chunks = text_splitter.split_documents(pages)
+        st.info(f"Split PDF into {len(chunks)} chunks.")
+        # 3. Create Embeddings and ChromaDB
+        # Pass the bedrock_client to the cached embedding model function
+        embeddings = get_embeddings_model(bedrock_client)
+        # A fresh collection per upload: with one shared collection name,
+        # every new PDF would be appended to the chunks of earlier PDFs in
+        # ./chroma_db and retrieval would mix documents.
+        vector_store = Chroma.from_documents(
+            documents=chunks,
+            embedding=embeddings,
+            collection_name=f"pdf_{uuid.uuid4().hex}",
+            persist_directory="./chroma_db" # Persist for faster reloads (optional)
         )
+        st.success("Vector store created and ready!")
+        return vector_store
+# --- RAG Chain Construction ---
+def get_rag_chain(vector_store):
+    """Construct the RAG chain using LCEL.
+
+    Args:
+        vector_store: Vector store exposing ``as_retriever``; the top 5
+            matching chunks are retrieved per question.
+
+    Returns:
+        A runnable that takes the question string and returns the model's
+        answer as a plain string.
+    """
+    def format_docs(docs):
+        """Join the retrieved chunks' text into one context string."""
+        return "\n\n".join(doc.page_content for doc in docs)
+    retriever = vector_store.as_retriever(search_kwargs={"k": 5}) # Retrieve top 5 relevant chunks
+    # Pass the bedrock_client to the cached LLM model function
+    llm = get_llm_model(bedrock_client)
+    # Prompt Template optimized for mathematical research papers
+    prompt_template = ChatPromptTemplate.from_messages(
+        [
+            ("system",
+             "You are an expert AI assistant specialized in analyzing and explaining mathematical research papers. "
+             "Your goal is to provide precise, accurate, and concise answers based *only* on the provided context from the research paper. "
+             "When answering, focus on definitions, theorems, proofs, key mathematical concepts, and experimental results. "
+             "If the user asks about a mathematical notation, try to explain its meaning from the context. "
+             "If the answer is not found in the context, explicitly state that you cannot find the information within the provided document. "
+             "Do not invent information or make assumptions outside the given text.\n\n"
+             "Context:\n{context}"),
+            ("user", "{question}"),
+        ]
+    )
+    # The retriever yields Document objects; without format_docs the prompt
+    # would receive the list's repr (metadata, escaped newlines) instead of
+    # the chunk text.
+    rag_chain = (
+        {"context": retriever | format_docs, "question": RunnablePassthrough()}
+        | prompt_template
+        | llm
+        | StrOutputParser()
+    )
+    return rag_chain
+# --- Streamlit UI Main Logic ---
+import tempfile  # Script-level import placed beside its only use (the upload handler below)
+# Per-session state. The script reruns on every interaction, so each key is
+# created only when it does not exist yet.
+# Initialize chat history
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+# Initialize vector store and RAG chain
+if "vector_store" not in st.session_state:
+    st.session_state.vector_store = None
+if "rag_chain" not in st.session_state:
+    st.session_state.rag_chain = None
+if "pdf_uploaded" not in st.session_state:
+    st.session_state.pdf_uploaded = False
+# Counter baked into the uploader's widget key. Bumping it gives the uploader
+# a new identity, which is how the "Clear" button below empties it.
+if "uploader_key" not in st.session_state:
+    st.session_state.uploader_key = 0
+# Sidebar for PDF Upload
+with st.sidebar:
+    st.header("Upload PDF")
+    uploaded_file = st.file_uploader(
+        "Choose a PDF file",
+        type="pdf",
+        accept_multiple_files=False,
+        key=f"pdf_uploader_{st.session_state.uploader_key}"
+    )
+    if uploaded_file and not st.session_state.pdf_uploaded:
+        # create_vector_store() takes a file path, so spill the upload to
+        # disk. A unique temp file avoids the clash a fixed name would cause
+        # when two sessions upload at the same time.
+        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
+            tmp.write(uploaded_file.getbuffer())
+            temp_pdf_path = tmp.name
+        try:
+            st.session_state.vector_store = create_vector_store(temp_pdf_path)
+            st.session_state.rag_chain = get_rag_chain(st.session_state.vector_store)
+            st.session_state.pdf_uploaded = True
+            st.success("PDF processed successfully! You can now ask questions.")
+        finally:
+            # Clean up the temporary file even when indexing fails
+            os.remove(temp_pdf_path)
+    elif st.session_state.pdf_uploaded:
+        st.info("PDF already processed. Ready for questions!")
+# Display chat messages from history on app rerun
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+# Accept user input
+if prompt := st.chat_input("Ask a question about the paper..."):
+    if not st.session_state.pdf_uploaded:
+        st.warning("Please upload a PDF first to start asking questions.")
+    else:
+        # Add user message to chat history
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        with st.chat_message("user"):
+            st.markdown(prompt)
+        # Get response from RAG chain
+        with st.chat_message("assistant"):
+            with st.spinner("Thinking..."):
+                try:
+                    full_response = st.session_state.rag_chain.invoke(prompt)
+                    # Rendered without raw HTML, same as the history loop
+                    # above: the answer is derived from an uploaded file and
+                    # must not inject markup into the page.
+                    st.markdown(full_response)
+                    # Add assistant response to chat history
+                    st.session_state.messages.append({"role": "assistant", "content": full_response})
+                except Exception as e:
+                    st.error(f"An error occurred during response generation: {e}")
+                    st.warning("Please try again or check your AWS Bedrock access permissions.")
+# Optional: Clear chat and uploaded PDF
+if st.session_state.pdf_uploaded:
+    if st.sidebar.button("Clear Chat and Upload New PDF"):
+        st.session_state.messages = []
+        st.session_state.vector_store = None
+        st.session_state.rag_chain = None
+        st.session_state.pdf_uploaded = False
+        # New widget key => empty uploader. Otherwise the old file is still
+        # selected and would be re-indexed on the next rerun.
+        st.session_state.uploader_key += 1
+        # The st.cache_resource caches hold only the shared Bedrock client
+        # and model wrappers, nothing PDF-specific, so they are not cleared.
+        st.rerun()