AIToyBot

Sleeping

App Files Files Community

masadonline committed on May 18

Commit

f4e7b4f

verified ·

1 Parent(s): b5d8c4f

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -0

app.py CHANGED Viewed

	@@ -0,0 +1,99 @@

+import streamlit as st
+from PyPDF2 import PdfReader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.llms import HuggingFaceHub
+from langchain.chains import RetrievalQAWithSourcesChain
+import pandas as pd
+import os
+import io
+# --- 1. Data Loading and Preprocessing ---
+@st.cache_data()
+def load_and_process_pdfs_from_folder(docs_folder="docs"):
+    """Load every PDF in *docs_folder* and collect its text and tables.
+
+    Args:
+        docs_folder: Directory scanned (non-recursively) for files whose
+            name ends in ".pdf", compared case-insensitively.
+
+    Returns:
+        A ``(all_text, all_tables)`` tuple. ``all_text`` is the extracted
+        text of every page, each followed by a newline, with files visited
+        in sorted filename order. ``all_tables`` is a list of
+        ``pandas.DataFrame`` objects, one per extracted table. A missing
+        folder yields ``("", [])``.
+
+    A PDF that cannot be read is reported with ``st.error`` and skipped;
+    pages extracted before the failure are kept.
+    """
+    if not os.path.isdir(docs_folder):
+        # os.listdir() would raise on a missing folder; returning empty
+        # results lets the caller show its "no PDFs" warning instead.
+        return "", []
+    page_texts = []
+    all_tables = []
+    # Sorted so the concatenated text does not depend on directory order.
+    for filename in sorted(os.listdir(docs_folder)):
+        if not filename.lower().endswith(".pdf"):
+            continue
+        filepath = os.path.join(docs_folder, filename)
+        try:
+            with open(filepath, 'rb') as file:
+                pdf_reader = PdfReader(file)
+                for page in pdf_reader.pages:
+                    # Guard against a page that yields no text at all.
+                    page_texts.append((page.extract_text() or "") + "\n")
+                    # NOTE(review): PyPDF2 pages do not appear to provide
+                    # extract_tables() (that API belongs to pdfplumber), so
+                    # probe for it rather than raising on every page.
+                    extract_tables = getattr(page, "extract_tables", None)
+                    if extract_tables is None:
+                        continue
+                    try:
+                        for table in extract_tables():
+                            all_tables.append(pd.DataFrame(table))
+                    except Exception as e:
+                        # Table extraction is best-effort; keep the text.
+                        print(f"Could not extract tables from page in {filename}. Error: {e}")
+        except Exception as e:
+            st.error(f"Error reading PDF {filename}: {e}")
+    return "".join(page_texts), all_tables
+@st.cache_data()
+def split_text_into_chunks(text):
+    """Break *text* into overlapping chunks and return them.
+
+    Uses a RecursiveCharacterTextSplitter configured with chunk_size=1000
+    and chunk_overlap=100.
+    """
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+    return splitter.split_text(text)
+@st.cache_resource()
+def create_vectorstore(chunks):
+    """Build a FAISS vector store from text chunks.
+
+    Args:
+        chunks: Texts to embed and index.
+
+    Returns:
+        The FAISS vector store built from *chunks* with the
+        "sentence-transformers/all-mpnet-base-v2" embedding model.
+
+    Cached with ``st.cache_resource`` rather than ``st.cache_data``:
+    ``cache_data`` stores a pickled copy of the return value, which is
+    meant for plain data, whereas the store is an index object tied to an
+    embedding model and should be kept as a single shared resource.
+    """
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
+    vectorstore = FAISS.from_texts(chunks, embeddings)
+    return vectorstore
+# --- 2. Question Answering with RAG ---
+@st.cache_resource()
+def setup_llm():
+    """Create and return the HuggingFaceHub LLM for "google/flan-t5-xxl".
+
+    The model is configured with temperature 0.5 and max_length 512.
+    """
+    generation_kwargs = {"temperature": 0.5, "max_length": 512}
+    return HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs=generation_kwargs)
+def perform_rag(vectorstore, llm, query):
+    """Answer *query* with a retrieval QA-with-sources chain.
+
+    Builds a RetrievalQAWithSourcesChain from *llm* using the retriever
+    obtained from *vectorstore*, calls it with ``{"question": query}`` and
+    returns the chain's result unchanged.
+    """
+    retriever = vectorstore.as_retriever()
+    chain = RetrievalQAWithSourcesChain.from_llm(llm, retriever=retriever)
+    return chain({"question": query})
+# --- 3. Streamlit UI ---
+def main():
+    """Render the Streamlit page: load PDFs, answer questions, show tables."""
+    st.title("PDF Q&A with Local Docs")
+    st.info("Make sure you have a 'docs' folder in the same directory as this script containing your PDF files.")
+
+    # Step 1: read every PDF from the default folder.
+    with st.spinner("Loading and processing PDF(s)..."):
+        all_text, all_tables = load_and_process_pdfs_from_folder()
+    has_text = bool(all_text)
+
+    # Step 2: question answering is only offered when text was extracted.
+    if has_text:
+        with st.spinner("Creating knowledge base..."):
+            vectorstore = create_vectorstore(split_text_into_chunks(all_text))
+            llm = setup_llm()
+        query = st.text_input("Ask a question about the documents:")
+        if query:
+            with st.spinner("Searching for answer..."):
+                rag_output = perform_rag(vectorstore, llm, query)
+                st.subheader("Answer:")
+                st.write(rag_output["answer"])
+                if "sources" in rag_output:
+                    st.subheader("Source:")
+                    st.write(rag_output["sources"])
+
+    # Step 3: list extracted tables, or warn when nothing at all was found.
+    if all_tables:
+        st.subheader("Extracted Tables:")
+        for i, frame in enumerate(all_tables):
+            st.write(f"Table {i+1}:")
+            st.dataframe(frame)
+    elif not has_text:
+        st.warning("No PDF files found in the 'docs' folder.")
+# Run the app only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()