# Captured from a Hugging Face Space page (status at capture time: Sleeping).
import streamlit as st | |
from PyPDF2 import PdfReader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.embeddings import HuggingFaceEmbeddings | |
from langchain.vectorstores import FAISS | |
from langchain.llms import HuggingFaceHub | |
from langchain.chains import RetrievalQAWithSourcesChain | |
import pandas as pd | |
import os | |
import io | |
# --- 1. Data Loading and Preprocessing --- | |
def load_and_process_pdfs_from_folder(docs_folder="docs"):
    """Load every PDF in ``docs_folder`` and collect its text and tables.

    Args:
        docs_folder: Directory scanned (non-recursively) for files whose
            name ends in ``.pdf``, compared case-insensitively.

    Returns:
        A ``(all_text, all_tables)`` tuple. ``all_text`` is the extracted
        text of every page, each followed by a newline, joined into one
        string. ``all_tables`` is a list of ``pandas.DataFrame`` objects,
        one per table the page object was able to extract. Both are empty
        when the folder does not exist or contains no readable PDF.
    """
    # A missing folder is treated like an empty one so the caller can show
    # its "no PDFs" warning instead of crashing on FileNotFoundError.
    if not os.path.isdir(docs_folder):
        return "", []

    text_parts = []
    all_tables = []
    # Sorted so the concatenated text has a stable order across runs;
    # os.listdir() makes no ordering guarantee.
    for filename in sorted(os.listdir(docs_folder)):
        if not filename.lower().endswith(".pdf"):
            continue
        filepath = os.path.join(docs_folder, filename)
        try:
            with open(filepath, "rb") as file:
                pdf_reader = PdfReader(file)
                for page in pdf_reader.pages:
                    # Guard against an extractor that yields None for a page
                    # without a text layer; one such page must not abort the
                    # rest of the file.
                    text_parts.append((page.extract_text() or "") + "\n")

                    # NOTE(review): extract_tables() is a pdfplumber-style
                    # API; PyPDF2 page objects do not appear to provide it.
                    # Look it up defensively so pages without it are skipped
                    # quietly instead of logging an error for every page.
                    extract_tables = getattr(page, "extract_tables", None)
                    if extract_tables is None:
                        continue
                    try:
                        for table in extract_tables():
                            all_tables.append(pd.DataFrame(table))
                    except Exception as e:
                        # Table extraction is best-effort: keep the text.
                        print(f"Could not extract tables from page in {filename}. Error: {e}")
        except Exception as e:
            # One unreadable PDF must not prevent the others from loading.
            st.error(f"Error reading PDF {filename}: {e}")
    return "".join(text_parts), all_tables
def split_text_into_chunks(text):
    """Break ``text`` into overlapping pieces suitable for embedding.

    A ``RecursiveCharacterTextSplitter`` configured with a chunk size of
    1000 and an overlap of 100 does the splitting; its ``split_text``
    result is returned unchanged.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=100,
        chunk_size=1000,
    )
    return splitter.split_text(text)
def create_vectorstore(chunks):
    """Embed ``chunks`` and index them in a FAISS vector store.

    Embeddings come from ``HuggingFaceEmbeddings`` using the
    ``sentence-transformers/all-mpnet-base-v2`` model; the store built by
    ``FAISS.from_texts`` is returned.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )
    return FAISS.from_texts(chunks, embedding_model)
# --- 2. Question Answering with RAG --- | |
def setup_llm():
    """Build and return the Hugging Face Hub LLM used for answering.

    The model is ``google/flan-t5-xxl`` with a temperature of 0.5 and a
    ``max_length`` of 512 passed through ``model_kwargs``.
    """
    generation_settings = {"temperature": 0.5, "max_length": 512}
    return HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs=generation_settings,
    )
def perform_rag(vectorstore, llm, query):
    """Answer ``query`` with retrieval-augmented generation.

    Builds a ``RetrievalQAWithSourcesChain`` from ``llm`` and a retriever
    over ``vectorstore``, invokes it with ``{"question": query}``, and
    returns the chain's result mapping as-is.
    """
    retriever = vectorstore.as_retriever()
    chain = RetrievalQAWithSourcesChain.from_llm(llm, retriever=retriever)
    return chain({"question": query})
# --- 3. Streamlit UI --- | |
def main():
    """Render the Streamlit page: build the knowledge base, then answer questions.

    Streamlit re-executes the whole script on every widget interaction, so
    the loaded tables, vector store and LLM client are kept in
    ``st.session_state`` once they have been built. Without this, every
    submitted question would re-read all PDFs and re-embed every chunk.
    Nothing is cached while no text has been found, so PDFs dropped into
    the folder later are picked up on the next rerun. Once a knowledge
    base is cached it is reused until the browser session ends.
    """
    st.title("PDF Q&A with Local Docs")
    st.info("Make sure you have a 'docs' folder in the same directory as this script containing your PDF files.")

    cache_key = "pdf_qa_knowledge_base"
    if cache_key in st.session_state:
        # Reuse the work done on an earlier run of this session.
        all_tables, vectorstore, llm = st.session_state[cache_key]
        has_text = True
    else:
        with st.spinner("Loading and processing PDF(s)..."):
            all_text, all_tables = load_and_process_pdfs_from_folder()
        has_text = bool(all_text)
        vectorstore = None
        llm = None
        if has_text:
            with st.spinner("Creating knowledge base..."):
                chunks = split_text_into_chunks(all_text)
                vectorstore = create_vectorstore(chunks)
                llm = setup_llm()
            st.session_state[cache_key] = (all_tables, vectorstore, llm)

    if has_text:
        query = st.text_input("Ask a question about the documents:")
        if query:
            with st.spinner("Searching for answer..."):
                result = perform_rag(vectorstore, llm, query)
            st.subheader("Answer:")
            st.write(result["answer"])
            if "sources" in result:
                st.subheader("Source:")
                st.write(result["sources"])

    if all_tables:
        st.subheader("Extracted Tables:")
        for i, table_df in enumerate(all_tables):
            st.write(f"Table {i+1}:")
            st.dataframe(table_df)
    elif not has_text:
        # Neither text nor tables were extracted: nothing to query or show.
        st.warning("No PDF files found in the 'docs' folder.")
# Script entry point: build the Streamlit page when this file is executed
# directly rather than imported.
if __name__ == "__main__":
    main()