# Source: PDFQA / src/streamlit_app.py (Hugging Face Space file view)
# Last change: "Update src/streamlit_app.py" by gaur3009, commit 75d88e1 (verified), 4.25 kB
import streamlit as st
import re
import random
import PyPDF2
import numpy as np
from collections import defaultdict, deque
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# ---------------------
# Tokenization
# ---------------------
def tokenize(text):
    """Split *text* into lowercase word tokens.

    The text is lowercased first, then every maximal run of ``\\w``
    characters becomes one token; everything else acts as a separator.
    Returns a list of strings (empty when there are no word characters).
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\w+", lowered)]
# ---------------------
# PDF QA System
# ---------------------
class PDFQASystem:
    """Answer questions about a PDF by returning its most similar text chunk.

    A processed document is split into fixed-size word chunks, each chunk is
    embedded with the ``all-MiniLM-L6-v2`` sentence-transformer model, and a
    question is answered with the chunk whose embedding has the highest
    cosine similarity to the question's embedding.
    """

    def __init__(self):
        # Word chunks of the active document, in document order.
        self.text_chunks = []
        # Result of ``model.encode(text_chunks)``; None when nothing has been
        # embedded (no document yet, or the document had no extractable text).
        self.embeddings = None
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        # ``name`` of the most recently processed upload; None until then.
        self.active_document = None

    def process_pdf_stream(self, uploaded_file):
        """Extract, chunk and embed *uploaded_file*, making it the active document.

        *uploaded_file* must be readable by ``PyPDF2.PdfReader`` and expose a
        ``name`` attribute. A PDF without extractable text still becomes the
        active document, but with no chunks and no embeddings.
        """
        text = self._extract_pdf_text(uploaded_file)
        self.text_chunks = self._chunk_text(text)
        # Skip encoding when there is nothing to embed; answer_question then
        # reports the empty document instead of comparing against an empty
        # embedding matrix.
        self.embeddings = self.model.encode(self.text_chunks) if self.text_chunks else None
        self.active_document = uploaded_file.name

    def _extract_pdf_text(self, uploaded_file):
        """Return the text of every page that yields any, joined by newlines."""
        reader = PyPDF2.PdfReader(uploaded_file)
        page_texts = (page.extract_text() for page in reader.pages)
        # A separator between pages keeps the last word of one page from being
        # fused with the first word of the next before whitespace splitting.
        return "\n".join(page_text for page_text in page_texts if page_text)

    def _chunk_text(self, text, chunk_size=500):
        """Split *text* on whitespace into chunks of at most *chunk_size* words.

        Returns a list of space-joined chunks (empty for blank text).

        Raises:
            ValueError: if *chunk_size* is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")
        words = text.split()
        return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

    def answer_question(self, question):
        """Return the chunk most similar to *question*, or a notice string.

        A notice is returned when no document has been processed, or when the
        processed document produced no text chunks.
        """
        if not self.active_document:
            return "No document loaded. Please upload a PDF first."
        # ``embeddings`` may be an array, so test it against None explicitly
        # rather than by truthiness.
        if not self.text_chunks or self.embeddings is None:
            return "No readable text was found in this document."
        question_embedding = self.model.encode(question)
        similarities = cosine_similarity([question_embedding], self.embeddings)[0]
        best_match_idx = int(np.argmax(similarities))
        return self.text_chunks[best_match_idx]
# ---------------------
# Intent Classifier
# ---------------------
class IntentClassifier:
    """Keyword-counting intent detector for already-tokenized input."""

    def __init__(self):
        # Intent label -> keywords that vote for it.
        self.intents = {
            "greet": ["hello", "hi", "hey"],
            "bye": ["bye", "goodbye", "exit"],
            "qa": ["what", "when", "how", "explain", "tell", "who", "why"],
            "help": ["help", "support", "assist"],
        }

    def predict(self, tokens):
        """Return the intent whose keywords occur most often in *tokens*.

        Falls back to ``"qa"`` when no token matches any keyword. On a tie the
        intent that received its first vote earliest wins.
        """
        votes = [
            label
            for word in tokens
            for label, vocabulary in self.intents.items()
            if word in vocabulary
        ]
        if not votes:
            return "qa"
        # dict.fromkeys keeps first-vote order, and max() returns the first of
        # several equally-scored candidates, which gives the tie-break above.
        return max(dict.fromkeys(votes), key=votes.count)
# ---------------------
# AI Agent Core
# ---------------------
class DocumentAI:
    """Route a user message to a canned reply or to document question answering."""

    def __init__(self):
        self.intent_recognizer = IntentClassifier()
        self.qa_system = PDFQASystem()
        # Non-ASCII characters are written as escapes so the replies cannot be
        # garbled if this file is ever saved or served in the wrong encoding.
        self.responses = {
            "greet": [
                "\U0001F44B Hello! I'm your document assistant.",  # waving hand
                "Hi there! Ready to answer your document questions.",
            ],
            "bye": ["Goodbye!", "See you later!", "Thanks for using the assistant!"],
            # \u2019 is the right single quotation mark (curly apostrophe).
            "help": "Upload a PDF and ask questions. I\u2019ll answer from its content!",
            "no_doc": "Please upload a PDF document first.",
        }

    def handle_query(self, text):
        """Return the reply for *text*.

        The text is tokenized and classified. Greetings and farewells get a
        random canned reply, help requests get the usage hint, and questions
        are answered from the active document (or with a prompt to upload one
        when none is loaded).
        """
        tokens = tokenize(text)
        intent = self.intent_recognizer.predict(tokens)

        if intent in ("greet", "bye"):
            return random.choice(self.responses[intent])
        if intent == "help":
            return self.responses["help"]
        if intent == "qa":
            if self.qa_system.active_document:
                return self.qa_system.answer_question(text)
            return self.responses["no_doc"]
        # Fallback for any intent label without a dedicated branch above.
        return "\U0001F916 I\u2019m not sure how to respond. Try saying 'help'."  # robot face
# ---------------------
# Streamlit UI
# ---------------------
# Emoji are written as escapes so they cannot be garbled by a wrong file encoding.
st.set_page_config(page_title="Document AI Assistant", page_icon="\U0001F4C4")
st.title("\U0001F4C4 AI PDF Assistant")
st.markdown("Ask questions from uploaded PDF files!")

# Streamlit re-executes this script on every widget interaction. Keep the agent
# (and the embedding model it loads) in session state so it is built once per
# session instead of once per rerun.
if "ai" not in st.session_state:
    st.session_state["ai"] = DocumentAI()
    st.session_state["doc_key"] = None
ai = st.session_state["ai"]

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file:
    # Re-extract and re-embed only when a different file is supplied; otherwise
    # every question would reprocess the whole PDF.
    doc_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("doc_key") != doc_key:
        ai.qa_system.process_pdf_stream(uploaded_file)
        st.session_state["doc_key"] = doc_key
    st.success(f"\u2705 PDF '{uploaded_file.name}' processed successfully!")
elif st.session_state.get("doc_key") is not None:
    # The upload was removed: forget the document so questions are not answered
    # from a file the user no longer sees in the uploader.
    ai.qa_system.text_chunks = []
    ai.qa_system.embeddings = None
    ai.qa_system.active_document = None
    st.session_state["doc_key"] = None

query = st.text_input("Ask a question from the document:")
if query:
    answer = ai.handle_query(query)
    st.markdown(f"**\U0001F9E0 Answer:** {answer}")