PDFQA / src /streamlit_app.py
gaur3009's picture
Update src/streamlit_app.py
03a8095 verified
raw
history blame
5.09 kB
import streamlit as st
import re
import random
import PyPDF2
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModel
# ---------------------
# Tokenization
# ---------------------
def tokenize(text):
    """Split *text* into lower-cased word tokens.

    A token is a maximal run of regex word characters (``\\w+``), so
    punctuation and whitespace act as separators and are discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased tokens in order of appearance.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\w+", lowered)]
# ---------------------
# PDF QA System
# ---------------------
class PDFQASystem:
    """Answer questions by returning the most similar passage of a PDF.

    The document is split into fixed-size word chunks, each chunk is embedded
    with a MiniLM encoder (mean-pooled, L2-normalised), and a question is
    answered by returning the chunk whose embedding is most similar to the
    question's embedding.
    """

    def __init__(self):
        """Load the tokenizer and encoder and start with no document."""
        self.text_chunks = []
        self.embeddings = None
        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.model.eval()
        self.active_document = None

    def process_pdf_stream(self, uploaded_file):
        """Extract, chunk and embed *uploaded_file*, then make it active.

        Args:
            uploaded_file: A binary file-like object accepted by
                ``PyPDF2.PdfReader`` that also exposes a ``name`` attribute.
        """
        text = self._extract_pdf_text(uploaded_file)
        chunks = self._chunk_text(text)
        # A PDF without extractable text produces no chunks; skip the model
        # call instead of feeding it an empty batch.
        embeddings = self._embed(chunks) if chunks else None
        # Commit state only after every step succeeded so that chunks and
        # embeddings can never get out of sync if embedding raises.
        self.text_chunks = chunks
        self.embeddings = embeddings
        self.active_document = uploaded_file.name

    def _extract_pdf_text(self, uploaded_file):
        """Return the text of every page, one page per line group.

        Pages are joined with a newline so the last word of one page does not
        fuse with the first word of the next. Pages for which no text is
        extracted are skipped.
        """
        reader = PyPDF2.PdfReader(uploaded_file)
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)

    def _chunk_text(self, text, chunk_size=500):
        """Split *text* into consecutive chunks of at most *chunk_size* words.

        Args:
            text: The text to split; words are separated by any whitespace.
            chunk_size: Maximum number of words per chunk; must be positive.

        Returns:
            A list of chunks, each a single-space-joined string. Empty or
            whitespace-only text yields an empty list.

        Raises:
            ValueError: If *chunk_size* is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        words = text.split()
        return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

    def _mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, counting only positions where the mask is set.

        Args:
            model_output: Encoder output exposing ``last_hidden_state``.
            attention_mask: Mask with one entry per token position; positions
                with value 0 are excluded from the average.

        Returns:
            A tensor holding one pooled vector per input sequence.
        """
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # The clamp guards against division by zero for an all-zero mask.
        return torch.sum(token_embeddings * input_mask_expanded, dim=1) / torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)

    def _embed(self, texts, batch_size=32):
        """Return L2-normalised sentence embeddings for *texts*.

        Args:
            texts: Iterable of strings to embed.
            batch_size: Number of texts sent through the model per forward
                pass; bounds peak memory for large documents.

        Returns:
            A NumPy array with one row per text. An empty input yields an
            array with zero rows.

        Raises:
            ValueError: If *batch_size* is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        texts = list(texts)
        if not texts:
            return np.zeros((0, self.model.config.hidden_size), dtype=np.float32)

        batches = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            # NOTE: truncation=True drops tokens beyond the tokenizer's
            # maximum length, so the tail of a very long chunk does not
            # contribute to its embedding.
            inputs = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
            with torch.no_grad():
                model_output = self.model(**inputs)
            pooled = self._mean_pooling(model_output, inputs['attention_mask'])
            batches.append(torch.nn.functional.normalize(pooled, p=2, dim=1).cpu().numpy())
        return np.concatenate(batches, axis=0)

    def answer_question(self, question):
        """Return the document chunk most similar to *question*.

        Args:
            question: The user's question as plain text.

        Returns:
            The best-matching chunk, or an explanatory message when no
            document is loaded or the loaded document contains no text.
        """
        if not self.active_document:
            return "No document loaded. Please upload a PDF first."
        if not self.text_chunks or self.embeddings is None or len(self.embeddings) == 0:
            return "No readable text was found in the uploaded PDF."
        question_embedding = self._embed([question])[0]
        similarities = cosine_similarity([question_embedding], self.embeddings)[0]
        best_match_idx = int(np.argmax(similarities))
        return self.text_chunks[best_match_idx]
# ---------------------
# Intent Classifier
# ---------------------
class IntentClassifier:
    """Keyword-count intent classifier.

    Each intent owns a list of trigger words; the intent whose trigger words
    occur most often among the tokens is selected.
    """

    def __init__(self):
        """Set up the intent-to-keywords table."""
        self.intents = {
            "greet": ["hello", "hi", "hey"],
            "bye": ["bye", "goodbye", "exit"],
            "qa": ["what", "when", "how", "explain", "tell", "who", "why"],
            "help": ["help", "support", "assist"]
        }

    def predict(self, tokens):
        """Return the intent with the most keyword hits in *tokens*.

        Args:
            tokens: Iterable of lower-cased word tokens.

        Returns:
            The name of the highest-scoring intent. When several intents tie,
            the one that scored its first hit earliest wins. When no token
            matches any keyword, ``"qa"`` is returned.
        """
        tally = {}
        for word in tokens:
            for name, vocabulary in self.intents.items():
                if word not in vocabulary:
                    continue
                # Keys are inserted on first hit, which fixes the tie order.
                tally[name] = tally.get(name, 0) + 1
        if not tally:
            return "qa"
        return max(tally, key=tally.get)
# ---------------------
# AI Agent Core
# ---------------------
class DocumentAI:
    """Route a user message to a canned reply or to PDF question answering."""

    def __init__(self):
        """Create the intent classifier, the QA backend and the reply table."""
        self.intent_recognizer = IntentClassifier()
        self.qa_system = PDFQASystem()
        # The emoji and apostrophes below were previously stored as mojibake
        # (UTF-8 bytes decoded with a legacy code page); they are restored to
        # the intended characters.
        self.responses = {
            "greet": [
                "👋 Hello! I'm your document assistant.",
                "Hi there! Ready to answer your document questions.",
            ],
            "bye": ["Goodbye!", "See you later!", "Thanks for using the assistant!"],
            "help": "Upload a PDF and ask questions. I’ll answer from its content!",
            "no_doc": "Please upload a PDF document first.",
        }

    def handle_query(self, text):
        """Return the assistant's reply to *text*.

        Args:
            text: The raw user message.

        Returns:
            A greeting or farewell chosen at random, the help text, the
            best-matching document passage for a question, or a prompt to
            upload a PDF when a question arrives before any document is
            loaded.
        """
        tokens = tokenize(text)
        intent = self.intent_recognizer.predict(tokens)

        if intent in ("greet", "bye"):
            return random.choice(self.responses[intent])
        if intent == "help":
            return self.responses["help"]
        if intent == "qa":
            if self.qa_system.active_document:
                return self.qa_system.answer_question(text)
            return self.responses["no_doc"]
        # Defensive fallback for an intent name this router does not know.
        return "🤖 I’m not sure how to respond. Try saying 'help'."
# ---------------------
# Streamlit UI
# ---------------------
st.set_page_config(page_title="Document AI Assistant", page_icon="📄")
st.title("📄 AI PDF Assistant")
st.markdown("Ask questions from uploaded PDF files!")

# Streamlit re-executes this script on every interaction. Keeping the
# assistant in per-session state avoids reloading the model on each rerun and
# keeps one user's document separate from another's.
if "document_ai" not in st.session_state:
    st.session_state["document_ai"] = DocumentAI()
ai = st.session_state["document_ai"]


def _clear_document(assistant):
    """Forget the currently loaded document, if any."""
    assistant.qa_system.active_document = None
    assistant.qa_system.text_chunks = []
    assistant.qa_system.embeddings = None


uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file:
    # Identify the upload so the same file is not re-embedded on every rerun.
    file_key = (uploaded_file.name, getattr(uploaded_file, "size", None))
    if st.session_state.get("pdf_key") != file_key:
        st.session_state["pdf_key"] = file_key
        try:
            ai.qa_system.process_pdf_stream(uploaded_file)
        except Exception as exc:  # top-level UI boundary: report, don't crash
            _clear_document(ai)
            st.session_state["pdf_error"] = str(exc)
        else:
            st.session_state["pdf_error"] = None

    if st.session_state.get("pdf_error"):
        st.error(f"Could not process PDF '{uploaded_file.name}': {st.session_state['pdf_error']}")
    else:
        st.success(f"✅ PDF '{uploaded_file.name}' processed successfully!")
else:
    # The upload was removed (or never made): drop any previously loaded
    # document so answers are never served from a file no longer shown.
    if st.session_state.pop("pdf_key", None) is not None:
        _clear_document(ai)
    st.session_state["pdf_error"] = None

query = st.text_input("Ask a question from the document:")
if query:
    answer = ai.handle_query(query)
    st.markdown(f"**🧠 Answer:** {answer}")