Spaces:

gaur3009
/

PDFQA

Sleeping

App Files Files Community

PDFQA / src /streamlit_app.py

gaur3009

Update src/streamlit_app.py

03a8095 verified 4 months ago

raw

history blame

5.09 kB

	import streamlit as st
	import re
	import random
	import PyPDF2
	import numpy as np
	from collections import defaultdict
	from sklearn.metrics.pairwise import cosine_similarity

	import torch
	from transformers import AutoTokenizer, AutoModel

	# ---------------------
	# Tokenization
	# ---------------------
	def tokenize(text):
	    """Split *text* into lower-cased word tokens.

	    The input is lower-cased first; every maximal run of word characters
	    (letters, digits, underscore) then becomes one token, in order of
	    appearance. Punctuation and whitespace are dropped.

	    Args:
	        text: The string to tokenize.

	    Returns:
	        A list of lower-case token strings (empty if no word characters).
	    """
	    lowered = text.lower()
	    return [match.group(0) for match in re.finditer(r"\w+", lowered)]

	# ---------------------
	# PDF QA System
	# ---------------------
	class PDFQASystem:
	    """Answer questions by returning the PDF passage closest to the question.

	    A PDF is split into fixed-size word chunks, each chunk is embedded with
	    the ``sentence-transformers/all-MiniLM-L6-v2`` checkpoint (loaded through
	    ``transformers``), and a question is answered with the chunk whose
	    embedding has the highest cosine similarity to the question embedding.

	    Attributes:
	        text_chunks: Chunks of the currently loaded document (list of str).
	        embeddings: One L2-normalised row per chunk, or ``None`` when no
	            document with extractable text is loaded.
	        tokenizer / model: The Hugging Face tokenizer and encoder.
	        active_document: Name of the loaded file, or ``None``.
	    """

	    def __init__(self):
	        """Load the tokenizer and encoder and start with no document."""
	        model_name = "sentence-transformers/all-MiniLM-L6-v2"
	        self.text_chunks = []
	        self.embeddings = None
	        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
	        self.model = AutoModel.from_pretrained(model_name)
	        self.model.eval()  # inference only
	        self.active_document = None

	    def process_pdf_stream(self, uploaded_file):
	        """Extract, chunk and embed *uploaded_file* and make it the active document.

	        Args:
	            uploaded_file: A binary file-like object readable by
	                ``PyPDF2.PdfReader`` that also exposes a ``name`` attribute.

	        A PDF without extractable text is still registered as the active
	        document, with no chunks and ``embeddings`` set to ``None``;
	        ``answer_question`` reports that case instead of failing.
	        """
	        text = self._extract_pdf_text(uploaded_file)
	        chunks = self._chunk_text(text)
	        embeddings = self._embed(chunks) if chunks else None

	        # Commit only after every step succeeded, so a failure above never
	        # leaves chunks and embeddings that belong to different documents.
	        self.text_chunks = chunks
	        self.embeddings = embeddings
	        self.active_document = uploaded_file.name

	    def _extract_pdf_text(self, uploaded_file):
	        """Return the text of all pages, joined with newlines.

	        Pages for which ``extract_text()`` returns nothing are skipped. The
	        newline separator keeps the last word of one page from fusing with
	        the first word of the next.
	        """
	        reader = PyPDF2.PdfReader(uploaded_file)
	        page_texts = (page.extract_text() for page in reader.pages)
	        return "\n".join(page_text for page_text in page_texts if page_text)

	    def _chunk_text(self, text, chunk_size=500):
	        """Split *text* into consecutive chunks of at most *chunk_size* words.

	        Words are whitespace-separated; each chunk is re-joined with single
	        spaces. Returns an empty list for empty or whitespace-only text.

	        NOTE(review): ``_embed`` tokenizes with ``truncation=True``, so the
	        tail of a chunk longer than the tokenizer's limit presumably does not
	        contribute to its embedding - confirm against the model's maximum
	        length before relying on large chunk sizes.
	        """
	        words = text.split()
	        return [
	            " ".join(words[start:start + chunk_size])
	            for start in range(0, len(words), chunk_size)
	        ]

	    def _mean_pooling(self, model_output, attention_mask):
	        """Average the token embeddings of each sequence, ignoring padding.

	        Args:
	            model_output: Object exposing ``last_hidden_state``.
	            attention_mask: Mask tensor; positions with 0 are excluded.

	        Returns:
	            A tensor with the token dimension (dim 1) averaged out.
	        """
	        token_embeddings = model_output.last_hidden_state
	        input_mask_expanded = (
	            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
	        )
	        summed = torch.sum(token_embeddings * input_mask_expanded, dim=1)
	        # Clamp so a fully masked row cannot cause a division by zero.
	        counts = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
	        return summed / counts

	    def _embed(self, texts, batch_size=32):
	        """Embed *texts* and return L2-normalised vectors as a NumPy array.

	        Args:
	            texts: Non-empty list of strings.
	            batch_size: Number of texts encoded per forward pass; bounds
	                memory use for long documents.

	        Returns:
	            A 2-D array with one row per input text.

	        Raises:
	            ValueError: If *texts* is empty.
	        """
	        if not texts:
	            raise ValueError("texts must contain at least one string")

	        rows = []
	        for start in range(0, len(texts), batch_size):
	            batch = texts[start:start + batch_size]
	            inputs = self.tokenizer(
	                batch, padding=True, truncation=True, return_tensors="pt"
	            )
	            with torch.no_grad():
	                model_output = self.model(**inputs)
	            pooled = self._mean_pooling(model_output, inputs["attention_mask"])
	            normalized = torch.nn.functional.normalize(pooled, p=2, dim=1)
	            rows.append(normalized.cpu().numpy())
	        return np.vstack(rows)

	    def answer_question(self, question):
	        """Return the chunk of the active document most similar to *question*.

	        Returns a short explanatory message instead when no document is
	        loaded or the loaded document yielded no text.
	        """
	        if not self.active_document:
	            return "No document loaded. Please upload a PDF first."
	        if not self.text_chunks or self.embeddings is None:
	            return "No readable text was found in the uploaded PDF."

	        question_embedding = self._embed([question])[0]
	        similarities = cosine_similarity([question_embedding], self.embeddings)[0]
	        best_match_idx = int(np.argmax(similarities))
	        return self.text_chunks[best_match_idx]

	# ---------------------
	# Intent Classifier
	# ---------------------
	class IntentClassifier:
	    """Pick an intent by counting keyword hits among the tokens."""

	    def __init__(self):
	        """Set up the intent -> keyword list table."""
	        self.intents = {
	            "greet": ["hello", "hi", "hey"],
	            "bye": ["bye", "goodbye", "exit"],
	            "qa": ["what", "when", "how", "explain", "tell", "who", "why"],
	            "help": ["help", "support", "assist"]
	        }

	    def predict(self, tokens):
	        """Return the intent whose keywords occur most often in *tokens*.

	        Args:
	            tokens: Iterable of token strings (compared verbatim against the
	                keyword lists).

	        Returns:
	            The intent name with the highest hit count. On a tie the intent
	            that was matched first while scanning the tokens wins. ``"qa"``
	            is returned when no token matches any keyword.
	        """
	        tally = {}
	        # Tokens form the outer loop so intents enter the tally in the order
	        # they are first seen; max() below resolves ties by that order.
	        for word in tokens:
	            for label, vocabulary in self.intents.items():
	                if word in vocabulary:
	                    tally[label] = tally.get(label, 0) + 1

	        if not tally:
	            return "qa"
	        return max(tally, key=tally.get)

	# ---------------------
	# AI Agent Core
	# ---------------------
	class DocumentAI:
	    """Route a user message to a canned reply or to the PDF question answerer."""

	    def __init__(self):
	        """Create the intent classifier, the QA system and the reply table."""
	        self.intent_recognizer = IntentClassifier()
	        self.qa_system = PDFQASystem()
	        self.responses = {
	            "greet": ["👋 Hello! I'm your document assistant.", "Hi there! Ready to answer your document questions."],
	            "bye": ["Goodbye!", "See you later!", "Thanks for using the assistant!"],
	            "help": "Upload a PDF and ask questions. I’ll answer from its content!",
	            "no_doc": "Please upload a PDF document first."
	        }

	    def handle_query(self, text):
	        """Return the assistant's reply to *text*.

	        The text is tokenized and classified. Greetings and farewells get a
	        randomly chosen canned reply, "help" gets the help string, and "qa"
	        is forwarded to the QA system when a document is loaded (otherwise
	        the "no_doc" reply is returned). Any other intent yields a fallback
	        message.
	        """
	        intent = self.intent_recognizer.predict(tokenize(text))

	        # Both of these intents map to a list of interchangeable replies.
	        if intent in ("greet", "bye"):
	            return random.choice(self.responses[intent])
	        if intent == "help":
	            return self.responses["help"]
	        if intent != "qa":
	            return "🤖 I’m not sure how to respond. Try saying 'help'."

	        qa = self.qa_system
	        if not qa.active_document:
	            return self.responses["no_doc"]
	        return qa.answer_question(text)

	# ---------------------
	# Streamlit UI
	# ---------------------
	st.set_page_config(page_title="Document AI Assistant", page_icon="📄")
	st.title("📄 AI PDF Assistant")
	st.markdown("Ask questions from uploaded PDF files!")

	# Streamlit re-executes this script on every widget interaction. Keeping the
	# assistant in session state builds it (and loads its models) once per
	# session instead of on every rerun.
	if "ai" not in st.session_state:
	    st.session_state["ai"] = DocumentAI()
	ai = st.session_state["ai"]

	uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

	if uploaded_file:
	    # Re-extract and re-embed only when the uploaded file actually changed;
	    # otherwise every question would trigger a full re-processing.
	    file_key = (uploaded_file.name, uploaded_file.size)
	    if st.session_state.get("processed_file_key") != file_key:
	        ai.qa_system.process_pdf_stream(uploaded_file)
	        st.session_state["processed_file_key"] = file_key

	    if ai.qa_system.text_chunks:
	        st.success(f"✅ PDF '{uploaded_file.name}' processed successfully!")
	    else:
	        st.warning(f"⚠️ No readable text was found in '{uploaded_file.name}'.")
	else:
	    # The uploader is empty: forget any previously loaded document so that
	    # questions are not answered from a file the user has removed.
	    ai.qa_system.active_document = None
	    st.session_state["processed_file_key"] = None

	query = st.text_input("Ask a question from the document:")

	if query:
	    answer = ai.handle_query(query)
	    st.markdown(f"🧠 Answer: {answer}")