import random
import re
from collections import defaultdict

import numpy as np
import PyPDF2
import streamlit as st
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

# ---------------------
# Tokenization
# ---------------------
# Compiled once at import time; tokenize() runs on every user query.
_WORD_RE = re.compile(r"\w+")


def tokenize(text):
    """Return the word tokens of *text*, lowercased, in order of appearance."""
    return _WORD_RE.findall(text.lower())


# ---------------------
# PDF QA System
# ---------------------
class PDFQASystem:
    """Answer questions by returning the PDF chunk most similar to the question.

    The PDF text is split into fixed-size word chunks, each chunk is embedded
    with a MiniLM encoder (mean pooling + L2 normalisation), and a question is
    answered with the chunk whose embedding has the highest cosine similarity
    to the question embedding.
    """

    MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

    def __init__(self):
        """Load the tokenizer and encoder; start with no active document."""
        self.text_chunks = []
        self.embeddings = None
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        self.model = AutoModel.from_pretrained(self.MODEL_NAME)
        self.model.eval()
        self.active_document = None

    def process_pdf_stream(self, uploaded_file):
        """Extract, chunk and embed *uploaded_file* and make it the active document.

        *uploaded_file* must be readable by ``PyPDF2.PdfReader`` and expose a
        ``name`` attribute. State is only replaced once every step has
        succeeded, so a failure keeps the previously loaded document usable
        instead of leaving chunks and embeddings out of sync.
        """
        text = self._extract_pdf_text(uploaded_file)
        chunks = self._chunk_text(text)
        # A PDF without extractable text yields no chunks; skip the encoder
        # (it cannot handle an empty batch) and let answer_question report it.
        embeddings = self._embed(chunks) if chunks else None

        self.text_chunks = chunks
        self.embeddings = embeddings
        self.active_document = uploaded_file.name

    def _extract_pdf_text(self, uploaded_file):
        """Return the text of all pages, separated by newlines.

        Pages for which ``extract_text()`` returns nothing are skipped. The
        separator keeps the last word of one page from fusing with the first
        word of the next.
        """
        reader = PyPDF2.PdfReader(uploaded_file)
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)

    def _chunk_text(self, text, chunk_size=500):
        """Split *text* into consecutive chunks of at most *chunk_size* words."""
        words = text.split()
        return [
            " ".join(words[start:start + chunk_size])
            for start in range(0, len(words), chunk_size)
        ]

    def _mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings, counting only non-padding positions."""
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        summed = torch.sum(token_embeddings * input_mask_expanded, dim=1)
        # Clamp so an all-padding row cannot cause a division by zero.
        counts = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
        return summed / counts

    def _embed(self, texts, batch_size=32):
        """Return L2-normalised embeddings for *texts* as a 2-D NumPy array.

        Texts are encoded *batch_size* at a time so that a large document is
        not pushed through the model in a single forward pass.

        Raises:
            ValueError: if *texts* is empty.
        """
        if not texts:
            raise ValueError("texts must contain at least one string")

        batches = []
        for start in range(0, len(texts), batch_size):
            # NOTE(review): truncation=True cuts each input at the tokenizer's
            # maximum length; 500-word chunks may exceed it, in which case the
            # tail of the chunk is not represented in its embedding - confirm.
            inputs = self.tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            with torch.no_grad():
                model_output = self.model(**inputs)
            pooled = self._mean_pooling(model_output, inputs["attention_mask"])
            normalized = torch.nn.functional.normalize(pooled, p=2, dim=1)
            batches.append(normalized.cpu().numpy())
        return np.concatenate(batches, axis=0)

    def answer_question(self, question):
        """Return the stored chunk most similar to *question*.

        Returns an explanatory message instead when no document is loaded or
        the loaded document contained no extractable text.
        """
        if not self.active_document:
            return "No document loaded. Please upload a PDF first."
        if self.embeddings is None or not self.text_chunks:
            return "No readable text was found in this document."

        question_embedding = self._embed([question])[0]
        similarities = cosine_similarity([question_embedding], self.embeddings)[0]
        best_match_idx = int(np.argmax(similarities))
        return self.text_chunks[best_match_idx]


# ---------------------
# Intent Classifier
# ---------------------
class IntentClassifier:
    """Keyword-count intent classifier over pre-tokenized input."""

    def __init__(self):
        """Define the keyword list for each supported intent."""
        self.intents = {
            "greet": ["hello", "hi", "hey"],
            "bye": ["bye", "goodbye", "exit"],
            "qa": ["what", "when", "how", "explain", "tell", "who", "why"],
            "help": ["help", "support", "assist"],
        }

    def predict(self, tokens):
        """Return the intent with the most keyword hits in *tokens*.

        Falls back to ``"qa"`` when no keyword matches. On a tie, the intent
        that was matched first in *tokens* wins, because ``max`` returns the
        first maximal key and the score dict preserves insertion order.
        """
        scores = defaultdict(int)
        for token in tokens:
            for intent, keywords in self.intents.items():
                if token in keywords:
                    scores[intent] += 1
        if not scores:
            return "qa"
        return max(scores, key=scores.get)


# ---------------------
# AI Agent Core
# ---------------------
class DocumentAI:
    """Route a user query to a canned reply or to the PDF QA system."""

    def __init__(self):
        """Create the intent classifier, the QA system and the canned replies."""
        self.intent_recognizer = IntentClassifier()
        self.qa_system = PDFQASystem()
        self.responses = {
            "greet": [
                "👋 Hello! I'm your document assistant.",
                "Hi there! Ready to answer your document questions.",
            ],
            "bye": ["Goodbye!", "See you later!", "Thanks for using the assistant!"],
            "help": "Upload a PDF and ask questions. I’ll answer from its content!",
            "no_doc": "Please upload a PDF document first.",
        }

    def handle_query(self, text):
        """Return the assistant's reply to the raw user input *text*.

        Greeting and farewell replies are picked at random from their lists;
        ``"qa"`` queries are answered from the active document, or with the
        ``"no_doc"`` message when none is loaded.
        """
        tokens = tokenize(text)
        intent = self.intent_recognizer.predict(tokens)

        if intent in ("greet", "bye"):
            return random.choice(self.responses[intent])
        if intent == "help":
            return self.responses["help"]
        if intent == "qa":
            if self.qa_system.active_document:
                return self.qa_system.answer_question(text)
            return self.responses["no_doc"]
        # Only reachable if the classifier gains an intent not handled above.
        return "🤖 I’m not sure how to respond. Try saying 'help'."
# ---------------------
# Streamlit UI
# ---------------------
st.set_page_config(page_title="Document AI Assistant", page_icon="📄")
st.title("📄 AI PDF Assistant")
st.markdown("Ask questions from uploaded PDF files!")

# Streamlit re-executes this script on every widget interaction. Keeping the
# assistant in the per-session state avoids reloading the embedding model and
# losing the processed document each time the user types a question.
if "document_ai" not in st.session_state:
    st.session_state["document_ai"] = DocumentAI()
ai = st.session_state["document_ai"]

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file:
    # Identify the upload so it is parsed and embedded once, not on every rerun.
    upload_key = (uploaded_file.name, getattr(uploaded_file, "size", None))
    if st.session_state.get("processed_upload") != upload_key:
        try:
            ai.qa_system.process_pdf_stream(uploaded_file)
        except Exception as exc:  # UI boundary: report the failure, don't crash
            ai.qa_system.active_document = None
            st.session_state["processed_upload"] = None
            st.error(f"❌ Could not process '{uploaded_file.name}': {exc}")
        else:
            st.session_state["processed_upload"] = upload_key
    if st.session_state.get("processed_upload") == upload_key:
        st.success(f"✅ PDF '{uploaded_file.name}' processed successfully!")
else:
    # The upload was removed: forget the document so questions are not
    # answered from a file the user no longer sees in the uploader.
    ai.qa_system.active_document = None
    st.session_state["processed_upload"] = None

query = st.text_input("Ask a question from the document:")
if query:
    answer = ai.handle_query(query)
    st.markdown(f"**🧠 Answer:** {answer}")