|
import streamlit as st |
|
import re |
|
import random |
|
import PyPDF2 |
|
import numpy as np |
|
from collections import defaultdict |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
import torch |
|
from transformers import AutoTokenizer, AutoModel |
|
|
|
|
|
|
|
|
|
def tokenize(text):
    r"""Split *text* into lowercase word tokens.

    The input is lowercased, then every maximal run of ``\w`` characters
    becomes one token. Everything else (whitespace, punctuation) acts as
    a separator and is discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens in order of appearance; empty when *text*
        contains no word characters.
    """
    return re.findall(r"\w+", text.lower())
|
|
|
|
|
|
|
|
|
class PDFQASystem:
    """Answer questions by retrieving the best-matching passage of a PDF.

    The PDF text is split into fixed-size word chunks, each chunk is
    embedded with a MiniLM sentence-embedding model, and a question is
    answered by returning the chunk whose embedding has the highest
    cosine similarity to the question's embedding.

    Attributes set by this class:
        text_chunks: list of chunk strings for the active document.
        embeddings: 2-D NumPy array with one row per chunk, or ``None``
            when there is nothing to search.
        active_document: name of the loaded document, or ``None``.
    """

    MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

    def __init__(self):
        """Load the tokenizer and model and start with no document."""
        self.text_chunks = []
        self.embeddings = None
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        self.model = AutoModel.from_pretrained(self.MODEL_NAME)
        # Inference only: put the model in evaluation mode.
        self.model.eval()
        self.active_document = None

    def process_pdf_stream(self, uploaded_file):
        """Extract, chunk and embed a PDF, making it the active document.

        Args:
            uploaded_file: A file-like object readable by
                ``PyPDF2.PdfReader`` that also exposes a ``name``
                attribute.
        """
        text = self._extract_pdf_text(uploaded_file)
        chunks = self._chunk_text(text)
        # A PDF with no extractable text yields no chunks; there is
        # nothing to embed in that case.
        embeddings = self._embed(chunks) if chunks else None

        # Commit all state together, only after every step succeeded, so
        # chunks and embeddings can never refer to different documents.
        self.text_chunks = chunks
        self.embeddings = embeddings
        self.active_document = uploaded_file.name

    def _extract_pdf_text(self, uploaded_file):
        """Return the text of every page, separated by newlines.

        Pages for which ``extract_text()`` returns nothing are skipped.
        The newline separator keeps the last word of one page from
        fusing with the first word of the next.
        """
        reader = PyPDF2.PdfReader(uploaded_file)
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)

    def _chunk_text(self, text, chunk_size=500):
        """Split *text* into chunks of at most *chunk_size* words.

        Words are whitespace-delimited and re-joined with single spaces.

        Args:
            text: The text to split.
            chunk_size: Maximum number of words per chunk; must be
                positive.

        Returns:
            A list of chunk strings; empty when *text* has no words.

        Raises:
            ValueError: If *chunk_size* is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        # NOTE(review): the tokenizer is called with truncation=True, so a
        # 500-word chunk may be longer than the model's maximum sequence
        # length and have its tail ignored when embedded - confirm the
        # limit and consider a smaller chunk_size.
        words = text.split()
        return [
            " ".join(words[start:start + chunk_size])
            for start in range(0, len(words), chunk_size)
        ]

    def _mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, weighting each token by its mask value.

        Args:
            model_output: Model output exposing ``last_hidden_state``.
            attention_mask: Tensor with one value per token position;
                positions with value 0 do not contribute.

        Returns:
            A tensor with the token dimension (dim 1) reduced away.
        """
        token_embeddings = model_output.last_hidden_state
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, dim=1)
        # Clamp the divisor so an all-zero mask cannot divide by zero.
        counts = torch.clamp(mask.sum(dim=1), min=1e-9)
        return summed / counts

    def _embed(self, texts, batch_size=32):
        """Embed *texts* into L2-normalised vectors.

        Texts are run through the model in batches so that memory use
        does not grow with the number of chunks in the document.

        Args:
            texts: Non-empty list of strings.
            batch_size: Number of texts per forward pass; must be
                positive.

        Returns:
            A 2-D NumPy array with one unit-length row per input text.

        Raises:
            ValueError: If *texts* is empty or *batch_size* is not
                positive.
        """
        if not texts:
            raise ValueError("texts must contain at least one string")
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        batches = []
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                inputs = self.tokenizer(
                    texts[start:start + batch_size],
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                )
                model_output = self.model(**inputs)
                pooled = self._mean_pooling(model_output, inputs["attention_mask"])
                normalized = torch.nn.functional.normalize(pooled, p=2, dim=1)
                batches.append(normalized.cpu().numpy())
        return np.vstack(batches)

    def answer_question(self, question):
        """Return the document chunk most similar to *question*.

        Args:
            question: The user's question.

        Returns:
            The best-matching chunk, or an explanatory message when no
            document is loaded or the loaded document has no text.
        """
        if not self.active_document:
            return "No document loaded. Please upload a PDF first."
        if not self.text_chunks:
            return "The loaded document contains no extractable text."

        question_embedding = self._embed([question])[0]
        similarities = cosine_similarity([question_embedding], self.embeddings)[0]
        best_match_idx = int(np.argmax(similarities))
        return self.text_chunks[best_match_idx]
|
|
|
|
|
|
|
|
|
class IntentClassifier:
    """Keyword-based intent detector for chat input.

    Each intent owns a short list of trigger words; an utterance is
    assigned the intent whose trigger words occur most often in it.
    """

    def __init__(self):
        """Define the trigger words for each supported intent."""
        self.intents = {
            "greet": ["hello", "hi", "hey"],
            "bye": ["bye", "goodbye", "exit"],
            "qa": ["what", "when", "how", "explain", "tell", "who", "why"],
            "help": ["help", "support", "assist"],
        }

    def predict(self, tokens):
        """Return the intent with the most keyword hits in *tokens*.

        Args:
            tokens: Iterable of word tokens; they are compared to the
                keywords exactly as given.

        Returns:
            The name of the best-scoring intent. On a tie, the intent
            that scored its first hit earliest in *tokens* wins. When no
            token matches any keyword the result is ``"qa"``.
        """
        scores = defaultdict(int)
        for token in tokens:
            for intent, keywords in self.intents.items():
                if token in keywords:
                    scores[intent] += 1

        if not scores:
            # Nothing recognised: treat the input as a document question.
            return "qa"
        # max() returns the first maximal key in insertion order, which
        # gives the tie-breaking rule described in the docstring.
        return max(scores, key=scores.get)
|
|
|
|
|
|
|
|
|
class DocumentAI:
    """Chat front end that routes user input by intent.

    Greetings, farewells and help requests get canned replies; anything
    classified as a question is forwarded to the PDF question-answering
    system.
    """

    def __init__(self):
        """Create the intent classifier, the QA system and canned replies."""
        self.intent_recognizer = IntentClassifier()
        self.qa_system = PDFQASystem()
        # NOTE(review): the emoji and apostrophes below were restored from
        # mis-encoded text. The apostrophes are unambiguous; the exact
        # original emoji could not be recovered - confirm the glyphs.
        self.responses = {
            "greet": [
                "👋 Hello! I'm your document assistant.",
                "Hi there! Ready to answer your document questions.",
            ],
            "bye": ["Goodbye!", "See you later!", "Thanks for using the assistant!"],
            "help": "Upload a PDF and ask questions. I’ll answer from its content!",
            "no_doc": "Please upload a PDF document first.",
        }

    def handle_query(self, text):
        """Return the assistant's reply to *text*.

        Args:
            text: Raw user input.

        Returns:
            A randomly chosen greeting or farewell, the help text, the
            QA system's answer (or a prompt to upload a PDF when none is
            loaded), or a fallback message for an unrecognised intent.
        """
        tokens = tokenize(text)
        intent = self.intent_recognizer.predict(tokens)

        if intent == "greet":
            return random.choice(self.responses["greet"])
        if intent == "bye":
            return random.choice(self.responses["bye"])
        if intent == "help":
            return self.responses["help"]
        if intent == "qa":
            if self.qa_system.active_document:
                # The QA system receives the original text, not the tokens.
                return self.qa_system.answer_question(text)
            return self.responses["no_doc"]

        # Defensive fallback for any intent name not handled above.
        return "🤔 I’m not sure how to respond. Try saying 'help'."
|
|
|
|
|
|
|
|
|
# NOTE(review): the emoji in this section were restored from mis-encoded
# text; the page icon and title glyph are a best guess - confirm.
st.set_page_config(page_title="Document AI Assistant", page_icon="📄")
st.title("📄 AI PDF Assistant")
st.markdown("Ask questions from uploaded PDF files!")

# Streamlit re-executes this script from the top on every interaction.
# Keeping the assistant in session state means the embedding model is
# loaded once per session instead of on every rerun.
if "ai" not in st.session_state:
    st.session_state["ai"] = DocumentAI()
ai = st.session_state["ai"]

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    # Re-extract and re-embed only when the upload actually changed, not
    # every time the user submits a question.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("processed_file") != file_key:
        ai.qa_system.process_pdf_stream(uploaded_file)
        st.session_state["processed_file"] = file_key
    st.success(f"✅ PDF '{uploaded_file.name}' processed successfully!")
elif st.session_state.pop("processed_file", None) is not None:
    # The file was removed from the uploader: forget the old document so
    # questions are not answered from a PDF that is no longer shown.
    ai.qa_system.text_chunks = []
    ai.qa_system.embeddings = None
    ai.qa_system.active_document = None

query = st.text_input("Ask a question from the document:")

if query:
    answer = ai.handle_query(query)
    st.markdown(f"**🧠 Answer:** {answer}")