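# Streamlit PDF question-answering assistant.
# The app takes an uploaded PDF, splits its text into word chunks, embeds the chunks
# with a sentence-transformers model, and answers questions by returning the chunk
# whose embedding is most similar (cosine similarity) to the question. A small
# keyword-based intent classifier routes greetings, goodbyes, and help requests.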
import streamlit as st
import re
import random
import PyPDF2
import numpy as np
from collections import defaultdict, deque
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# ---------------------
# Tokenization
# ---------------------
def tokenize(text):
    return re.findall(r"\w+", text.lower())
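# Illustrative example of the tokenizer above:
#   tokenize("Hello, World!") -> ["hello", "world"]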
# ---------------------
# PDF QA System
# ---------------------
class PDFQASystem:
    def __init__(self):
        self.text_chunks = []
        self.embeddings = None
        # Sentence-transformer model used to embed both the chunks and the questions
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.active_document = None

    def process_pdf_stream(self, uploaded_file):
        # Extract the text, split it into chunks, and embed every chunk up front
        text = self._extract_pdf_text(uploaded_file)
        self.text_chunks = self._chunk_text(text)
        self.embeddings = self.model.encode(self.text_chunks)
        self.active_document = uploaded_file.name

    def _extract_pdf_text(self, uploaded_file):
        text = ""
        reader = PyPDF2.PdfReader(uploaded_file)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text
        return text

    def _chunk_text(self, text, chunk_size=500):
        # Non-overlapping chunks of up to chunk_size words each
        words = text.split()
        return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

    def answer_question(self, question):
        if not self.active_document:
            return "No document loaded. Please upload a PDF first."
        # Embed the question and return the chunk with the highest cosine similarity
        question_embedding = self.model.encode(question)
        similarities = cosine_similarity([question_embedding], self.embeddings)[0]
        best_match_idx = np.argmax(similarities)
        return self.text_chunks[best_match_idx]

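# Note: answer_question is purely extractive; it returns the single best-matching
# chunk verbatim rather than generating a new answer from it.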
# ---------------------
# Intent Classifier
# ---------------------
class IntentClassifier:
    def __init__(self):
        # Keyword lists that map surface words to a coarse intent label
        self.intents = {
            "greet": ["hello", "hi", "hey"],
            "bye": ["bye", "goodbye", "exit"],
            "qa": ["what", "when", "how", "explain", "tell", "who", "why"],
            "help": ["help", "support", "assist"]
        }

    def predict(self, tokens):
        # Count keyword hits per intent; fall back to "qa" when nothing matches
        scores = defaultdict(int)
        for token in tokens:
            for intent, keywords in self.intents.items():
                if token in keywords:
                    scores[intent] += 1
        return max(scores, key=scores.get) if scores else "qa"

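# Illustrative example of the classifier above:
#   IntentClassifier().predict(tokenize("hello there")) -> "greet"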
# ---------------------
# AI Agent Core
# ---------------------
class DocumentAI:
    def __init__(self):
        self.intent_recognizer = IntentClassifier()
        self.qa_system = PDFQASystem()
        # Canned responses for the non-QA intents
        self.responses = {
            "greet": ["👋 Hello! I'm your document assistant.", "Hi there! Ready to answer your document questions."],
            "bye": ["Goodbye!", "See you later!", "Thanks for using the assistant!"],
            "help": "Upload a PDF and ask questions. I'll answer from its content!",
            "no_doc": "Please upload a PDF document first."
        }

    def handle_query(self, text):
        # Classify the query's intent, then either reply with a canned response
        # or retrieve the answer from the loaded PDF
        tokens = tokenize(text)
        intent = self.intent_recognizer.predict(tokens)
        if intent == "greet":
            return random.choice(self.responses["greet"])
        elif intent == "bye":
            return random.choice(self.responses["bye"])
        elif intent == "help":
            return self.responses["help"]
        elif intent == "qa":
            if self.qa_system.active_document:
                return self.qa_system.answer_question(text)
            else:
                return self.responses["no_doc"]
        else:
            return "🤔 I'm not sure how to respond. Try saying 'help'."

# ---------------------
# Streamlit UI
# ---------------------
st.set_page_config(page_title="Document AI Assistant", page_icon="📄")
st.title("📄 AI PDF Assistant")
st.markdown("Ask questions from uploaded PDF files!")
ai = DocumentAI()
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file:
    # Streamlit reruns the script on every interaction, so the PDF is re-extracted
    # and re-embedded on each rerun while a file is attached.
    ai.qa_system.process_pdf_stream(uploaded_file)
    st.success(f"✅ PDF '{uploaded_file.name}' processed successfully!")
query = st.text_input("Ask a question from the document:")
if query:
    answer = ai.handle_query(query)
    st.markdown(f"**🧠 Answer:** {answer}")
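# To run the app (assuming this file is saved as app.py; the filename is illustrative):
#   streamlit run app.py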