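# Streamlit PDF question-answering assistant.
# The app takes an uploaded PDF, splits its text into word chunks, embeds the chunks
# with a sentence-transformers model, and answers questions by returning the chunk
# whose embedding is most similar (cosine similarity) to the question. A small
# keyword-based intent classifier routes greetings, goodbyes, and help requests.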
import streamlit as st
import re
import random
import PyPDF2
import numpy as np
from collections import defaultdict, deque
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# ---------------------
# Tokenization
# ---------------------
def tokenize(text):
    return re.findall(r"\w+", text.lower())
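# Illustrative example of the tokenizer above:
#   tokenize("Hello, World!") -> ["hello", "world"]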
# ---------------------
# PDF QA System
# ---------------------
class PDFQASystem:
    def __init__(self):
        self.text_chunks = []
        self.embeddings = None
        # Sentence-transformer model used to embed both the chunks and the questions
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.active_document = None

    def process_pdf_stream(self, uploaded_file):
        # Extract the text, split it into chunks, and embed every chunk up front
        text = self._extract_pdf_text(uploaded_file)
        self.text_chunks = self._chunk_text(text)
        self.embeddings = self.model.encode(self.text_chunks)
        self.active_document = uploaded_file.name

    def _extract_pdf_text(self, uploaded_file):
        text = ""
        reader = PyPDF2.PdfReader(uploaded_file)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text
        return text

    def _chunk_text(self, text, chunk_size=500):
        # Non-overlapping chunks of up to chunk_size words each
        words = text.split()
        return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

    def answer_question(self, question):
        if not self.active_document:
            return "No document loaded. Please upload a PDF first."
        # Embed the question and return the chunk with the highest cosine similarity
        question_embedding = self.model.encode(question)
        similarities = cosine_similarity([question_embedding], self.embeddings)[0]
        best_match_idx = np.argmax(similarities)
        return self.text_chunks[best_match_idx]

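# Note: answer_question is purely extractive; it returns the single best-matching
# chunk verbatim rather than generating a new answer from it.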
# ---------------------
# Intent Classifier
# ---------------------
class IntentClassifier:
    def __init__(self):
        # Keyword lists that map surface words to a coarse intent label
        self.intents = {
            "greet": ["hello", "hi", "hey"],
            "bye": ["bye", "goodbye", "exit"],
            "qa": ["what", "when", "how", "explain", "tell", "who", "why"],
            "help": ["help", "support", "assist"]
        }

    def predict(self, tokens):
        # Count keyword hits per intent; fall back to "qa" when nothing matches
        scores = defaultdict(int)
        for token in tokens:
            for intent, keywords in self.intents.items():
                if token in keywords:
                    scores[intent] += 1
        return max(scores, key=scores.get) if scores else "qa"

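# Illustrative example of the classifier above:
#   IntentClassifier().predict(tokenize("hello there")) -> "greet"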
# ---------------------
# AI Agent Core
# ---------------------
class DocumentAI:
    def __init__(self):
        self.intent_recognizer = IntentClassifier()
        self.qa_system = PDFQASystem()
        # Canned responses for the non-QA intents
        self.responses = {
            "greet": ["👋 Hello! I'm your document assistant.", "Hi there! Ready to answer your document questions."],
            "bye": ["Goodbye!", "See you later!", "Thanks for using the assistant!"],
            "help": "Upload a PDF and ask questions. I'll answer from its content!",
            "no_doc": "Please upload a PDF document first."
        }

    def handle_query(self, text):
        # Classify the query's intent, then either reply with a canned response
        # or retrieve the answer from the loaded PDF
        tokens = tokenize(text)
        intent = self.intent_recognizer.predict(tokens)
        if intent == "greet":
            return random.choice(self.responses["greet"])
        elif intent == "bye":
            return random.choice(self.responses["bye"])
        elif intent == "help":
            return self.responses["help"]
        elif intent == "qa":
            if self.qa_system.active_document:
                return self.qa_system.answer_question(text)
            else:
                return self.responses["no_doc"]
        else:
            return "🤔 I'm not sure how to respond. Try saying 'help'."

# ---------------------
# Streamlit UI
# ---------------------
st.set_page_config(page_title="Document AI Assistant", page_icon="📄")
st.title("📄 AI PDF Assistant")
st.markdown("Ask questions from uploaded PDF files!")
ai = DocumentAI()
uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file:
    # Streamlit reruns the script on every interaction, so the PDF is re-extracted
    # and re-embedded on each rerun while a file is attached.
    ai.qa_system.process_pdf_stream(uploaded_file)
    st.success(f"✅ PDF '{uploaded_file.name}' processed successfully!")
query = st.text_input("Ask a question from the document:")
if query:
    answer = ai.handle_query(query)
    st.markdown(f"**🧠 Answer:** {answer}")
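# To run the app (assuming this file is saved as app.py; the filename is illustrative):
#   streamlit run app.py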