# Hugging Face Space: gaur3009/PDFQA (status: Sleeping)
# File size: 5,092 Bytes

import streamlit as st
import re
import random
import PyPDF2
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

import torch
from transformers import AutoTokenizer, AutoModel

# ---------------------
# Tokenization
# ---------------------
def tokenize(text):
    """Return the lower-cased word tokens of *text*, in order of appearance.

    A token is a maximal run of ``\\w`` characters; everything else acts as
    a separator and is discarded.
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer(r"\w+", lowered)]

# ---------------------
# PDF QA System
# ---------------------
class PDFQASystem:
    """Answer questions by returning the most similar chunk of a PDF.

    The text of an uploaded PDF is split into fixed-size word chunks, each
    chunk is embedded with a transformer model (mean pooling followed by L2
    normalisation), and a question is answered with the chunk whose
    embedding has the highest cosine similarity to the question embedding.
    """

    MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

    def __init__(self):
        # Chunks of the active document and their embeddings; row i of
        # ``embeddings`` corresponds to ``text_chunks[i]``.
        self.text_chunks = []
        self.embeddings = None
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        self.model = AutoModel.from_pretrained(self.MODEL_NAME)
        self.model.eval()
        # Name of the currently indexed file, or None when nothing is loaded.
        self.active_document = None

    def process_pdf_stream(self, uploaded_file):
        """Extract, chunk and embed *uploaded_file*, making it the active document.

        Args:
            uploaded_file: A binary file-like object accepted by
                ``PyPDF2.PdfReader`` that also exposes a ``name`` attribute.

        Raises:
            ValueError: If no text could be extracted from the PDF.
        """
        text = self._extract_pdf_text(uploaded_file)
        chunks = self._chunk_text(text)
        if not chunks:
            raise ValueError("No extractable text was found in the PDF.")
        embeddings = self._embed(chunks)

        # Commit all three fields together only after every step succeeded,
        # so a failure above never leaves chunks and embeddings out of sync.
        self.text_chunks = chunks
        self.embeddings = embeddings
        self.active_document = uploaded_file.name

    def _extract_pdf_text(self, uploaded_file):
        """Return the text of all pages, separated by newlines.

        Pages for which ``extract_text()`` returns nothing are skipped.
        """
        reader = PyPDF2.PdfReader(uploaded_file)
        page_texts = (page.extract_text() for page in reader.pages)
        # Join with a separator: without it the last word of one page and
        # the first word of the next would be fused into a single token.
        return "\n".join(page_text for page_text in page_texts if page_text)

    def _chunk_text(self, text, chunk_size=500):
        """Split *text* into consecutive chunks of at most *chunk_size* words.

        Words are whitespace-separated; each chunk is re-joined with single
        spaces. Empty or whitespace-only text yields an empty list.

        Raises:
            ValueError: If *chunk_size* is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")
        # NOTE(review): _embed tokenizes with truncation=True, so words past
        # the tokenizer's maximum length presumably do not influence a
        # chunk's embedding; confirm 500 words fits the model's limit.
        words = text.split()
        return [
            " ".join(words[start:start + chunk_size])
            for start in range(0, len(words), chunk_size)
        ]

    def _mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings, counting only unmasked positions."""
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        summed = torch.sum(token_embeddings * input_mask_expanded, dim=1)
        # Clamp so a fully masked row cannot cause a division by zero.
        counts = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
        return summed / counts

    def _embed(self, texts, batch_size=32):
        """Return L2-normalised embeddings for *texts* as a NumPy array.

        Args:
            texts: Non-empty sequence of strings to embed.
            batch_size: Number of texts sent through the model at once, so
                peak memory does not grow with the size of the document.

        Returns:
            A 2-D array with one row per input text, in input order.

        Raises:
            ValueError: If *texts* is empty or *batch_size* is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        texts = list(texts)
        if not texts:
            raise ValueError("texts must contain at least one string")

        batches = []
        for start in range(0, len(texts), batch_size):
            inputs = self.tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            with torch.no_grad():
                model_output = self.model(**inputs)
                pooled = self._mean_pooling(model_output, inputs["attention_mask"])
                batches.append(torch.nn.functional.normalize(pooled, p=2, dim=1))
        return torch.cat(batches, dim=0).cpu().numpy()

    def answer_question(self, question):
        """Return the chunk of the active document most similar to *question*.

        Returns a fixed "no document" message when nothing has been indexed.
        """
        if (
            not self.active_document
            or self.embeddings is None
            or not self.text_chunks
        ):
            return "No document loaded. Please upload a PDF first."

        question_embedding = self._embed([question])[0]
        similarities = cosine_similarity([question_embedding], self.embeddings)[0]
        best_match_idx = int(np.argmax(similarities))
        return self.text_chunks[best_match_idx]

# ---------------------
# Intent Classifier
# ---------------------
class IntentClassifier:
    """Keyword-count intent classifier over pre-tokenized input."""

    def __init__(self):
        # Maps each intent name to the tokens that vote for it.
        self.intents = {
            "greet": ["hello", "hi", "hey"],
            "bye": ["bye", "goodbye", "exit"],
            "qa": ["what", "when", "how", "explain", "tell", "who", "why"],
            "help": ["help", "support", "assist"]
        }

    def predict(self, tokens):
        """Return the intent with the most keyword hits among *tokens*.

        Falls back to "qa" when no token matches any keyword. On a tie the
        intent that received its first hit earliest wins.
        """
        tally = {}
        for word in tokens:
            for name, vocabulary in self.intents.items():
                if word in vocabulary:
                    tally[name] = tally.get(name, 0) + 1

        if not tally:
            return "qa"
        # max() keeps the first maximal key, i.e. the earliest-inserted intent.
        return max(tally, key=tally.get)

# ---------------------
# AI Agent Core
# ---------------------
class DocumentAI:
    """Route a user message to a canned reply or to the PDF QA system."""

    def __init__(self):
        self.intent_recognizer = IntentClassifier()
        self.qa_system = PDFQASystem()
        # Canned replies; "greet" and "bye" hold several variants from which
        # one is picked at random, the others are single fixed strings.
        self.responses = {
            "greet": [
                "👋 Hello! I'm your document assistant.",
                "Hi there! Ready to answer your document questions.",
            ],
            "bye": [
                "Goodbye!",
                "See you later!",
                "Thanks for using the assistant!",
            ],
            "help": "Upload a PDF and ask questions. I’ll answer from its content!",
            "no_doc": "Please upload a PDF document first.",
        }

    def handle_query(self, text):
        """Return the assistant's reply to the raw user message *text*."""
        intent = self.intent_recognizer.predict(tokenize(text))

        if intent in ("greet", "bye"):
            return random.choice(self.responses[intent])
        if intent == "help":
            return self.responses["help"]
        if intent != "qa":
            return "🤖 I’m not sure how to respond. Try saying 'help'."

        # Question answering needs an indexed document.
        if not self.qa_system.active_document:
            return self.responses["no_doc"]
        return self.qa_system.answer_question(text)

# ---------------------
# Streamlit UI
# ---------------------
st.set_page_config(page_title="Document AI Assistant", page_icon="📄")
st.title("📄 AI PDF Assistant")
st.markdown("Ask questions from uploaded PDF files!")

# Streamlit re-executes this script on every interaction. Keeping the
# assistant in session state avoids rebuilding it (and reloading its
# model) on each rerun, and lets the indexed document survive between runs.
if "ai" not in st.session_state:
    st.session_state["ai"] = DocumentAI()
ai = st.session_state["ai"]


def _clear_document(assistant):
    """Forget the indexed document so later questions get the no-document reply."""
    qa_system = assistant.qa_system
    qa_system.text_chunks = []
    qa_system.embeddings = None
    qa_system.active_document = None
    if "processed_file" in st.session_state:
        del st.session_state["processed_file"]


uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    # Identify the upload by name and size so the same file is extracted
    # and embedded only once instead of on every rerun (e.g. every query).
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("processed_file") != file_key:
        try:
            ai.qa_system.process_pdf_stream(uploaded_file)
        except Exception as exc:
            # UI boundary: report the failure instead of aborting the page,
            # and drop any partially updated document state.
            _clear_document(ai)
            st.error(f"❌ Could not process '{uploaded_file.name}': {exc}")
        else:
            st.session_state["processed_file"] = file_key
    if st.session_state.get("processed_file") == file_key:
        st.success(f"✅ PDF '{uploaded_file.name}' processed successfully!")
else:
    # The uploader is empty (nothing uploaded yet, or the file was removed).
    _clear_document(ai)

query = st.text_input("Ask a question from the document:")

if query:
    answer = ai.handle_query(query)
    st.markdown(f"**🧠 Answer:** {answer}")