File size: 5,092 Bytes
33eca15
323149c
 
 
 
03a8095
323149c
 
03a8095
 
 
323149c
 
 
 
 
 
 
 
 
 
 
 
 
03a8095
 
 
323149c
03a8095
323149c
 
 
03a8095
323149c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03a8095
 
 
 
 
 
 
 
 
 
 
 
323149c
 
 
03a8095
 
323149c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33eca15
323149c
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import streamlit as st
import re
import random
import PyPDF2
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

import torch
from transformers import AutoTokenizer, AutoModel

# ---------------------
# Tokenization
# ---------------------
def tokenize(text):
    """Return the lower-cased word tokens found in *text*.

    A token is a maximal run of ``\\w`` characters (letters, digits and
    underscore); everything else acts as a separator and is discarded.
    """
    lowered = text.lower()
    return [hit.group(0) for hit in re.finditer(r"\w+", lowered)]

# ---------------------
# PDF QA System
# ---------------------
class PDFQASystem:
    """Answer questions by returning the best-matching passage of a PDF.

    The PDF text is split into fixed-size word chunks, each chunk is embedded
    with the ``sentence-transformers/all-MiniLM-L6-v2`` model, and a question
    is answered with the chunk whose embedding is most similar to the
    question's embedding.
    """

    def __init__(self):
        # Chunks of the active document and their embeddings (row i of
        # ``embeddings`` belongs to ``text_chunks[i]``); ``embeddings`` is
        # None while no document with text is loaded.
        self.text_chunks = []
        self.embeddings = None
        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
        self.model.eval()
        # Name of the loaded document, or None when nothing is loaded.
        self.active_document = None

    def process_pdf_stream(self, uploaded_file):
        """Load *uploaded_file* as the active document.

        Args:
            uploaded_file: A binary file-like object accepted by
                ``PyPDF2.PdfReader`` that also exposes a ``name`` attribute.

        A PDF without any extractable text is still loaded, but with no
        chunks; ``answer_question`` reports that case to the user.
        """
        text = self._extract_pdf_text(uploaded_file)
        chunks = self._chunk_text(text)
        # Nothing to embed for a text-less (e.g. scanned) PDF.
        embeddings = self._embed(chunks) if chunks else None

        # Commit the new state only after every step succeeded, so a failure
        # above cannot leave chunks and embeddings out of sync.
        self.text_chunks = chunks
        self.embeddings = embeddings
        self.active_document = uploaded_file.name

    def _extract_pdf_text(self, uploaded_file):
        """Return the text of every page, one page per line-separated block."""
        reader = PyPDF2.PdfReader(uploaded_file)
        page_texts = (page.extract_text() for page in reader.pages)
        # Join with a newline so the last word of one page is not fused with
        # the first word of the next; pages without text are skipped.
        return "\n".join(page_text for page_text in page_texts if page_text)

    def _chunk_text(self, text, chunk_size=500):
        """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

        Raises:
            ValueError: If *chunk_size* is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")
        words = text.split()
        # NOTE(review): the tokenizer is called with truncation=True, so words
        # beyond the model's maximum sequence length do not contribute to a
        # chunk's embedding - confirm 500 words fits the model's limit.
        return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

    def _mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings of each sequence, ignoring masked positions."""
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # The clamp guards against division by zero for an all-zero mask.
        return torch.sum(token_embeddings * input_mask_expanded, dim=1) / torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)

    def _embed(self, texts, batch_size=32):
        """Return L2-normalised embeddings for *texts* as a NumPy array.

        Args:
            texts: A list of strings (a single string is treated as a
                one-element list).
            batch_size: Number of texts sent through the model per forward
                pass; bounds peak memory for long documents.

        Returns:
            A 2-D array with one row per input text.

        Raises:
            ValueError: If *texts* is empty or *batch_size* is smaller than 1.
        """
        if isinstance(texts, str):
            texts = [texts]
        texts = list(texts)
        if not texts:
            raise ValueError("texts must contain at least one string")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        batches = []
        for start in range(0, len(texts), batch_size):
            inputs = self.tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            with torch.no_grad():
                model_output = self.model(**inputs)
            pooled = self._mean_pooling(model_output, inputs['attention_mask'])
            batches.append(torch.nn.functional.normalize(pooled, p=2, dim=1).cpu().numpy())
        return np.concatenate(batches, axis=0)

    def answer_question(self, question):
        """Return the document chunk most similar to *question*.

        Returns an explanatory message instead when no document is loaded or
        the loaded document has no extractable text.
        """
        if not self.active_document:
            return "No document loaded. Please upload a PDF first."
        if not self.text_chunks or self.embeddings is None:
            return "The loaded document contains no extractable text."

        question_embedding = self._embed([question])[0]
        similarities = cosine_similarity([question_embedding], self.embeddings)[0]
        best_match_idx = int(np.argmax(similarities))
        return self.text_chunks[best_match_idx]

# ---------------------
# Intent Classifier
# ---------------------
class IntentClassifier:
    """Keyword-count intent detector.

    Each intent owns a list of trigger words; the intent whose words occur
    most often among the tokens wins.
    """

    def __init__(self):
        greet_words = ["hello", "hi", "hey"]
        bye_words = ["bye", "goodbye", "exit"]
        qa_words = ["what", "when", "how", "explain", "tell", "who", "why"]
        help_words = ["help", "support", "assist"]
        self.intents = {
            "greet": greet_words,
            "bye": bye_words,
            "qa": qa_words,
            "help": help_words,
        }

    def predict(self, tokens):
        """Return the intent with the most keyword hits in *tokens*.

        Falls back to ``"qa"`` when no token matches any keyword. On a tie,
        the intent that was matched first wins.
        """
        tally = {}
        for word in tokens:
            for label, vocabulary in self.intents.items():
                if word in vocabulary:
                    tally[label] = tally.get(label, 0) + 1

        if not tally:
            return "qa"
        # max() returns the first maximal key in insertion order, i.e. the
        # intent whose keyword appeared earliest among the tied ones.
        return max(tally, key=tally.get)

# ---------------------
# AI Agent Core
# ---------------------
class DocumentAI:
    """Route a user message to a canned reply or to the PDF question answerer."""

    def __init__(self):
        self.intent_recognizer = IntentClassifier()
        self.qa_system = PDFQASystem()
        # Canned replies; list values are sampled at random, str values are
        # returned as-is.
        self.responses = {
            "greet": ["👋 Hello! I'm your document assistant.", "Hi there! Ready to answer your document questions."],
            "bye": ["Goodbye!", "See you later!", "Thanks for using the assistant!"],
            "help": "Upload a PDF and ask questions. I’ll answer from its content!",
            "no_doc": "Please upload a PDF document first."
        }

    def handle_query(self, text):
        """Return the assistant's reply to the user message *text*.

        The message is tokenized and classified; greetings, farewells and
        help requests get canned replies, anything classified as a question
        is answered from the active document (or with a prompt to upload one
        when none is loaded).
        """
        tokens = tokenize(text)
        intent = self.intent_recognizer.predict(tokens)

        if intent == "greet":
            return random.choice(self.responses["greet"])
        elif intent == "bye":
            return random.choice(self.responses["bye"])
        elif intent == "help":
            return self.responses["help"]
        elif intent == "qa":
            if self.qa_system.active_document:
                return self.qa_system.answer_question(text)
            else:
                return self.responses["no_doc"]
        else:
            # Defensive fallback: not reachable with the four intents the
            # classifier currently knows, but keeps new intents from
            # returning None.
            return "🤖 I’m not sure how to respond. Try saying 'help'."

# ---------------------
# Streamlit UI
# ---------------------
st.set_page_config(page_title="Document AI Assistant", page_icon="📄")
st.title("📄 AI PDF Assistant")
st.markdown("Ask questions from uploaded PDF files!")

# Streamlit re-runs this script on every interaction. Keeping the assistant
# in session state means the model is loaded once per session instead of on
# every rerun.
if "ai" not in st.session_state:
    st.session_state["ai"] = DocumentAI()
ai = st.session_state["ai"]

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    # Re-process only when a different file is uploaded, not on every rerun.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("processed_file") != file_key:
        try:
            ai.qa_system.process_pdf_stream(uploaded_file)
        except Exception as exc:  # UI boundary: report instead of crashing the page
            st.session_state["processed_file"] = None
            ai.qa_system.active_document = None
            st.error(f"Could not process '{uploaded_file.name}': {exc}")
        else:
            st.session_state["processed_file"] = file_key

    if st.session_state.get("processed_file") == file_key:
        if ai.qa_system.text_chunks:
            st.success(f"✅ PDF '{uploaded_file.name}' processed successfully!")
        else:
            st.warning(f"No extractable text was found in '{uploaded_file.name}'.")
else:
    # The uploader was cleared: forget the previously loaded document so
    # questions are not answered from a file the user has removed.
    st.session_state["processed_file"] = None
    ai.qa_system.active_document = None

query = st.text_input("Ask a question from the document:")

if query:
    answer = ai.handle_query(query)
    st.markdown(f"**🧠 Answer:** {answer}")