Spaces:

gaur3009
/

PDFQA

Sleeping

App Files Files Community

gaur3009 committed on May 16

Commit

1b89b73

verified ·

1 Parent(s): 2af71a4

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +73 -131

src/streamlit_app.py CHANGED Viewed

@@ -1,141 +1,83 @@
 import streamlit as st
-import re
-import random
 import PyPDF2
-import numpy as np
-from collections import defaultdict
 from sklearn.metrics.pairwise import cosine_similarity
-import torch
-from transformers import AutoTokenizer, AutoModel
-# ---------------------
-# Tokenization
-# ---------------------
-def tokenize(text):
-    return re.findall(r"\w+", text.lower())
-# ---------------------
-# PDF QA System
-# ---------------------
-class PDFQASystem:
-    def __init__(self):
-        self.text_chunks = []
-        self.embeddings = None
-        self.tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
-        self.model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
-        self.model.eval()
-        self.active_document = None
-    def process_pdf_stream(self, uploaded_file):
-        text = self._extract_pdf_text(uploaded_file)
-        self.text_chunks = self._chunk_text(text)
-        self.embeddings = self._embed(self.text_chunks)
-        self.active_document = uploaded_file.name
-    def _extract_pdf_text(self, uploaded_file):
-        text = ""
-        reader = PyPDF2.PdfReader(uploaded_file)
-        for page in reader.pages:
-            page_text = page.extract_text()
-            if page_text:
-                text += page_text
-        return text
-    def _chunk_text(self, text, chunk_size=500):
-        words = text.split()
-        return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
-    def _mean_pooling(self, model_output, attention_mask):
-        token_embeddings = model_output.last_hidden_state
-        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-        return torch.sum(token_embeddings * input_mask_expanded, dim=1) / torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
-    def _embed(self, texts):
-        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
-        with torch.no_grad():
-            model_output = self.model(**inputs)
-        embeddings = self._mean_pooling(model_output, inputs['attention_mask'])
-        return torch.nn.functional.normalize(embeddings, p=2, dim=1).cpu().numpy()
-    def answer_question(self, question):
-        if not self.active_document:
-            return "No document loaded. Please upload a PDF first."
-        question_embedding = self._embed([question])[0]
-        similarities = cosine_similarity([question_embedding], self.embeddings)[0]
-        best_match_idx = np.argmax(similarities)
-        return self.text_chunks[best_match_idx]
-# ---------------------
-# Intent Classifier
-# ---------------------
-class IntentClassifier:
-    def __init__(self):
-        self.intents = {
-            "greet": ["hello", "hi", "hey"],
-            "bye": ["bye", "goodbye", "exit"],
-            "qa": ["what", "when", "how", "explain", "tell", "who", "why"],
-            "help": ["help", "support", "assist"]
-        }
-    def predict(self, tokens):
-        scores = defaultdict(int)
-        for token in tokens:
-            for intent, keywords in self.intents.items():
-                if token in keywords:
-                    scores[intent] += 1
-        return max(scores, key=scores.get) if scores else "qa"
-# ---------------------
-# AI Agent Core
-# ---------------------
-class DocumentAI:
-    def __init__(self):
-        self.intent_recognizer = IntentClassifier()
-        self.qa_system = PDFQASystem()
-        self.responses = {
-            "greet": ["👋 Hello! I'm your document assistant.", "Hi there! Ready to answer your document questions."],
-            "bye": ["Goodbye!", "See you later!", "Thanks for using the assistant!"],
-            "help": "Upload a PDF and ask questions. I’ll answer from its content!",
-            "no_doc": "Please upload a PDF document first."
-        }
-    def handle_query(self, text):
-        tokens = tokenize(text)
-        intent = self.intent_recognizer.predict(tokens)
-        if intent == "greet":
-            return random.choice(self.responses["greet"])
-        elif intent == "bye":
-            return random.choice(self.responses["bye"])
-        elif intent == "help":
-            return self.responses["help"]
-        elif intent == "qa":
-            if self.qa_system.active_document:
-                return self.qa_system.answer_question(text)
-            else:
-                return self.responses["no_doc"]
-        else:
-            return "🤖 I’m not sure how to respond. Try saying 'help'."
-# ---------------------
 # Streamlit UI
-# ---------------------
-st.set_page_config(page_title="Document AI Assistant", page_icon="📄")
-st.title("📄 AI PDF Assistant")
-st.markdown("Ask questions from uploaded PDF files!")
-ai = DocumentAI()
-uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
-if uploaded_file:
-    ai.qa_system.process_pdf_stream(uploaded_file)
-    st.success(f"✅ PDF '{uploaded_file.name}' processed successfully!")
-query = st.text_input("Ask a question from the document:")
-if query:
-    answer = ai.handle_query(query)
-    st.markdown(f"**🧠 Answer:** {answer}")

 import streamlit as st
 import PyPDF2
+import torch
+from transformers import AutoTokenizer, AutoModel, pipeline
 from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import tempfile
+# Model loading: wrapped in st.cache_resource so the models are built once and reused
+@st.cache_resource
+def load_models():
+    """Build the embedding tokenizer, embedding model and answer-generation pipeline.
+
+    Returns:
+        A 3-tuple ``(tokenizer, model, qa_pipeline)``: the tokenizer and model
+        for the MiniLM sentence-embedding checkpoint, followed by a
+        ``text2text-generation`` pipeline backed by ``google/flan-t5-base``.
+    """
+    encoder_checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
+    return (
+        AutoTokenizer.from_pretrained(encoder_checkpoint),
+        AutoModel.from_pretrained(encoder_checkpoint),
+        pipeline("text2text-generation", model="google/flan-t5-base"),
+    )
+# Module-level handles used by the embedding and generation helpers below
+embedding_tokenizer, embedding_model, qa_pipeline_model = load_models()
+# PDF loader
+def load_pdf(file):
+    """Return the text of every page of a PDF, concatenated in page order.
+
+    Args:
+        file: Whatever ``PyPDF2.PdfReader`` accepts; it is passed through
+            unchanged.
+
+    Returns:
+        A single string with the pages joined without any separator. A page
+        whose ``extract_text()`` result is falsy contributes an empty string.
+    """
+    pdf = PyPDF2.PdfReader(file)
+    page_texts = (page.extract_text() or '' for page in pdf.pages)
+    return ''.join(page_texts)
+# Embed text
+def get_embedding(text):
+    """Embed ``text`` by averaging the embedding model's last hidden states.
+
+    The text is tokenized (with padding and truncation enabled), run through
+    the module-level embedding model without gradient tracking, and the
+    per-token hidden states are averaged over the token dimension.
+
+    Args:
+        text: Input handed to the module-level tokenizer.
+
+    Returns:
+        The pooled result as a NumPy array, with size-1 dimensions squeezed out.
+    """
+    encoded = embedding_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+    with torch.no_grad():
+        token_states = embedding_model(**encoded).last_hidden_state
+        # Plain mean over dim 1 (tokens); every position is weighted equally.
+        pooled = token_states.mean(dim=1)
+    return pooled.squeeze().cpu().numpy()
+# In-memory vector store: a list of (chunk_text, embedding) pairs
+vector_store = []
+def upload_document_chunks(chunks):
+    """Replace the contents of ``vector_store`` with embeddings of ``chunks``.
+
+    The store is emptied first, then each chunk is embedded with
+    ``get_embedding`` and stored as a ``(chunk, embedding)`` tuple, in the
+    order the chunks are given.
+
+    Args:
+        chunks: Iterable of text chunks to embed.
+    """
+    vector_store.clear()
+    vector_store.extend((piece, get_embedding(piece)) for piece in chunks)
+def query_answer(query, top_k=3):
+    """Return the stored chunks most similar to ``query``, best match first.
+
+    Args:
+        query: Question text; embedded with ``get_embedding``.
+        top_k: Maximum number of chunks to return. Defaults to 3, the value
+            that was previously hard-coded.
+
+    Returns:
+        A list of at most ``top_k`` chunk strings from ``vector_store``,
+        ordered by descending cosine similarity to the query. The list is
+        empty when the store is empty or ``top_k`` is not positive.
+    """
+    # Nothing to rank: skip embedding the query altogether.
+    if not vector_store or top_k <= 0:
+        return []
+    query_vec = get_embedding(query)
+    chunk_vecs = [vec for _, vec in vector_store]
+    # A single call scores the query against every stored vector; row 0 holds
+    # one similarity per chunk, in store order.
+    similarities = cosine_similarity([query_vec], chunk_vecs)[0]
+    # Reverse before slicing so that a small top_k never hits the `[-0:]`
+    # pitfall (which would select the whole array).
+    top_indices = np.argsort(similarities)[::-1][:top_k]
+    return [vector_store[i][0] for i in top_indices]
+def generate_response(context, query):
+    """Ask the text2text pipeline to answer ``query`` given ``context``.
+
+    Args:
+        context: Document text placed under the ``Context:`` heading of the prompt.
+        query: The user's question, placed under ``Question:``.
+
+    Returns:
+        The ``generated_text`` of the first pipeline result, stripped of
+        surrounding whitespace. Sampling is enabled, so repeated calls with
+        the same arguments are not guaranteed to return the same text.
+    """
+    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
+    candidates = qa_pipeline_model(prompt, max_new_tokens=100, do_sample=True)
+    first_candidate = candidates[0]
+    return first_candidate['generated_text'].strip()
 # Streamlit UI
+st.set_page_config(page_title="Offline PDF QA Bot", layout="centered")
+st.title("📄 Offline PDF QA Bot 🔍")
+st.markdown(
+    "Upload a PDF document, ask a question, and get an answer using **only local models** — no external APIs involved."
+)
+uploaded_file = st.file_uploader("📁 Upload PDF", type="pdf")
+user_query = st.text_input("❓ Ask a question based on the document")
+# Run the pipeline only once both a document and a question are available.
+if uploaded_file and user_query:
+    with st.spinner("Processing..."):
+        # Pass the upload straight to the PDF reader. The previous version
+        # copied it into a NamedTemporaryFile(delete=False) and re-opened it
+        # by name while the handle was still open and unflushed, which could
+        # expose a truncated file to the reader and left the temp file on disk.
+        document_text = load_pdf(uploaded_file)
+        # Fixed-size 500-character windows, no overlap.
+        document_chunks = [document_text[i:i + 500] for i in range(0, len(document_text), 500)]
+        if document_chunks:
+            upload_document_chunks(document_chunks)
+            top_chunks = query_answer(user_query)
+            context = " ".join(top_chunks)
+            answer = generate_response(context, user_query)
+    if not document_chunks:
+        # E.g. a scanned/image-only PDF: there is nothing to retrieve from,
+        # so do not ask the generator to answer from an empty context.
+        st.warning("No extractable text was found in this PDF.")
+    else:
+        st.subheader("📜 Retrieved Document Segments")
+        for i, chunk in enumerate(top_chunks, 1):
+            st.markdown(f"**Chunk {i}:** {chunk}")
+        st.subheader("💬 Answer")
+        st.success(answer)