Spaces:

gaur3009
/

PDFQA

Sleeping

App Files Files Community

gaur3009 committed on May 15

Commit

323149c

verified ·

1 Parent(s): 1faf467

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +123 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,125 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+import re
+import random
+import PyPDF2
+import numpy as np
+from collections import defaultdict, deque
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+# ---------------------
+# Tokenization
+# ---------------------
def tokenize(text):
    r"""Split *text* into lowercase word tokens.

    A token is a maximal run of ``\w`` characters; every other character
    acts as a separator and is discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens in order of appearance (empty when
        *text* contains no word characters).
    """
    lowered = text.lower()
    word_pattern = re.compile(r"\w+")
    return word_pattern.findall(lowered)
+# ---------------------
+# PDF QA System
+# ---------------------
class PDFQASystem:
    """Answer questions by returning the best-matching passage of a PDF.

    The document text is split into fixed-size word chunks, every chunk is
    embedded with a sentence-transformer model, and a question is answered
    by returning the chunk whose embedding has the highest cosine
    similarity to the question's embedding.
    """

    def __init__(self):
        # Text passages of the active document, parallel to ``embeddings``.
        self.text_chunks = []
        # Embeddings of ``text_chunks``; None when there is nothing to search.
        self.embeddings = None
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        # Name of the loaded file; None until a PDF has been processed.
        self.active_document = None

    def process_pdf_stream(self, uploaded_file):
        """Extract, chunk and embed the text of *uploaded_file*.

        Args:
            uploaded_file: A file-like object accepted by
                ``PyPDF2.PdfReader`` that also exposes a ``name`` attribute.
        """
        text = self._extract_pdf_text(uploaded_file)
        chunks = self._chunk_text(text)
        # Nothing to embed when the PDF has no extractable text;
        # answer_question reports that case instead of searching.
        embeddings = self.model.encode(chunks) if chunks else None

        # Commit only after every step succeeded so a failure cannot leave
        # new chunks paired with the previous document's embeddings.
        self.text_chunks = chunks
        self.embeddings = embeddings
        self.active_document = uploaded_file.name

    def _extract_pdf_text(self, uploaded_file):
        """Return the text of all pages, one page per line group.

        Pages for which no text is extracted are skipped.
        """
        reader = PyPDF2.PdfReader(uploaded_file)
        page_texts = (page.extract_text() for page in reader.pages)
        # Join with a newline: plain concatenation fuses the last word of
        # one page with the first word of the next.
        return "\n".join(page_text for page_text in page_texts if page_text)

    def _chunk_text(self, text, chunk_size=500):
        """Split *text* into chunks of at most *chunk_size* words.

        Args:
            text: The text to split on whitespace.
            chunk_size: Maximum number of words per chunk; must be >= 1.

        Returns:
            A list of space-joined chunks (empty when *text* has no words).

        Raises:
            ValueError: If *chunk_size* is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")
        words = text.split()
        return [
            ' '.join(words[start:start + chunk_size])
            for start in range(0, len(words), chunk_size)
        ]

    def answer_question(self, question):
        """Return the chunk most similar to *question*.

        Returns an explanatory message instead when no document is loaded
        or the loaded document produced no text chunks.
        """
        if not self.active_document:
            return "No document loaded. Please upload a PDF first."
        if not self.text_chunks:
            # Image-only or empty PDFs yield no chunks; there is nothing to
            # compare the question against.
            return "The loaded document contains no extractable text."
        question_embedding = self.model.encode(question)
        similarities = cosine_similarity([question_embedding], self.embeddings)[0]
        best_match_idx = int(np.argmax(similarities))
        return self.text_chunks[best_match_idx]
+# ---------------------
+# Intent Classifier
+# ---------------------
class IntentClassifier:
    """Keyword-count intent classifier.

    Each intent owns a list of trigger words; the predicted intent is the
    one whose trigger words occur most often among the input tokens.
    """

    def __init__(self):
        # Intent name -> trigger words.
        self.intents = {
            "greet": ["hello", "hi", "hey"],
            "bye": ["bye", "goodbye", "exit"],
            "qa": ["what", "when", "how", "explain", "tell", "who", "why"],
            "help": ["help", "support", "assist"],
        }

    def predict(self, tokens):
        """Return the intent with the most keyword hits in *tokens*.

        Ties go to the intent that received its first hit earliest while
        scanning the tokens. When no token matches any keyword the result
        is ``"qa"``.
        """
        tally = {}
        for word in tokens:
            for name, vocabulary in self.intents.items():
                if word not in vocabulary:
                    continue
                tally[name] = tally.get(name, 0) + 1
        if not tally:
            return "qa"
        return max(tally, key=tally.get)
+# ---------------------
+# AI Agent Core
+# ---------------------
class DocumentAI:
    """Conversational front end that routes user text by intent.

    Greetings, farewells and help requests get canned replies; anything
    classified as a question is forwarded to the PDF question-answering
    system when a document is loaded.
    """

    def __init__(self):
        self.intent_recognizer = IntentClassifier()
        self.qa_system = PDFQASystem()
        # Canned replies: "greet" and "bye" are lists to pick from at
        # random, "help" and "no_doc" are fixed strings.
        self.responses = {
            "greet": ["👋 Hello! I'm your document assistant.", "Hi there! Ready to answer your document questions."],
            "bye": ["Goodbye!", "See you later!", "Thanks for using the assistant!"],
            "help": "Upload a PDF and ask questions. I’ll answer from its content!",
            "no_doc": "Please upload a PDF document first.",
        }

    def handle_query(self, text):
        """Return the assistant's reply to the user message *text*."""
        intent = self.intent_recognizer.predict(tokenize(text))

        if intent in ("greet", "bye"):
            return random.choice(self.responses[intent])
        if intent == "help":
            return self.responses["help"]
        if intent != "qa":
            return "🤖 I’m not sure how to respond. Try saying 'help'."

        # Question: only answer when a document has been loaded.
        qa = self.qa_system
        if not qa.active_document:
            return self.responses["no_doc"]
        return qa.answer_question(text)
+# ---------------------
+# Streamlit UI
+# ---------------------
st.set_page_config(page_title="Document AI Assistant", page_icon="📄")
st.title("📄 AI PDF Assistant")
st.markdown("Ask questions from uploaded PDF files!")

# Streamlit re-executes this script from the top on every widget
# interaction. Keep the assistant in per-session state so the embedding
# model is not reloaded for every question, and so one user's document is
# never visible to another session.
if "ai" not in st.session_state:
    st.session_state["ai"] = DocumentAI()
ai = st.session_state["ai"]

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
if uploaded_file:
    # Identify the upload so an unchanged file is not re-extracted and
    # re-embedded on every rerun. ``file_id`` is only present on newer
    # Streamlit versions, hence the getattr fallback.
    file_key = (
        getattr(uploaded_file, "file_id", None),
        uploaded_file.name,
        uploaded_file.size,
    )
    if st.session_state.get("processed_file") != file_key:
        ai.qa_system.process_pdf_stream(uploaded_file)
        st.session_state["processed_file"] = file_key
    st.success(f"✅ PDF '{uploaded_file.name}' processed successfully!")
elif st.session_state.pop("processed_file", None) is not None:
    # The upload was removed: forget the document so later questions are
    # not answered from a PDF the user no longer sees in the UI.
    ai.qa_system.text_chunks = []
    ai.qa_system.embeddings = None
    ai.qa_system.active_document = None

query = st.text_input("Ask a question from the document:")
if query:
    answer = ai.handle_query(query)
    st.markdown(f"**🧠 Answer:** {answer}")