grant_rag_system

Sleeping

App Files Files Community

Tesneem committed on Jul 29

Commit

47b8e16

verified ·

1 Parent(s): 74e613f

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -12

app.py CHANGED Viewed

@@ -105,15 +105,30 @@ def init_vector_search() -> MongoDBAtlasVectorSearch:
         st.error("❌ Failed to connect to MongoDB Atlas manually")
         st.error(str(e))
         raise e
-# =================== PDF Processing ===================
-def extract_questions_from_text(text: str) -> List[str]:
-    # Match typical question formats
-    pattern = re.compile(r"((?:[A-Z][^\n]*?\?)|(?:[A-Z][^\n]{5,1000}?\n))", re.MULTILINE)
-    matches = pattern.findall(text)
-    # Filter out junk
-    questions = [q.strip() for q in matches if 10 < len(q) < 400]
-    return questions
 # =================== Format Retrieved Chunks ===================
 def format_docs(docs: List[Document]) -> str:
     return "\n\n".join(doc.page_content or doc.metadata.get("content", "") for doc in docs)
@@ -151,20 +166,41 @@ def main():
     st.set_page_config(page_title="Grant Buddy RAG", page_icon="🤖")
     st.title("🤖 Grant Buddy: Grant-Writing Assistant")
-    uploaded_file = st.file_uploader("Upload PDF or TXT for extra context (optional)", type=["pdf", "txt"])
     uploaded_text = ""
     if uploaded_file:
         if uploaded_file.name.endswith(".pdf"):
             reader = PdfReader(uploaded_file)
             uploaded_text = "\n".join([page.extract_text() for page in reader.pages])
-            questions = extract_questions_from_text(uploaded_text)
         elif uploaded_file.name.endswith(".txt"):
             uploaded_text = uploaded_file.read().decode("utf-8")
-            questions = extract_questions_from_text(uploaded_text)
     retriever = init_vector_search().as_retriever(search_kwargs={"k": 10, "score_threshold": 0.75})
     rag_chain = get_rag_chain(retriever)
     query = st.text_input("Ask a grant-related question")
     if st.button("Submit"):
         if not query:
@@ -184,6 +220,7 @@ def main():
                 st.markdown("---")
 if __name__ == "__main__":
     main()

         st.error("❌ Failed to connect to MongoDB Atlas manually")
         st.error(str(e))
         raise e
+# =================== Question/Headers Extraction ===================
+def extract_questions_and_headers(text: str) -> List[str]:
+    header_patterns = [
+        r'\d+\.\s+\*\*([^\*]+)\*\*',
+        r'\*\*([^*]+)\*\*',
+        r'^([A-Z][^a-z]*[A-Z])$',
+        r'^([A-Z][A-Za-z\s]{3,})$',
+        r'^[A-Z][A-Za-z\s]+:$'
+    ]
+    question_patterns = [
+        r'^.+\?$',
+        r'^\*?Please .+',
+        r'^How .+',
+        r'^What .+',
+        r'^Describe .+',
+    ]
+    combined_header_re = re.compile("|".join(header_patterns), re.MULTILINE)
+    combined_question_re = re.compile("|".join(question_patterns), re.MULTILINE)
+    headers = [match for group in combined_header_re.findall(text) for match in group if match]
+    questions = combined_question_re.findall(text)
+    return headers + questions
 # =================== Format Retrieved Chunks ===================
 def format_docs(docs: List[Document]) -> str:
     """Join the text of retrieved documents into a single string.

     Each document contributes its ``page_content``; when that is empty or
     missing, the ``"content"`` entry of its metadata is used instead (or an
     empty string if there is none). Pieces are separated by a blank line.
     """
     pieces = []
     for doc in docs:
         body = doc.page_content
         if not body:
             # Fall back to the text stored in the metadata.
             body = doc.metadata.get("content", "")
         pieces.append(body)
     return "\n\n".join(pieces)
     st.set_page_config(page_title="Grant Buddy RAG", page_icon="🤖")
     st.title("🤖 Grant Buddy: Grant-Writing Assistant")
     uploaded_text = ""
+    questions = []
+    # Upload Section
+    uploaded_file = st.file_uploader("Upload PDF or TXT for extra context (optional)", type=["pdf", "txt"])
     if uploaded_file:
         if uploaded_file.name.endswith(".pdf"):
             reader = PdfReader(uploaded_file)
             uploaded_text = "\n".join([page.extract_text() for page in reader.pages])
         elif uploaded_file.name.endswith(".txt"):
             uploaded_text = uploaded_file.read().decode("utf-8")
+        # ✅ Extract questions or headers from uploaded text
+        questions = extract_questions_and_headers(uploaded_text)
+        st.markdown("### 📄 Questions Extracted from Uploaded Document")
+    # Initialize RAG
     retriever = init_vector_search().as_retriever(search_kwargs={"k": 10, "score_threshold": 0.75})
     rag_chain = get_rag_chain(retriever)
+    # ✅ Batch Q&A from Uploaded File
+    if uploaded_text and questions:
+        answers = []
+        for q in questions:
+            full_query = f"{q}\n\nAdditional context:\n{uploaded_text}"
+            response = rag_chain.invoke(full_query)
+            answers.append({"question": q, "answer": response})
+        for item in answers:
+            st.markdown(f"### ❓ {item['question']}")
+            st.markdown(f"💬 {item['answer']}")
+    st.markdown("---")
+    # ✅ Manual Input
     query = st.text_input("Ask a grant-related question")
     if st.button("Submit"):
         if not query:
                 st.markdown("---")
 # Script entry point: call main() only when this file is run directly,
 # not when it is imported as a module.
 if __name__ == "__main__":
     main()