Tesneem committed on
Commit
47b8e16
·
verified ·
1 Parent(s): 74e613f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -12
app.py CHANGED
@@ -105,15 +105,30 @@ def init_vector_search() -> MongoDBAtlasVectorSearch:
105
  st.error("❌ Failed to connect to MongoDB Atlas manually")
106
  st.error(str(e))
107
  raise e
108
- # =================== PDF Processing ===================
109
def extract_questions_from_text(text: str) -> List[str]:
    """Pull question-like snippets out of *text*.

    A candidate is either a capitalised run ending at the first ``?`` on its
    line, or a capitalised run of 6+ characters terminated by a newline.
    Candidates whose raw length is not strictly between 10 and 400 characters
    are discarded; the survivors are returned whitespace-stripped, in the
    order they occur.
    """
    candidate_re = re.compile(r"((?:[A-Z][^\n]*?\?)|(?:[A-Z][^\n]{5,1000}?\n))", re.MULTILINE)

    kept: List[str] = []
    for candidate in candidate_re.findall(text):
        # Length is checked on the raw match (before stripping).
        if not (10 < len(candidate) < 400):
            continue
        kept.append(candidate.strip())
    return kept
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  # =================== Format Retrieved Chunks ===================
118
def format_docs(docs: List[Document]) -> str:
    """Concatenate the text of *docs*, separated by blank lines.

    For each document the ``page_content`` is used; when that is falsy the
    ``"content"`` entry of its metadata is used instead (empty string if the
    key is absent).
    """
    chunks = []
    for doc in docs:
        body = doc.page_content
        if not body:
            body = doc.metadata.get("content", "")
        chunks.append(body)
    return "\n\n".join(chunks)
@@ -151,20 +166,41 @@ def main():
151
  st.set_page_config(page_title="Grant Buddy RAG", page_icon="πŸ€–")
152
  st.title("πŸ€– Grant Buddy: Grant-Writing Assistant")
153
 
154
- uploaded_file = st.file_uploader("Upload PDF or TXT for extra context (optional)", type=["pdf", "txt"])
155
  uploaded_text = ""
 
 
 
 
156
  if uploaded_file:
157
  if uploaded_file.name.endswith(".pdf"):
158
  reader = PdfReader(uploaded_file)
159
  uploaded_text = "\n".join([page.extract_text() for page in reader.pages])
160
- questions = extract_questions_from_text(uploaded_text)
161
  elif uploaded_file.name.endswith(".txt"):
162
  uploaded_text = uploaded_file.read().decode("utf-8")
163
- questions = extract_questions_from_text(uploaded_text)
164
 
 
 
 
 
 
165
  retriever = init_vector_search().as_retriever(search_kwargs={"k": 10, "score_threshold": 0.75})
166
  rag_chain = get_rag_chain(retriever)
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  query = st.text_input("Ask a grant-related question")
169
  if st.button("Submit"):
170
  if not query:
@@ -184,6 +220,7 @@ def main():
184
  st.markdown("---")
185
 
186
 
 
187
  if __name__ == "__main__":
188
  main()
189
 
 
105
  st.error("❌ Failed to connect to MongoDB Atlas manually")
106
  st.error(str(e))
107
  raise e
108
+ # =================== Question/Headers Extraction ===================
109
def extract_questions_and_headers(text: str) -> List[str]:
    """Return candidate section headers and question prompts found in *text*.

    Headers are: bold markdown spans (optionally numbered, e.g.
    ``1. **Title**``), ALL-CAPS lines, lines made only of letters and spaces
    that start with a capital, and ``Label:`` lines (returned without the
    trailing colon). Questions are lines ending in ``?`` or starting with
    "Please" (optionally preceded by ``*``), "How", "What" or "Describe".

    Args:
        text: Raw document text; may be empty.

    Returns:
        Headers in document order followed by questions in document order.
        Every entry is whitespace-stripped and appears only once, so a line
        that qualifies as both a header and a question is not repeated.
    """
    if not text:
        return []

    # Normalise line endings so ``$`` sits at the visible end of each line;
    # with CRLF input a trailing "\r" would otherwise defeat ``\?$``.
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")

    # Character classes below deliberately exclude "\n": ``\s`` / ``[^a-z]``
    # would let one "line" pattern swallow all following header lines.
    # Every alternative has exactly one capture group so that the matched
    # header text can be read back via ``Match.lastindex``.
    header_patterns = [
        r'\d+\.\s+\*\*([^*\n]+)\*\*',   # numbered bold: "1. **Title**"
        r'\*\*([^*\n]+)\*\*',           # bold: "**Title**"
        r'^([A-Z][^a-z\n]*[A-Z])$',     # ALL-CAPS line
        r'^([A-Z][A-Za-z \t]{3,})$',    # letters-and-spaces title line
        r'^([A-Z][A-Za-z \t]+):$',      # "Label:" line
    ]
    question_patterns = [
        r'^.+\?$',
        r'^\*?Please .+',
        r'^How .+',
        r'^What .+',
        r'^Describe .+',
    ]
    header_re = re.compile("|".join(header_patterns), re.MULTILINE)
    question_re = re.compile("|".join(question_patterns), re.MULTILINE)

    headers = [m.group(m.lastindex) for m in header_re.finditer(normalized)]
    questions = [m.group(0) for m in question_re.finditer(normalized)]

    # De-duplicate while preserving first-seen order (headers before questions).
    seen = set()
    results: List[str] = []
    for candidate in headers + questions:
        cleaned = candidate.strip()
        if cleaned and cleaned not in seen:
            seen.add(cleaned)
            results.append(cleaned)
    return results
131
+
132
  # =================== Format Retrieved Chunks ===================
133
def format_docs(docs: List[Document]) -> str:
    """Concatenate the text of *docs*, separated by blank lines.

    For each document the ``page_content`` is used; when that is falsy the
    ``"content"`` entry of its metadata is used instead (empty string if the
    key is absent).
    """
    chunks = []
    for doc in docs:
        body = doc.page_content
        if not body:
            body = doc.metadata.get("content", "")
        chunks.append(body)
    return "\n\n".join(chunks)
 
166
  st.set_page_config(page_title="Grant Buddy RAG", page_icon="πŸ€–")
167
  st.title("πŸ€– Grant Buddy: Grant-Writing Assistant")
168
 
 
169
  uploaded_text = ""
170
+ questions = []
171
+
172
+ # Upload Section
173
+ uploaded_file = st.file_uploader("Upload PDF or TXT for extra context (optional)", type=["pdf", "txt"])
174
  if uploaded_file:
175
  if uploaded_file.name.endswith(".pdf"):
176
  reader = PdfReader(uploaded_file)
177
  uploaded_text = "\n".join([page.extract_text() for page in reader.pages])
 
178
  elif uploaded_file.name.endswith(".txt"):
179
  uploaded_text = uploaded_file.read().decode("utf-8")
 
180
 
181
+ # βœ… Extract questions or headers from uploaded text
182
+ questions = extract_questions_and_headers(uploaded_text)
183
+ st.markdown("### πŸ“„ Questions Extracted from Uploaded Document")
184
+
185
+ # Initialize RAG
186
  retriever = init_vector_search().as_retriever(search_kwargs={"k": 10, "score_threshold": 0.75})
187
  rag_chain = get_rag_chain(retriever)
188
 
189
+ # βœ… Batch Q&A from Uploaded File
190
+ if uploaded_text and questions:
191
+ answers = []
192
+ for q in questions:
193
+ full_query = f"{q}\n\nAdditional context:\n{uploaded_text}"
194
+ response = rag_chain.invoke(full_query)
195
+ answers.append({"question": q, "answer": response})
196
+
197
+ for item in answers:
198
+ st.markdown(f"### ❓ {item['question']}")
199
+ st.markdown(f"πŸ’¬ {item['answer']}")
200
+
201
+ st.markdown("---")
202
+
203
+ # βœ… Manual Input
204
  query = st.text_input("Ask a grant-related question")
205
  if st.button("Submit"):
206
  if not query:
 
220
  st.markdown("---")
221
 
222
 
223
+
224
  if __name__ == "__main__":
225
  main()
226