Tesneem committed on
Commit
a747844
·
verified ·
1 Parent(s): a067cdb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -23
app.py CHANGED
@@ -106,28 +106,44 @@ def init_vector_search() -> MongoDBAtlasVectorSearch:
106
  st.error(str(e))
107
  raise e
108
  # =================== Question/Headers Extraction ===================
109
def extract_questions_and_headers(text: str) -> List[str]:
    """Collect section headers and question-like lines from *text*.

    Headers (markdown-bold titles, ALL-CAPS lines, title-cased lines,
    colon-terminated labels) come first in the returned list, followed by
    lines that look like questions or prompts.
    """
    header_patterns = [
        r'\d+\.\s+\*\*([^\*]+)\*\*',
        r'\*\*([^*]+)\*\*',
        r'^([A-Z][^a-z]*[A-Z])$',
        r'^([A-Z][A-Za-z\s]{3,})$',
        r'^[A-Z][A-Za-z\s]+:$'
    ]
    question_patterns = [
        r'^.+\?$',
        r'^\*?Please .+',
        r'^How .+',
        r'^What .+',
        r'^Describe .+',
    ]
    # One alternation per category so each line is scanned a single time.
    header_rx = re.compile("|".join(header_patterns), re.MULTILINE)
    question_rx = re.compile("|".join(question_patterns), re.MULTILINE)

    # findall on the header alternation yields one tuple per match (one slot
    # per capturing group); keep only the non-empty group from each tuple.
    found_headers = []
    for groups in header_rx.findall(text):
        found_headers.extend(g for g in groups if g)

    return found_headers + question_rx.findall(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  # =================== Format Retrieved Chunks ===================
133
  def format_docs(docs: List[Document]) -> str:
@@ -181,8 +197,10 @@ def main():
181
  elif uploaded_file.name.endswith(".txt"):
182
  uploaded_text = uploaded_file.read().decode("utf-8")
183
 
184
- questions = extract_questions_and_headers(uploaded_text)
185
  st.success(f"✅ Found {len(questions)} questions or headers.")
 
 
186
 
187
  # Generate answers
188
  answers = []
 
106
  st.error(str(e))
107
  raise e
108
  # =================== Question/Headers Extraction ===================
109
+ # def extract_questions_and_headers(text: str) -> List[str]:
110
+ # header_patterns = [
111
+ # r'\d+\.\s+\*\*([^\*]+)\*\*',
112
+ # r'\*\*([^*]+)\*\*',
113
+ # r'^([A-Z][^a-z]*[A-Z])$',
114
+ # r'^([A-Z][A-Za-z\s]{3,})$',
115
+ # r'^[A-Z][A-Za-z\s]+:$'
116
+ # ]
117
+ # question_patterns = [
118
+ # r'^.+\?$',
119
+ # r'^\*?Please .+',
120
+ # r'^How .+',
121
+ # r'^What .+',
122
+ # r'^Describe .+',
123
+ # ]
124
+ # combined_header_re = re.compile("|".join(header_patterns), re.MULTILINE)
125
+ # combined_question_re = re.compile("|".join(question_patterns), re.MULTILINE)
126
+
127
+ # headers = [match for group in combined_header_re.findall(text) for match in group if match]
128
+ # questions = combined_question_re.findall(text)
129
+
130
+ # return headers + questions
131
def extract_with_llm(text: str) -> List[str]:
    """Ask a hosted LLM to pull grant-application headers and questions out of *text*.

    Sends the first 3000 characters of *text* to the Hugging Face inference
    endpoint and parses the model's numbered-list reply into one entry per
    line, with leading list markers removed.

    Returns:
        The extracted prompts, or [] if the inference call fails (the error
        is surfaced to the Streamlit UI via st.error).
    """
    client = InferenceClient(api_key=HF_TOKEN.strip())
    prompt = (
        "Extract a list of grant application headers and questions from the following text. "
        "Include section titles, prompts, or any questions that ask for a response. "
        "Return them as a numbered list.\n\n"
        f"{text[:3000]}"  # truncate input to keep the prompt within token limits
    )
    # Keep the try body to just the remote call — the only line that can
    # legitimately raise here.
    try:
        response = client.text_generation(prompt=prompt, max_new_tokens=500)
    except Exception as e:  # boundary: report to the UI and degrade to "no questions"
        st.error("❌ Failed to extract questions with LLM")
        st.error(str(e))
        return []
    # Strip only LEADING list markers ("1. ", "- ", "• ") so that digits or
    # periods that are part of the prompt text (e.g. "...budget for 2024")
    # survive; then drop lines that are empty once the marker is removed.
    cleaned = (line.lstrip("-•1234567890. ").rstrip() for line in response.split("\n"))
    return [line for line in cleaned if line]
147
 
148
  # =================== Format Retrieved Chunks ===================
149
  def format_docs(docs: List[Document]) -> str:
 
197
  elif uploaded_file.name.endswith(".txt"):
198
  uploaded_text = uploaded_file.read().decode("utf-8")
199
 
200
+ questions = extract_with_llm(uploaded_text)
201
  st.success(f"✅ Found {len(questions)} questions or headers.")
202
+ with st.expander("🧠 Extracted Prompts from Upload"):
203
+ st.write(questions)
204
 
205
  # Generate answers
206
  answers = []