Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -106,28 +106,44 @@ def init_vector_search() -> MongoDBAtlasVectorSearch:
|
|
106 |
st.error(str(e))
|
107 |
raise e
|
108 |
# =================== Question/Headers Extraction ===================
|
109 |
-
def extract_questions_and_headers(text: str) -> List[str]:
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
|
132 |
# =================== Format Retrieved Chunks ===================
|
133 |
def format_docs(docs: List[Document]) -> str:
|
@@ -181,8 +197,10 @@ def main():
|
|
181 |
elif uploaded_file.name.endswith(".txt"):
|
182 |
uploaded_text = uploaded_file.read().decode("utf-8")
|
183 |
|
184 |
-
questions =
|
185 |
st.success(f"✅ Found {len(questions)} questions or headers.")
|
|
|
|
|
186 |
|
187 |
# Generate answers
|
188 |
answers = []
|
|
|
106 |
st.error(str(e))
|
107 |
raise e
|
108 |
# =================== Question/Headers Extraction ===================
|
109 |
+
# def extract_questions_and_headers(text: str) -> List[str]:
|
110 |
+
# header_patterns = [
|
111 |
+
# r'\d+\.\s+\*\*([^\*]+)\*\*',
|
112 |
+
# r'\*\*([^*]+)\*\*',
|
113 |
+
# r'^([A-Z][^a-z]*[A-Z])$',
|
114 |
+
# r'^([A-Z][A-Za-z\s]{3,})$',
|
115 |
+
# r'^[A-Z][A-Za-z\s]+:$'
|
116 |
+
# ]
|
117 |
+
# question_patterns = [
|
118 |
+
# r'^.+\?$',
|
119 |
+
# r'^\*?Please .+',
|
120 |
+
# r'^How .+',
|
121 |
+
# r'^What .+',
|
122 |
+
# r'^Describe .+',
|
123 |
+
# ]
|
124 |
+
# combined_header_re = re.compile("|".join(header_patterns), re.MULTILINE)
|
125 |
+
# combined_question_re = re.compile("|".join(question_patterns), re.MULTILINE)
|
126 |
+
|
127 |
+
# headers = [match for group in combined_header_re.findall(text) for match in group if match]
|
128 |
+
# questions = combined_question_re.findall(text)
|
129 |
+
|
130 |
+
# return headers + questions
|
def extract_with_llm(text: str, max_chars: int = 3000) -> List[str]:
    """Extract grant-application headers and questions from *text* using an LLM.

    Sends the leading portion of the document to the Hugging Face Inference
    API and parses the model's numbered-list reply into plain strings.

    Args:
        text: Raw document text to scan for section headers and questions.
        max_chars: Maximum number of characters of ``text`` to include in the
            prompt, to avoid exceeding the model's token limit. Defaults to
            3000 (the previously hard-coded truncation point).

    Returns:
        A list of extracted header/question strings. On any failure the error
        is surfaced to the Streamlit UI via ``st.error`` and an empty list is
        returned instead of raising.
    """
    client = InferenceClient(api_key=HF_TOKEN.strip())
    prompt = (
        "Extract a list of grant application headers and questions from the following text. "
        "Include section titles, prompts, or any questions that ask for a response. "
        "Return them as a numbered list.\n\n"
        f"{text[:max_chars]}"  # truncate input to avoid token overflow
    )
    try:
        response = client.text_generation(prompt=prompt, max_new_tokens=500)
        # The model answers as a numbered list: keep non-blank lines and strip
        # the leading bullet/number decoration ("1. ", "- ", "• ", etc.).
        return [
            line.strip("-•1234567890. ")
            for line in response.split("\n")
            if line.strip()
        ]
    except Exception as e:  # UI boundary: report and degrade gracefully, don't crash the app
        st.error("❌ Failed to extract questions with LLM")
        st.error(str(e))
        return []
|
147 |
|
148 |
# =================== Format Retrieved Chunks ===================
|
149 |
def format_docs(docs: List[Document]) -> str:
|
|
|
197 |
elif uploaded_file.name.endswith(".txt"):
|
198 |
uploaded_text = uploaded_file.read().decode("utf-8")
|
199 |
|
200 |
+
questions = extract_with_llm(uploaded_text)
|
201 |
st.success(f"✅ Found {len(questions)} questions or headers.")
|
202 |
+
with st.expander("🧠 Extracted Prompts from Upload"):
|
203 |
+
st.write(questions)
|
204 |
|
205 |
# Generate answers
|
206 |
answers = []
|