Tesneem committed on
Commit
a747844
·
verified ·
1 Parent(s): a067cdb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -23
app.py CHANGED
@@ -106,28 +106,44 @@ def init_vector_search() -> MongoDBAtlasVectorSearch:
106
  st.error(str(e))
107
  raise e
108
  # =================== Question/Headers Extraction ===================
109
def extract_questions_and_headers(text: str) -> List[str]:
    """Collect section headers and question-like lines from *text*.

    Headers (markdown-bold titles, ALL-CAPS lines, title-cased lines,
    colon-terminated labels) come first in the returned list, followed by
    lines that look like questions or prompts.
    """
    header_patterns = [
        r'\d+\.\s+\*\*([^\*]+)\*\*',
        r'\*\*([^*]+)\*\*',
        r'^([A-Z][^a-z]*[A-Z])$',
        r'^([A-Z][A-Za-z\s]{3,})$',
        r'^[A-Z][A-Za-z\s]+:$'
    ]
    question_patterns = [
        r'^.+\?$',
        r'^\*?Please .+',
        r'^How .+',
        r'^What .+',
        r'^Describe .+',
    ]
    # One alternation per category so each line is scanned a single time.
    header_rx = re.compile("|".join(header_patterns), re.MULTILINE)
    question_rx = re.compile("|".join(question_patterns), re.MULTILINE)

    # findall on the header alternation yields one tuple per match (one slot
    # per capturing group); keep only the non-empty group from each tuple.
    found_headers = []
    for groups in header_rx.findall(text):
        found_headers.extend(g for g in groups if g)

    return found_headers + question_rx.findall(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  # =================== Format Retrieved Chunks ===================
133
  def format_docs(docs: List[Document]) -> str:
@@ -181,8 +197,10 @@ def main():
181
  elif uploaded_file.name.endswith(".txt"):
182
  uploaded_text = uploaded_file.read().decode("utf-8")
183
 
184
- questions = extract_questions_and_headers(uploaded_text)
185
  st.success(f"✅ Found {len(questions)} questions or headers.")
 
 
186
 
187
  # Generate answers
188
  answers = []
 
106
  st.error(str(e))
107
  raise e
108
  # =================== Question/Headers Extraction ===================
109
+ # def extract_questions_and_headers(text: str) -> List[str]:
110
+ # header_patterns = [
111
+ # r'\d+\.\s+\*\*([^\*]+)\*\*',
112
+ # r'\*\*([^*]+)\*\*',
113
+ # r'^([A-Z][^a-z]*[A-Z])$',
114
+ # r'^([A-Z][A-Za-z\s]{3,})$',
115
+ # r'^[A-Z][A-Za-z\s]+:$'
116
+ # ]
117
+ # question_patterns = [
118
+ # r'^.+\?$',
119
+ # r'^\*?Please .+',
120
+ # r'^How .+',
121
+ # r'^What .+',
122
+ # r'^Describe .+',
123
+ # ]
124
+ # combined_header_re = re.compile("|".join(header_patterns), re.MULTILINE)
125
+ # combined_question_re = re.compile("|".join(question_patterns), re.MULTILINE)
126
+
127
+ # headers = [match for group in combined_header_re.findall(text) for match in group if match]
128
+ # questions = combined_question_re.findall(text)
129
+
130
+ # return headers + questions
131
def extract_with_llm(text: str) -> List[str]:
    """Ask a hosted LLM to pull grant-application headers and questions out of *text*.

    Sends the first 3000 characters of *text* to the Hugging Face inference
    endpoint and parses the model's numbered-list reply into one entry per
    line, with leading list markers removed.

    Returns:
        The extracted prompts, or [] if the inference call fails (the error
        is surfaced to the Streamlit UI via st.error).
    """
    client = InferenceClient(api_key=HF_TOKEN.strip())
    prompt = (
        "Extract a list of grant application headers and questions from the following text. "
        "Include section titles, prompts, or any questions that ask for a response. "
        "Return them as a numbered list.\n\n"
        f"{text[:3000]}"  # truncate input to keep the prompt within token limits
    )
    # Keep the try body to just the remote call — the only line that can
    # legitimately raise here.
    try:
        response = client.text_generation(prompt=prompt, max_new_tokens=500)
    except Exception as e:  # boundary: report to the UI and degrade to "no questions"
        st.error("❌ Failed to extract questions with LLM")
        st.error(str(e))
        return []
    # Strip only LEADING list markers ("1. ", "- ", "• ") so that digits or
    # periods that are part of the prompt text (e.g. "...budget for 2024")
    # survive; then drop lines that are empty once the marker is removed.
    cleaned = (line.lstrip("-•1234567890. ").rstrip() for line in response.split("\n"))
    return [line for line in cleaned if line]
147
 
148
  # =================== Format Retrieved Chunks ===================
149
  def format_docs(docs: List[Document]) -> str:
 
197
  elif uploaded_file.name.endswith(".txt"):
198
  uploaded_text = uploaded_file.read().decode("utf-8")
199
 
200
+ questions = extract_with_llm(uploaded_text)
201
  st.success(f"✅ Found {len(questions)} questions or headers.")
202
+ with st.expander("🧠 Extracted Prompts from Upload"):
203
+ st.write(questions)
204
 
205
  # Generate answers
206
  answers = []