brainsqueeze committed
Commit 68e9b80 · verified · 1 Parent(s): aed4c76

Update issuelab query technique


* quasi-sparse vector approach for issuelab
* summarize issuelab articles prior to LLM handoff
* pin transformers version to avoid major bug
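
For context, a minimal sketch of the quasi-sparse idea this commit introduces: the query is expanded into weighted tokens client-side and each sufficiently strong token becomes a boosted keyword clause. `SpladeEncoder.token_expand` and the boosted fields come from the diff below; the example query text and the 0.5 cut-off (standing in for SPARSE_ENCODING_SCORE_THRESHOLD) are illustrative.

from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder

encoder = SpladeEncoder()
tokens = encoder.token_expand("youth homelessness in new york")  # {token: weight}

# Each strong token becomes a boosted multi_match "should" clause.
should_clauses = [
    {"multi_match": {"query": token, "fields": ["title", "description", "content^0.3"], "boost": score}}
    for token, score in tokens.items()
    if score > 0.5
]
elastic_query = {"query": {"bool": {"should": should_clauses}}}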

ask_candid/base/retrieval/elastic.py CHANGED
@@ -21,14 +21,14 @@ def build_sparse_vector_query(
21
  ----------
22
  query : str
23
  Search context string
24
- fields : Tuple[str, ...]
25
  Semantic text field names
26
  inference_id : str, optional
27
  ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"
28
 
29
  Returns
30
  -------
31
- Dict[str, Any]
32
  """
33
 
34
  output = []
@@ -70,20 +70,20 @@ def build_sparse_vector_and_text_query(
70
  ----------
71
  query : str
72
  Search context string
73
- semantic_fields : Tuple[str]
74
  Semantic text field names
75
- highlight_fields: Tuple[str]
76
  Fields which relevant chunks will be helpful for the agent to read
77
- text_fields : Tuple[str]
78
  Regular text fields
79
- excluded_fields : Tuple[str]
80
  Fields to exclude from the source
81
  inference_id : str, optional
82
  ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"
83
 
84
  Returns
85
  -------
86
- Dict[str, Any]
87
  """
88
 
89
  output = []
@@ -149,7 +149,7 @@ def news_query_builder(
149
 
150
  Returns
151
  -------
152
- Dict[str, Any]
153
  """
154
 
155
  tokens = encoder.token_expand(query)
@@ -180,11 +180,79 @@ def news_query_builder(
180
  return elastic_query
181
 
182
 
183
  def multi_search_base(
184
  queries: list[dict[str, Any]],
185
  credentials: BaseElasticSearchConnection | BaseElasticAPIKeyCredential,
186
  timeout: int = 180
187
  ) -> Iterator[dict[str, Any]]:
188
  if isinstance(credentials, BaseElasticAPIKeyCredential):
189
  es = Elasticsearch(
190
  cloud_id=credentials.cloud_id,
 
21
  ----------
22
  query : str
23
  Search context string
24
+ fields : tuple[str, ...]
25
  Semantic text field names
26
  inference_id : str, optional
27
  ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"
28
 
29
  Returns
30
  -------
31
+ dict[str, Any]
32
  """
33
 
34
  output = []
 
70
  ----------
71
  query : str
72
  Search context string
73
+ semantic_fields : tuple[str]
74
  Semantic text field names
75
+ highlight_fields: tuple[str]
76
  Fields which relevant chunks will be helpful for the agent to read
77
+ text_fields : tuple[str]
78
  Regular text fields
79
+ excluded_fields : tuple[str]
80
  Fields to exclude from the source
81
  inference_id : str, optional
82
  ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"
83
 
84
  Returns
85
  -------
86
+ dict[str, Any]
87
  """
88
 
89
  output = []
 
149
 
150
  Returns
151
  -------
152
+ dict[str, Any]
153
  """
154
 
155
  tokens = encoder.token_expand(query)
 
180
  return elastic_query
181
 
182
 
183
+ def issuelab_query_builder(
184
+ query: str,
185
+ fields: tuple[str, ...],
186
+ highlight_fields: tuple[str, ...] | None,
187
+ encoder: SpladeEncoder,
188
+ ) -> dict[str, Any]:
189
+ """Builds a valid Elasticsearch query against IssueLab, simulating a sparse vector search via token expansion.
190
+
191
+ Parameters
192
+ ----------
193
+ query : str
194
+ Search context string
195
+
196
+ Returns
197
+ -------
198
+ dict[str, Any]
199
+ """
200
+
201
+ tokens = encoder.token_expand(query)
202
+
203
+ elastic_query = {
204
+ "_source": ["issuelab_id", "issuelab_url", "title", "description", "content"],
205
+ "query": {
206
+ "bool": {
207
+ # "filter": [
208
+ # # {"range": {"event_date": {"gte": f"now-{days_ago}d/d"}}},
209
+ # # {"range": {"insert_date": {"gte": f"now-{days_ago}d/d"}}},
210
+ # # {"range": {"article_trust_worthiness": {"gt": NEWS_TRUST_SCORE_THRESHOLD}}}
211
+ # ],
212
+ "should": []
213
+ }
214
+ },
215
+ "highlight": {
216
+ "fields": dict.fromkeys(highlight_fields or ("content", "description"), {})
217
+ }
218
+ }
219
+
220
+ for token, score in tokens.items():
221
+ if score > SPARSE_ENCODING_SCORE_THRESHOLD:
222
+ elastic_query["query"]["bool"]["should"].append({
223
+ "multi_match": {
224
+ "query": token,
225
+ "fields": fields,
226
+ "boost": score
227
+ }
228
+ })
229
+ return elastic_query
230
+
231
+
232
  def multi_search_base(
233
  queries: list[dict[str, Any]],
234
  credentials: BaseElasticSearchConnection | BaseElasticAPIKeyCredential,
235
  timeout: int = 180
236
  ) -> Iterator[dict[str, Any]]:
237
+ """Handles multi-search queries on a single cluster given the relevant credentials object.
238
+
239
+ Parameters
240
+ ----------
241
+ queries : list[dict[str, Any]]
242
+ `msearch` query object, (see: https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-msearch)
243
+ credentials : BaseElasticSearchConnection | BaseElasticAPIKeyCredential
244
+ timeout : int, optional, by default 180
245
+
246
+ Yields
247
+ ------
248
+ Iterator[dict[str, Any]]
249
+
250
+ Raises
251
+ ------
252
+ TypeError
253
+ Raised if invalid credentials are passed
254
+ """
255
+
256
  if isinstance(credentials, BaseElasticAPIKeyCredential):
257
  es = Elasticsearch(
258
  cloud_id=credentials.cloud_id,
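
A hedged usage sketch of the new builder together with multi_search_base; the field configuration, index name, and the NEWS_ELASTIC credentials come from other files in this commit (the quasi-sparse queries are routed to that cluster in run_search), and the query text is made up.

import ask_candid.base.retrieval.sources as S
from ask_candid.base.config.connections import NEWS_ELASTIC
from ask_candid.base.retrieval.elastic import issuelab_query_builder, multi_search_base
from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder

q = issuelab_query_builder(
    query="food insecurity research",
    fields=S.IssueLabConfig.semantic_fields,
    highlight_fields=S.IssueLabConfig.highlight_fields,
    encoder=SpladeEncoder(),
)
q["size"] = 1

# msearch bodies alternate between an index header and a query body
for response in multi_search_base(queries=[{"index": S.IssueLabConfig.index_name}, q], credentials=NEWS_ELASTIC):
    for hit in response.get("hits", {}).get("hits", []):
        print(hit["_score"], hit["_source"]["title"])
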
ask_candid/base/retrieval/sources.py CHANGED
@@ -25,9 +25,16 @@ CandidNewsConfig = ElasticSourceConfig(
25
  )
26
 
27
 
28
  IssueLabConfig = ElasticSourceConfig(
29
- index_name="search-semantic-issuelab-elser_ve2",
30
- semantic_fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
31
  )
32
 
33
 
 
25
  )
26
 
27
 
28
+ # IssueLabConfig = ElasticSourceConfig(
29
+ # index_name="search-semantic-issuelab-elser_ve2",
30
+ # semantic_fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
31
+ # )
32
+
33
  IssueLabConfig = ElasticSourceConfig(
34
+ index_name="issuelab_prod_data",
35
+ # semantic_fields=("title", "description", "content"),
36
+ semantic_fields=("title", "description", "content^0.3"),
37
+ highlight_fields=("description", "content")
38
  )
39
 
40
 
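The `content^0.3` entry relies on Elasticsearch's standard field^boost syntax, so the long content field is down-weighted relative to title and description once these fields land in the builder's multi_match clauses; an illustrative clause (weights are made up):

clause = {
    "multi_match": {
        "query": "housing",                                   # one expanded token
        "fields": ("title", "description", "content^0.3"),    # content matches count for less
        "boost": 0.82                                         # token weight from the encoder
    }
}
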
ask_candid/services/knowledge_base.py ADDED
@@ -0,0 +1,420 @@
1
+ from typing import Literal, Any
2
+ from collections.abc import Iterator, Iterable
3
+ from itertools import groupby
4
+ import logging
5
+
6
+ from langchain_core.documents import Document
7
+
8
+ from ask_candid.base.retrieval.elastic import (
9
+ build_sparse_vector_query,
10
+ build_sparse_vector_and_text_query,
11
+ news_query_builder,
12
+ issuelab_query_builder,
13
+ multi_search_base
14
+ )
15
+ from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
16
+ from ask_candid.base.retrieval.schemas import ElasticHitsResult
17
+ import ask_candid.base.retrieval.sources as S
18
+ from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA, NEWS_ELASTIC
19
+ from ask_candid.services.small_lm import CandidSmallLanguageModel
20
+
21
+ SourceNames = Literal[
22
+ "Candid Blog",
23
+ "Candid Help",
24
+ "Candid Learning",
25
+ "Candid News",
26
+ "IssueLab Research Reports",
27
+ "YouTube Training"
28
+ ]
29
+ sparse_encoder = SpladeEncoder()
30
+ logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
31
+ logger = logging.getLogger(__name__)
32
+ logger.setLevel(logging.INFO)
33
+
34
+
35
+ # TODO remove
36
+ def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
37
+ """Pads the relevant chunk of text with context before and after
38
+
39
+ Parameters
40
+ ----------
41
+ field_name : str
42
+ a field with the long text that was chunked into pieces
43
+ hit : ElasticHitsResult
44
+ context_length : int, optional
45
+ length of text to add before and after the chunk, by default 1024
46
+ add_context : bool, optional
47
+ Set to `False` to skip expanding the text context by searching for the Elastic inner hit inside the larger document
48
+ , by default True
49
+
50
+ Returns
51
+ -------
52
+ str
53
+ longer chunks stuffed together
54
+ """
55
+
56
+ chunks = []
57
+ # NOTE chunks have tokens, long text is a string, but may contain html which affects tokenization
58
+ long_text = hit.source.get(field_name) or ""
59
+ long_text = long_text.lower()
60
+
61
+ inner_hits_field = f"embeddings.{field_name}.chunks"
62
+ found_chunks = hit.inner_hits.get(inner_hits_field, {}) if hit.inner_hits else None
63
+ if found_chunks:
64
+ for h in found_chunks.get("hits", {}).get("hits") or []:
65
+ chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]
66
+
67
+ # cutting the middle because we may have tokenizing artifacts there
68
+ chunk = chunk[3: -3]
69
+
70
+ if add_context:
71
+ # Find the start and end indices of the chunk in the large text
72
+ start_index = long_text.find(chunk[:20])
73
+
74
+ # Chunk is found
75
+ if start_index != -1:
76
+ end_index = start_index + len(chunk)
77
+ pre_start_index = max(0, start_index - context_length)
78
+ post_end_index = min(len(long_text), end_index + context_length)
79
+ chunks.append(long_text[pre_start_index:post_end_index])
80
+ else:
81
+ chunks.append(chunk)
82
+ return '\n\n'.join(chunks)
83
+
84
+
85
+ def generate_queries(
86
+ query: str,
87
+ sources: list[SourceNames],
88
+ news_days_ago: int = 60
89
+ ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
90
+ """Builds Elastic queries against indices which do or do not support sparse vector queries.
91
+
92
+ Parameters
93
+ ----------
94
+ query : str
95
+ Text describing a user's question or a description of investigative work which requires support from Candid's
96
+ knowledge base
97
+ sources : list[SourceNames]
98
+ One or more sources of knowledge from different areas at Candid.
99
+ * Candid Blog: Blog posts from Candid staff and trusted partners intended to help those in the sector or
100
+ illuminate ongoing work
101
+ * Candid Help: Candid FAQs to help users get started with Candid's product platform and learning resources
102
+ * Candid Learning: Training documents from Candid's subject matter experts
103
+ * Candid News: News articles and press releases about real-time activity in the philanthropic sector
104
+ * IssueLab Research Reports: Academic research reports about the social/philanthropic sector
105
+ * YouTube Training: Transcripts from video-based training seminars from Candid's subject matter experts
106
+ news_days_ago : int, optional
107
+ How many days in the past to search for news articles, if a user is asking for recent trends then this value
108
+ should be set lower (roughly 10), by default 60
109
+
110
+ Returns
111
+ -------
112
+ tuple[list[dict[str, Any]], list[dict[str, Any]]]
113
+ (sparse vector queries, queries for indices which do not support sparse vectors)
114
+ """
115
+
116
+ vector_queries = []
117
+ quasi_vector_queries = []
118
+
119
+ for source_name in sources:
120
+ if source_name == "Candid Blog":
121
+ q = build_sparse_vector_query(query=query, fields=S.CandidBlogConfig.semantic_fields)
122
+ q["_source"] = {"excludes": ["embeddings"]}
123
+ q["size"] = 5
124
+ vector_queries.extend([{"index": S.CandidBlogConfig.index_name}, q])
125
+ elif source_name == "Candid Help":
126
+ q = build_sparse_vector_query(query=query, fields=S.CandidHelpConfig.semantic_fields)
127
+ q["_source"] = {"excludes": ["embeddings"]}
128
+ q["size"] = 5
129
+ vector_queries.extend([{"index": S.CandidHelpConfig.index_name}, q])
130
+ elif source_name == "Candid Learning":
131
+ q = build_sparse_vector_query(query=query, fields=S.CandidLearningConfig.semantic_fields)
132
+ q["_source"] = {"excludes": ["embeddings"]}
133
+ q["size"] = 5
134
+ vector_queries.extend([{"index": S.CandidLearningConfig.index_name}, q])
135
+ elif source_name == "Candid News":
136
+ q = news_query_builder(
137
+ query=query,
138
+ fields=S.CandidNewsConfig.semantic_fields,
139
+ encoder=sparse_encoder,
140
+ days_ago=news_days_ago
141
+ )
142
+ q["size"] = 5
143
+ quasi_vector_queries.extend([{"index": S.CandidNewsConfig.index_name}, q])
144
+ elif source_name == "IssueLab Research Reports":
145
+ # q = build_sparse_vector_query(query=query, fields=S.IssueLabConfig.semantic_fields)
146
+ # q["_source"] = {"excludes": ["embeddings"]}
147
+ # q["size"] = 1
148
+ # vector_queries.extend([{"index": S.IssueLabConfig.index_name}, q])
149
+
150
+ q = issuelab_query_builder(
151
+ query=query,
152
+ fields=S.IssueLabConfig.semantic_fields,
153
+ highlight_fields=S.IssueLabConfig.highlight_fields,
154
+ encoder=sparse_encoder,
155
+ )
156
+ q["size"] = 1
157
+ quasi_vector_queries.extend([{"index": S.IssueLabConfig.index_name}, q])
158
+ elif source_name == "YouTube Training":
159
+ q = build_sparse_vector_and_text_query(
160
+ query=query,
161
+ semantic_fields=S.YoutubeConfig.semantic_fields,
162
+ text_fields=S.YoutubeConfig.text_fields,
163
+ highlight_fields=S.YoutubeConfig.highlight_fields,
164
+ excluded_fields=S.YoutubeConfig.excluded_fields
165
+ )
166
+ q["size"] = 5
167
+ vector_queries.extend([{"index": S.YoutubeConfig.index_name}, q])
168
+
169
+ return vector_queries, quasi_vector_queries
170
+
171
+
172
+ def run_search(
173
+ vector_searches: list[dict[str, Any]] | None = None,
174
+ non_vector_searches: list[dict[str, Any]] | None = None,
175
+ ) -> list[ElasticHitsResult]:
176
+ """Elastic query runner which executes both sparse vector and quasi-sparse vector queries and concatenates results.
177
+ This does not include re-ranking.
178
+
179
+ Parameters
180
+ ----------
181
+ vector_searches : list[dict[str, Any]] | None, optional
182
+ Sparse vector multi-search queries, by default None
183
+ non_vector_searches : list[dict[str, Any]] | None, optional
184
+ Keyword-based multi-search queries, by default None
185
+
186
+ Returns
187
+ -------
188
+ list[ElasticHitsResult]
189
+ Concatenated results
190
+ """
191
+
192
+ def _msearch_response_generator(responses: Iterable[dict[str, Any]]) -> Iterator[ElasticHitsResult]:
193
+ for query_group in responses:
194
+ for h in query_group.get("hits", {}).get("hits", []):
195
+ inner_hits = h.get("inner_hits", {})
196
+
197
+ if not inner_hits and "news" in h.get("_index"):
198
+ inner_hits = {"text": h.get("_source", {}).get("content")}
199
+ if not inner_hits and "issuelab" in h.get("_index"):
200
+ inner_hits = {"text": h.get("_source", {}).get("content")}
201
+
202
+ yield ElasticHitsResult(
203
+ index=h["_index"],
204
+ id=h["_id"],
205
+ score=h["_score"],
206
+ source=h["_source"],
207
+ inner_hits=inner_hits,
208
+ highlight=h.get("highlight", {})
209
+ )
210
+
211
+ results = []
212
+ if vector_searches is not None and len(vector_searches) > 0:
213
+ hits = multi_search_base(queries=vector_searches, credentials=SEMANTIC_ELASTIC_QA)
214
+ for hit in _msearch_response_generator(responses=hits):
215
+ results.append(hit)
216
+ if non_vector_searches is not None and len(non_vector_searches) > 0:
217
+ hits = multi_search_base(queries=non_vector_searches, credentials=NEWS_ELASTIC)
218
+ for hit in _msearch_response_generator(responses=hits):
219
+ results.append(hit)
220
+ return results
221
+
222
+
223
+ def retrieved_text(hits: dict[str, Any]) -> str:
224
+ """Extracts retrieved sub-texts from documents which are strong hits from semantic queries for the purpose of
225
+ re-scoring by a secondary language model.
226
+
227
+ Parameters
228
+ ----------
229
+ hits : dict[str, Any]
230
+
231
+ Returns
232
+ -------
233
+ str
234
+ """
235
+
236
+ nlp = CandidSmallLanguageModel()
237
+
238
+ text = []
239
+ for _, v in hits.items():
240
+ if _ == "text":
241
+ s = nlp.summarize(v, top_k=3)
242
+ text.append(s.summary)
243
+ # text.append(v)
244
+ continue
245
+
246
+ for h in (v.get("hits", {}).get("hits") or []):
247
+ for _, field in h.get("fields", {}).items():
248
+ for chunk in field:
249
+ if chunk.get("chunk"):
250
+ text.extend(chunk["chunk"])
251
+ return '\n'.join(text)
252
+
253
+
254
+ def reranker(
255
+ query_results: Iterable[ElasticHitsResult],
256
+ search_text: str | None = None,
257
+ max_num_results: int = 5
258
+ ) -> Iterator[ElasticHitsResult]:
259
+ """Reranks Elasticsearch hits coming from multiple indices/queries which may have scores on different scales.
260
+ This will shuffle results
261
+
262
+ Parameters
263
+ ----------
264
+ query_results : Iterable[ElasticHitsResult]
265
+
266
+ Yields
267
+ ------
268
+ Iterator[ElasticHitsResult]
269
+ """
270
+
271
+ results: list[ElasticHitsResult] = []
272
+ texts: list[str] = []
273
+ for _, data in groupby(query_results, key=lambda x: x.index):
274
+ data = list(data) # noqa: PLW2901
275
+ max_score = max(data, key=lambda x: x.score).score
276
+ min_score = min(data, key=lambda x: x.score).score
277
+
278
+ for d in data:
279
+ d.score = (d.score - min_score) / (max_score - min_score + 1e-9)
280
+ results.append(d)
281
+
282
+ if search_text:
283
+ if d.inner_hits:
284
+ text = retrieved_text(d.inner_hits)
285
+ if d.highlight:
286
+ highlight_texts = []
287
+ for k, v in d.highlight.items():
288
+ highlight_texts.append('\n'.join(v))
289
+ text = '\n'.join(highlight_texts)
290
+ texts.append(text)
291
+
292
+ if search_text and len(texts) == len(results) and len(texts) > 1:
293
+ logger.info("Re-ranking %d retrieval results", len(results))
294
+ scores = sparse_encoder.query_reranking(query=search_text, documents=texts)
295
+ for r, s in zip(results, scores):
296
+ r.score = s
297
+
298
+ yield from sorted(results, key=lambda x: x.score, reverse=True)[:max_num_results]
299
+
300
+
301
+ def process_hit(hit: ElasticHitsResult) -> Document:
302
+ """Process a raw Elasticsearch document into a structured langchain `Document` object.
303
+
304
+ Parameters
305
+ ----------
306
+ hit : ElasticHitsResult
307
+
308
+ Returns
309
+ -------
310
+ Document
311
+
312
+ Raises
313
+ ------
314
+ ValueError
315
+ Raised if a result from an unknown index is passed in
316
+ """
317
+
318
+ nlp = CandidSmallLanguageModel()
319
+
320
+ if "issuelab-elser" in hit.index:
321
+ doc = Document(
322
+ page_content='\n\n'.join([
323
+ hit.source.get("combined_item_description", ""),
324
+ hit.source.get("description", ""),
325
+ hit.source.get("combined_issuelab_findings", ""),
326
+ get_context("content", hit, context_length=12)
327
+ ]),
328
+ metadata={
329
+ "title": hit.source["title"],
330
+ "source": "IssueLab",
331
+ "source_id": hit.source["resource_id"],
332
+ "url": hit.source.get("permalink", "")
333
+ }
334
+ )
335
+ elif "issuelab" in hit.index:
336
+ content_summary = ""
337
+ if hit.source.get("content", ""):
338
+ content_summary = nlp.summarize(hit.source.get("content", ""), top_k=20).summary
339
+
340
+ doc = Document(
341
+ page_content='\n\n'.join([hit.source.get("description", ""), content_summary]),
342
+ metadata={
343
+ "title": hit.source["title"],
344
+ "source": "IssueLab",
345
+ "source_id": hit.source["issuelab_id"],
346
+ "url": hit.source.get("issuelab_url", "")
347
+ }
348
+ )
349
+ elif "youtube" in hit.index:
350
+ highlight = hit.highlight or {}
351
+ doc = Document(
352
+ page_content='\n\n'.join([
353
+ hit.source.get("title", ""),
354
+ hit.source.get("semantic_description", ""),
355
+ ' '.join(highlight.get("semantic_cc_text", []))
356
+ ]),
357
+ metadata={
358
+ "title": hit.source.get("title", ""),
359
+ "source": "Candid YouTube",
360
+ "source_id": hit.source['video_id'],
361
+ "url": f"https://www.youtube.com/watch?v={hit.source['video_id']}"
362
+ }
363
+ )
364
+ elif "candid-blog" in hit.index:
365
+ doc = Document(
366
+ page_content='\n\n'.join([
367
+ hit.source.get("title", ""),
368
+ hit.source.get("excerpt", ""),
369
+ get_context("content", hit, context_length=12, add_context=False),
370
+ get_context("authors_text", hit, context_length=12, add_context=False),
371
+ hit.source.get("title_summary_tags", "")
372
+ ]),
373
+ metadata={
374
+ "title": hit.source.get("title", ""),
375
+ "source": "Candid Blog",
376
+ "source_id": hit.source["id"],
377
+ "url": hit.source["link"]
378
+ }
379
+ )
380
+ elif "candid-learning" in hit.index:
381
+ doc = Document(
382
+ page_content='\n\n'.join([
383
+ hit.source.get("title", ""),
384
+ hit.source.get("staff_recommendations", ""),
385
+ hit.source.get("training_topics", ""),
386
+ get_context("content", hit, context_length=12)
387
+ ]),
388
+ metadata={
389
+ "title": hit.source["title"],
390
+ "source": "Candid Learning",
391
+ "source_id": hit.source["post_id"],
392
+ "url": hit.source.get("url", "")
393
+ }
394
+ )
395
+ elif "candid-help" in hit.index:
396
+ doc = Document(
397
+ page_content='\n\n'.join([
398
+ hit.source.get("combined_article_description", ""),
399
+ get_context("content", hit, context_length=12)
400
+ ]),
401
+ metadata={
402
+ "title": hit.source.get("title", ""),
403
+ "source": "Candid Help",
404
+ "source_id": hit.source["id"],
405
+ "url": hit.source.get("link", "")
406
+ }
407
+ )
408
+ elif "news" in hit.index:
409
+ doc = Document(
410
+ page_content='\n\n'.join([hit.source.get("title", ""), hit.source.get("content", "")]),
411
+ metadata={
412
+ "title": hit.source.get("title", ""),
413
+ "source": hit.source.get("site_name") or "Candid News",
414
+ "source_id": hit.source["id"],
415
+ "url": hit.source.get("link", "")
416
+ }
417
+ )
418
+ else:
419
+ raise ValueError(f"Unknown source result from index {hit.index}")
420
+ return doc
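
A minimal end-to-end sketch of how the pieces of this new module compose; the question text is made up, everything else is defined in the file above.

from ask_candid.services.knowledge_base import generate_queries, run_search, reranker, process_hit

question = "What does recent research say about rural library funding?"
vector_q, quasi_q = generate_queries(query=question, sources=["IssueLab Research Reports", "Candid News"])
hits = run_search(vector_searches=vector_q, non_vector_searches=quasi_q)

# Normalize/re-rank across indices, then convert the survivors into langchain Documents
for doc in (process_hit(h) for h in reranker(hits, search_text=question, max_num_results=5)):
    print(doc.metadata["source"], "-", doc.metadata["title"])
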
ask_candid/services/small_lm.py CHANGED
@@ -8,12 +8,16 @@ from ask_candid.base.lambda_base import LambdaInvokeBase
8
 
9
  @dataclass(slots=True)
10
  class Encoding:
11
  inputs: list[str]
12
  vectors: torch.Tensor
13
 
14
 
15
  @dataclass(slots=True)
16
  class SummaryItem:
17
  rank: int
18
  score: float
19
  text: str
@@ -21,6 +25,8 @@ class SummaryItem:
21
 
22
  @dataclass(slots=True)
23
  class TextSummary:
24
  snippets: list[SummaryItem]
25
 
26
  @property
@@ -28,7 +34,7 @@ class TextSummary:
28
  return ' '.join([_.text for _ in self.snippets])
29
 
30
 
31
- class CandidSLM(LambdaInvokeBase):
32
  """Wrapper around Candid's custom small language model.
33
  For more details see https://dev.azure.com/guidestar/DataScience/_git/graph-ai?path=/releases/language.
34
  This service includes:
@@ -44,7 +50,7 @@ class CandidSLM(LambdaInvokeBase):
44
  AWS secret key, by default None
45
  """
46
 
47
- class Tasks(Enum): # pylint: disable=missing-class-docstring
48
  ENCODE = "/encode"
49
  DOCUMENT_SUMMARIZE = "/document/summarize"
50
  DOCUMENT_NER_SALIENCE = "/document/entitySalience"
 
8
 
9
  @dataclass(slots=True)
10
  class Encoding:
11
+ """Text encoding vector response
12
+ """
13
  inputs: list[str]
14
  vectors: torch.Tensor
15
 
16
 
17
  @dataclass(slots=True)
18
  class SummaryItem:
19
+ """A single summary object
20
+ """
21
  rank: int
22
  score: float
23
  text: str
 
25
 
26
  @dataclass(slots=True)
27
  class TextSummary:
28
+ """Text summarization response
29
+ """
30
  snippets: list[SummaryItem]
31
 
32
  @property
 
34
  return ' '.join([_.text for _ in self.snippets])
35
 
36
 
37
+ class CandidSmallLanguageModel(LambdaInvokeBase):
38
  """Wrapper around Candid's custom small language model.
39
  For more details see https://dev.azure.com/guidestar/DataScience/_git/graph-ai?path=/releases/language.
40
  This service includes:
 
50
  AWS secret key, by default None
51
  """
52
 
53
+ class Tasks(Enum): # noqa: D106
54
  ENCODE = "/encode"
55
  DOCUMENT_SUMMARIZE = "/document/summarize"
56
  DOCUMENT_NER_SALIENCE = "/document/entitySalience"
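
The renamed CandidSmallLanguageModel is what the knowledge-base service uses to summarize IssueLab content before the LLM handoff; a sketch assuming summarize returns the TextSummary dataclass defined above (the article text is a placeholder).

from ask_candid.services.small_lm import CandidSmallLanguageModel

nlp = CandidSmallLanguageModel()
summary = nlp.summarize("<full IssueLab article text>", top_k=20)

print(summary.summary)                  # top-ranked snippets joined into a single string
for item in summary.snippets:           # each SummaryItem carries rank, score, text
    print(item.rank, round(item.score, 3), item.text[:80])
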
ask_candid/tools/grants.py ADDED
@@ -0,0 +1,113 @@
1
+ import logging
2
+
3
+ from langchain_core.tools import tool
4
+ import httpx
5
+
6
+ from ask_candid.tools.utils import format_candid_profile_link
7
+ from ask_candid.base.config.rest import SEARCH
8
+
9
+ logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
10
+ logger = logging.getLogger(__name__)
11
+ logger.setLevel(logging.ERROR)
12
+
13
+
14
+ @tool
15
+ def grants_search(
16
+ query: str,
17
+ subject_codes: str | None = None,
18
+ populations_served_codes: str | None = None,
19
+ geonameids_of_geographies_served: str | None = None
20
+ ) -> list[dict[str, str | int | float | None]] | str:
21
+ """Search for historical grants to find context about what is happening in the sector, and what organizations are
22
+ involved. This is intended for historical research purposes and contextualization. If trying to recommend
23
+ funders then please use the dedicated funder recommendation tool instead of this. Funder recommendations use grants
24
+ and additional contexts, as well as a carefully trained graph neural network to provide targeted recommendations.
25
+
26
+ Another important note is that this tool only returns up to 25 top relevant grant results and should never be used
27
+ to make broad generalizations.
28
+
29
+ Queries are natural text, and the retrieval mechanism is a hybrid approach of keywords and sparse vector searches
30
+ over fields which describe the activity and purpose of the grant.
31
+
32
+ While extra subject codes, populations served codes, and geography IDs for where the grant is serving is not
33
+ required, grants may become more specific the more information can be provided.
34
+
35
+ Subjects and populations can be determined using the `autocode` tool if the requester can supply a description of
36
+ the program they are seeking funding for.
37
+
38
+ Geographies can be determined using the geo detection tool if the requester can supply a description of the program
39
+ they are seeking funding for.
40
+
41
+ Parameters
42
+ ----------
43
+ query : str
44
+ Text describing a user's question or a description of investigative work which requires support from Candid's
45
+ grants knowledge base
46
+ subject_codes : str | None, optional
47
+ Subject codes from Candid's PCS taxonomy, comma separated, by default None
48
+ populations_served_codes : str | None, optional
49
+ Population groups served codes from Candid's PCS taxonomy, comma separated, by default None
50
+ geonameids_of_geographies_served : str | None, optional
51
+ Geonames ID values for geographies served by the requester's program, comma separated, by default None
52
+
53
+ Examples
54
+ --------
55
+ >>> grants_search(query='homeless shelters in new york')
56
+ >>> grants_search(
57
+ query='homeless shelters in new york',
58
+ subject_codes='SS050000, SS000000,SB050000',
59
+ populations_served_codes='PJ050100',
60
+ geonameids_of_geographies_served='4094212,4094212'
61
+ )
62
+
63
+ Returns
64
+ -------
65
+ list[dict[str, str | int | float | None]] | str
66
+ Array of relevant grants and information about the organizations involved
67
+ If output is a string then that means there was some error, and retry should be considered
68
+ """
69
+
70
+ payload = {"query": query, "rowCount": 25}
71
+ if subject_codes is not None:
72
+ payload["SubjectArea"] = subject_codes.split(',')
73
+ if populations_served_codes is not None:
74
+ payload["PopulationServed"] = populations_served_codes.split(',')
75
+ if geonameids_of_geographies_served:
76
+ payload["GeographicArea"] = geonameids_of_geographies_served.split(',')
77
+
78
+ with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
79
+ r = client.get(
80
+ url=SEARCH.endpoint("v1/grants/discovery"),
81
+ params=payload,
82
+ headers={**SEARCH.header} # type: ignore
83
+ )
84
+
85
+ if r.status_code != 200:
86
+ logger.error("Error calling grants search API %s. Error: %s", str(r.request.url), r.reason_phrase)
87
+ return f"Error calling grants search. Error: {r.reason_phrase}"
88
+
89
+ data: dict = r.json()
90
+
91
+ output = []
92
+ for grant in data.get("grants") or []:
93
+ working_on, serving = [], []
94
+ for facet, data in grant["pcsV3"].items():
95
+ if facet == "subject":
96
+ working_on.extend([code["name"].lower() for code in data["value"]])
97
+ elif facet == "population":
98
+ serving.extend([code["name"].lower() for code in data["value"]])
99
+
100
+ output.append({
101
+ "funder_id": grant["grantmakerId"],
102
+ "funder_profile_link": format_candid_profile_link(grant["grantmakerId"]),
103
+ "funder_name": grant["grantmakerName"],
104
+ "recipient_id": grant["recipientId"],
105
+ "recipient_profile_link": format_candid_profile_link(grant["recipientId"]),
106
+ "recipient_name": grant["recipientName"],
107
+ "fiscal_year": grant["fiscalYear"],
108
+ "amount_usd": grant["amountUsd"],
109
+ "description": grant["text"],
110
+ "working_on": f"Working on {', '.join(working_on)}",
111
+ "serving": f"Serving population groups {', '.join(serving)}",
112
+ })
113
+ return output
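
Because grants_search is a LangChain @tool, it can be exercised directly with .invoke; a sketch reusing the docstring's example arguments (a string return signals an API error and a retry).

from ask_candid.tools.grants import grants_search

results = grants_search.invoke({
    "query": "homeless shelters in new york",
    "subject_codes": "SS050000,SB050000",
    "populations_served_codes": "PJ050100",
})
if isinstance(results, str):
    print("API error, consider retrying:", results)
else:
    for grant in results[:3]:
        print(grant["fiscal_year"], grant["funder_name"], "->", grant["recipient_name"])
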
ask_candid/tools/letter_gen.py ADDED
@@ -0,0 +1,230 @@
1
+ from dataclasses import dataclass, field
2
+
3
+ from langchain_core.tools import tool
4
+ import httpx
5
+
6
+ from ask_candid.base.config.rest import GOLDEN_ORG, LOI_WRITER
7
+
8
+
9
+ @dataclass
10
+ class LetterOfInterest:
11
+ opening: str = field(default="")
12
+ org_desc: str = field(default="")
13
+ need: str = field(default="")
14
+ project: str = field(default="")
15
+ request: str = field(default="")
16
+ conclusion: str = field(default="")
17
+
18
+ @property
19
+ def letter(self):
20
+ return f"""{self.opening}
21
+
22
+ {self.org_desc}
23
+
24
+ {self.need}
25
+
26
+ {self.project}
27
+
28
+ {self.request}
29
+
30
+ {self.conclusion}
31
+ """
32
+
33
+
34
+ @tool
35
+ def estimate_budget(
36
+ nonprofit_id: int,
37
+ funder_id: int,
38
+ project_description: str,
39
+ # ctx: Context
40
+ ) -> str:
41
+ """This is an optional tool for estimating project budgets. Some users will already know what their budget is, or
42
+ know how much money they are seeking from a grant, in which case this tool should not be used.
43
+
44
+ This tool also provides guidance on setting a budget, and ultimately the user needs to decide based on the output
45
+ from this tool
46
+
47
+ Parameters
48
+ ----------
49
+ nonprofit_id : int
50
+ The unique identifier of the requesting organization. This will need to be found from a search using inputs
51
+ elicited from the requester
52
+ funder_id : int
53
+ The unique identifier of the funding organization which may be awarding a grant to the requester.
54
+ This will need to be found from a search using inputs elicited from the requester, or from recommendations
55
+ project_description : str
56
+ Natural language text describing the project/program that the user is requesting funding for
57
+
58
+ Returns
59
+ -------
60
+ str
61
+ Budget guidance, including context on the funder's ability to provide the budget in question
62
+ """
63
+
64
+ recip_data = httpx.get(
65
+ url=GOLDEN_ORG.endpoint("v1/organization"),
66
+ params={"id": nonprofit_id},
67
+ headers={**GOLDEN_ORG.header}, # type: ignore
68
+ timeout=30
69
+ ).json().get("document_data", {}).get("preferred_data", {}).get("data", {})
70
+ funder_data = httpx.get(
71
+ url=GOLDEN_ORG.endpoint("v1/organization"),
72
+ params={"id": funder_id},
73
+ headers={**GOLDEN_ORG.header}, # type: ignore
74
+ timeout=30
75
+ ).json().get("document_data", {}).get("preferred_data", {}).get("data", {})
76
+ return httpx.post(
77
+ url=LOI_WRITER.endpoint("budget"),
78
+ json={
79
+ "recipient_candid_entity_id": nonprofit_id,
80
+ "program_description": project_description,
81
+ "recipient_data": recip_data,
82
+ "funder_data": funder_data
83
+ },
84
+ headers={**LOI_WRITER.header}, # type: ignore
85
+ timeout=30
86
+ ).json().get("response", "No budget could be estimated")
87
+
88
+
89
+ @tool
90
+ def draft_loi(
91
+ nonprofit_id: int,
92
+ funder_id: int,
93
+ project_description: str,
94
+ budget: int,
95
+ ) -> str:
96
+ """Generate a letter of interest/intent from a chain-of-thought prompt chain using Candid's golden data and any
97
+ inputs specified by the user, and/or recommended content.
98
+
99
+ The output of this tool is the actual letter draft, please do not make changes to it other than adding headers
100
+ and/or footers.
101
+
102
+ Parameters
103
+ ----------
104
+ nonprofit_id : int
105
+ The unique identifier of the requesting organization. This will need to be found from a search using inputs
106
+ elicited from the requester
107
+ funder_id : int
108
+ The unique identifier of the funding organization which may be awarding a grant to the requester.
109
+ This will need to be found from a search using inputs elicited from the requester, or from recommendations
110
+ project_description : str
111
+ Natural language text describing the project/program that the user is requesting funding for
112
+ budget : int
113
+ The dollar amount (in USD) that the user is requesting for funding. This should be specified by the user,
114
+ prompt for this if needed.
115
+
116
+ Returns
117
+ -------
118
+ str
119
+ Formatted letter of interest
120
+ """
121
+
122
+ client = httpx.Client(headers={**LOI_WRITER.header}, timeout=30, base_url=LOI_WRITER.url) # type: ignore
123
+
124
+ def _make_request(ept: str, payload: dict):
125
+ # return httpx.get(
126
+ # url=LOI_WRITER.endpoint(ept),
127
+ # params=payload,
128
+ # headers={**LOI_WRITER.header}, # type: ignore
129
+ # timeout=30
130
+ # ).json().get("response", "")
131
+
132
+ return client.get(url=LOI_WRITER.endpoint(ept), params=payload).json().get("response", "")
133
+
134
+ data = _make_request(
135
+ ept="organization/autofill",
136
+ payload={"recipient_candid_entity_id": nonprofit_id, "funder_candid_entity_id": funder_id}
137
+ )
138
+
139
+ recip: dict = data.get("recipient_data", {})
140
+ funder: dict = data.get("funder_data", {})
141
+ pair_history: str = data.get("funding_history_text", "")
142
+
143
+ sections = (
144
+ ("opening", "writer/opening"),
145
+ ("organization description", "writer/org"),
146
+ ("need statement", "writer/need"),
147
+ ("project description", "writer/project"),
148
+ ("funding request", "writer/fund"),
149
+ ("conclusion", "writer/conclusion")
150
+ )
151
+
152
+ output = LetterOfInterest()
153
+ for _, (section, endpoint) in enumerate(sections, start=1):
154
+ if section == "opening":
155
+ output.opening = _make_request(
156
+ ept=endpoint,
157
+ payload={
158
+ "funder_name": [
159
+ n["name"] for n in funder.get("org_data", {}).get("names", [])
160
+ if n["name_type"] == "main"
161
+ ][0],
162
+ "recipient_name": [
163
+ n["name"] for n in recip.get("org_data", {}).get("names", [])
164
+ if n["name_type"] == "main"
165
+ ][0],
166
+ "project_purpose": project_description,
167
+ "amount": budget,
168
+ "prior_contact": None,
169
+ "connection": None
170
+ }
171
+ )
172
+ elif section == "organization description":
173
+ output.org_desc = _make_request(
174
+ ept=endpoint,
175
+ payload={
176
+ "opening": output.opening,
177
+ "history": pair_history,
178
+ "recipient_mission_statement": recip.get("mission_statement_text", ""),
179
+ "capacity": recip.get("capacity_text", ""),
180
+ "path": None,
181
+ "accomplishment": recip.get("data_text", "")
182
+ }
183
+ )
184
+ elif section == "need statement":
185
+ output.need = httpx.get(
186
+ url=GOLDEN_ORG.endpoint(endpoint),
187
+ params={
188
+ "recipient_desc": output.org_desc,
189
+ "funder_mission_statement": funder.get("mission_statement_text", ""),
190
+ "target": None,
191
+ "data": None,
192
+ },
193
+ headers={**GOLDEN_ORG.header}, # type: ignore
194
+ timeout=30
195
+ ).json().get("response", "")
196
+ elif section == "project description":
197
+ output.project = _make_request(
198
+ ept=endpoint,
199
+ payload={
200
+ "need": output.need,
201
+ "projects": project_description,
202
+ "desired_objectives": None,
203
+ "major_activities": None,
204
+ "key_staff": None,
205
+ "stand_out": None,
206
+ "success": None
207
+ }
208
+ )
209
+ elif section == "funding request":
210
+ output.request = _make_request(
211
+ ept=endpoint,
212
+ payload={
213
+ "project_desc": output.project,
214
+ "amount": budget,
215
+ "funding_history": pair_history,
216
+ "other_funding": None,
217
+ }
218
+ )
219
+ elif section == "conclusion":
220
+ output.conclusion = _make_request(
221
+ ept=endpoint,
222
+ payload={
223
+ "funding_request": output.request,
224
+ "project_desc": output.project,
225
+ "follow_up": recip.get("contact_text", ""),
226
+ }
227
+ )
228
+
229
+ client.close()
230
+ return output.letter
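
A sketch of how the two tools are meant to chain for a letter draft; the entity IDs are placeholders that would normally come from an organization search, and both tools are invoked via the LangChain .invoke interface.

from ask_candid.tools.letter_gen import estimate_budget, draft_loi

nonprofit_id, funder_id = 1234567, 7654321   # placeholder IDs, normally found via organization search
program = "After-school literacy program for middle-school students"

guidance = estimate_budget.invoke({
    "nonprofit_id": nonprofit_id, "funder_id": funder_id, "project_description": program
})
letter = draft_loi.invoke({
    "nonprofit_id": nonprofit_id, "funder_id": funder_id,
    "project_description": program, "budget": 50_000   # user-confirmed amount, optionally informed by `guidance`
})
print(letter)
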
ask_candid/tools/nlp.py ADDED
@@ -0,0 +1,83 @@
1
+ from typing import Any
2
+ import logging
3
+
4
+ from langchain_core.tools import tool
5
+ import httpx
6
+
7
+ from ask_candid.base.utils import retry_on_status
8
+ from ask_candid.base.config.rest import AUTOCODING, DOCUMENT
9
+
10
+ logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
11
+ logger = logging.getLogger(__name__)
12
+ logger.setLevel(logging.ERROR)
13
+
14
+
15
+ @retry_on_status(num_retries=3)
16
+ def get_with_retries(url: str, payload: dict[str, Any] | None, headers: dict[str, str] | None):
17
+ with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
18
+ return client.get(url=url, params=payload, headers=headers)
19
+
20
+
21
+ @tool
22
+ def autocode(text: str) -> dict[str, list] | str:
23
+ """Uses natural language processing to align some input text to Candid's taxonomy: https://taxonomy.candid.org.
24
+ The taxonomy describes activity in the social and philanthropic sectors.
25
+
26
+ Parameters
27
+ ----------
28
+ text : str
29
+ Text describing work in the social sector. This should be related to the social and/or philanthropic sector.
30
+
31
+ Returns
32
+ -------
33
+ dict[str, list] | str
34
+ Taxonomy responses. The keys of the dictionary are individual taxonomy facets, and the items in the dictionary
35
+ are each term which the NLP model has determined is relevant giving the input text. This also includes
36
+ confidence score.
37
+ """
38
+
39
+ r = httpx.get(
40
+ url=AUTOCODING.endpoint("predict"),
41
+ params={"text": text},
42
+ headers={**AUTOCODING.header} # type: ignore
43
+ )
44
+
45
+ if r.status_code != 200:
46
+ logger.error("Error calling autocoding API %s. Error: %s", str(r.request.url), r.reason_phrase)
47
+ return f"Error calling autocoding. Error: {r.reason_phrase}"
48
+
49
+ data: dict = r.json().get("data", {})
50
+ return {k: v for k, v in data.items() if k in {"subject", "population"}}
51
+
52
+
53
+ @tool
54
+ def geo_detect(text: str) -> list[dict[str, Any]] | str:
55
+ """Uses natural language processing to find and match named geographies found in the supplied text. The output
56
+ will supply identified geographies from [Geonames](https://www.geonames.org/).
57
+
58
+ Parameters
59
+ ----------
60
+ text : str
61
+ Text describing work in the social sector. This should be related to the social and/or philanthropic sector.
62
+
63
+ Returns
64
+ -------
65
+ list[dict[str, Any]] | str
66
+ Matched geographies responses. This is an array of JSON objects which contain the `name` of the geography as it
67
+ appeared in the supplied text, and the best match to a Geonames geography. For many Candid knowledge tools the
68
+ `geonames_id` value will be most useful.
69
+ If output is a string then that means there was some error, and retry should be considered
70
+ """
71
+
72
+ r = get_with_retries(
73
+ url=DOCUMENT.endpoint("entities/geographies"),
74
+ payload={"text": text, "only_best_match": True},
75
+ headers={**DOCUMENT.header}
76
+ )
77
+ assert isinstance(r, httpx.Response)
78
+ if r.status_code != 200:
79
+ logger.error("Error calling geo detection API %s. Error: %s", str(r.request.url), r.reason_phrase)
80
+ return f"Error calling geo detection. Error: {r.reason_phrase}"
81
+
82
+ data: dict = r.json().get("entities", [])
83
+ return [{"name": entity["name"], "match": entity["match"][:1]} for entity in data if entity.get("type") == "geo"]
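
A sketch of how these NLP helpers feed the comma-separated code arguments expected by the grants and recommendation tools; the program description is made up, and the exact keys inside each taxonomy item depend on the autocoding API response.

from ask_candid.tools.nlp import autocode, geo_detect

program = "Mobile health clinics serving veterans in rural Montana"
taxonomy = autocode.invoke({"text": program})       # {"subject": [...], "population": [...]} with confidence scores
geographies = geo_detect.invoke({"text": program})  # [{"name": ..., "match": [best Geonames match]}, ...]

# Downstream tools such as grants_search and recommend_funders take these as
# comma-separated code strings (PCS codes and geonames_id values).
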
ask_candid/tools/recommendations.py ADDED
@@ -0,0 +1,287 @@
1
+ from typing import Literal, Any
2
+ import logging
3
+
4
+ from langchain_core.tools import tool
5
+ import httpx
6
+
7
+ from ask_candid.tools.utils import format_candid_profile_link
8
+ from ask_candid.base.utils import retry_on_status
9
+ from ask_candid.base.config.rest import FUNDER_RECOMMENDATION, SEARCH
10
+
11
+ logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
12
+ logger = logging.getLogger(__name__)
13
+ logger.setLevel(logging.ERROR)
14
+
15
+
16
+ @retry_on_status(num_retries=3)
17
+ def get_with_retries(url: str, payload: dict[str, Any] | None, headers: dict[str, str] | None) -> httpx.Response:
18
+ with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
19
+ return client.get(url=url, params=payload, headers=headers)
20
+
21
+
22
+ @tool
23
+ def organization_search(
24
+ query: str,
25
+ located_postal_code: str | None = None,
26
+ located_admin1: str | None = None,
27
+ search_mode: Literal["organization_only", "organization_and_grants"] | None = "organization_only"
28
+ ) -> list[dict[str, str | None]] | str:
29
+ """Search for organizations by name, description of work, program descriptions and locations. Here are some
30
+ guidelines:
31
+ * `query` controls hybrid searching involving both vector search and keyword search
32
+ * `query` can be used to find organizations based on a description of work
33
+ * if the query is intended to be a lookup of an organization by name, then adding quotes around the `query` string
34
+ circumvents vector search, and prioritizes keyword matching on names (eg. `query=Candid` --> `query='Candid'`)
35
+ * if the query is an EIN (eg. 12-3456789) then keyword searching is prioritized to get exact matches
36
+ * adding location information such as postal codes and/or admin1 (state/province abbreviations) will filter results
37
+
38
+ This tool should be used as a first step in any downstream task which requires identifying the nonprofit that the
39
+ user is identifying with. Often, the `nonprofit_id` is required, and that can be found via a search.
40
+
41
+ Parameters
42
+ ----------
43
+ query : str
44
+ Free text query which drives the search functionality. This uses a hybrid approach of vector and keyword
45
+ searching, but under certain conditions expressed in the 'guidelines' this may disable vector search.
46
+ located_postal_code : str | None, optional
47
+ Postal code of the organization to be searched, if provided, by default None
48
+ located_admin1 : str | None, optional
49
+ Admin1 code (state/province abbreviation) of the organization to be searched, if provided, by default None
50
+ search_mode : Literal["organization_only", "organization_and_grants"] | None, optional
51
+ Choose how to search for organizations, if `None` or "organization_and_grants" then this will examine evidence
52
+ at the organization level as well as at the historical grant transaction level capturing activity evidence. For
53
+ name lookups it is best to use the "organization_only" default value, by default "organization_only"
54
+
55
+ Returns
56
+ -------
57
+ list[dict[str, str]] | str
58
+ List of the top organization search results
59
+ If output is a string then that means there was some error, and retry should be considered
60
+ """
61
+
62
+ payload = {"query": query, "searchMode": search_mode, "rowCount": 5}
63
+ if located_postal_code is not None:
64
+ payload["postalCode"] = located_postal_code
65
+ if located_admin1 is not None:
66
+ payload["admin1"] = located_admin1
67
+
68
+ with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
69
+ r = client.get(
70
+ url=SEARCH.endpoint("v1/search"),
71
+ params=payload,
72
+ headers={**SEARCH.header} # type: ignore
73
+ )
74
+
75
+ if r.status_code != 200:
76
+ logger.error("Error calling organization search API %s. Error: %s", str(r.request.url), r.reason_phrase)
77
+ return f"Error calling organization search. Error: {r.reason_phrase}"
78
+
79
+ data: dict = r.json()
80
+
81
+ output = []
82
+ for org in data.get("returnedOrgs") or []:
83
+ working_on, serving = [], []
84
+ for code, description in org["taxonomy"].items():
85
+ code: str
86
+ description: str
87
+
88
+ if code.startswith('P') and len(code) > 2:
89
+ serving.append(description.lower())
90
+ elif code.startswith('S'):
91
+ working_on.append(description.lower())
92
+
93
+ output.append({
94
+ "nonprofit_id": org["candidEntityID"],
95
+ "name": org["orgName"],
96
+ "aka_name": org["akaName"],
97
+ "acronym": org["acronymName"],
98
+ "city": org["city"],
99
+ "admin1": org["admin1"],
100
+ "country": org["countryName"],
101
+ "EIN": org["ein"],
102
+ "profile_link": format_candid_profile_link(org['candidEntityID']),
103
+ "working_on": f"Working on {', '.join(working_on)}",
104
+ "serving": f"Serving population groups {', '.join(serving)}",
105
+ "transparency_level": org["seal"].get("description"),
106
+ "organization_roles": ', '.join(org["roles"]),
107
+ "grants_awarded": ', '.join([f"{k}: {v}" for k, v in org["transactionsGiven"].items()]),
108
+ "grants_received": ', '.join([f"{k}: {v}" for k, v in org["transactionsReceived"].items()])
109
+ })
110
+ return output
111
+
112
+
113
+ @tool
114
+ def recommend_funders(
115
+ nonprofit_id: int,
116
+ subject_codes_of_program: str | None = None,
117
+ populations_served_codes_of_program: str | None = None,
118
+ geonameids_of_geographies_served: str | None = None,
119
+ include_past_funders: bool = False
120
+ ) -> tuple[dict[str, Any], list[dict[str, Any]]] | str:
121
+ """Recommend potential funding organizations to a nonprofit seeking a grant.
122
+
123
+ These recommendations are built using machine learning over a heterogeneous knowledge graph representing the work of
124
+ the requesting organization, and the contextual recent activities of potential funders, and their grant recipients.
125
+
126
+ While extra subject codes, populations served codes, and geography IDs for where the program takes place is not
127
+ required, recommendations tend to improve and become more specific the more information can be provided.
128
+
129
+ Subjects and populations can be determined using the `autocode` tool if the requester can supply a description of
130
+ the program they are seeking funding for.
131
+
132
+ Geographies can be determined using the geo detection tool if the requester can supply a description of the program
133
+ they are seeking funding for.
134
+
135
+ Key Usage Requirements:
136
+ - Always incorporate returned profile URLs directly into the response text
137
+ - Replace funding organization name mentions with hyperlinked Candid profile URLs
138
+ - Prioritize creating a seamless user experience by making URLs contextually relevant
139
+ - Use relevant recipient data as well as inferred metadata to provide explanations about recommendation relevance
140
+
141
+ Parameters
142
+ ----------
143
+ nonprofit_id : int
144
+ The unique identifier of the requesting organization. This will need to be found from a search using inputs
145
+ elicited from the requester
146
+ subject_codes_of_program : str | None, optional
147
+ Subject codes from Candid's PCS taxonomy, comma separated, by default None
148
+ populations_served_codes_of_program : str | None, optional
149
+ Population groups served codes from Candid's PCS taxonomy, comma separated, by default None
150
+ geonameids_of_geographies_served : str | None, optional
151
+ Geonames ID values for geographies served by the requester's program, comma separated, by default None
152
+ include_past_funders : bool, optional
153
+ Boolean flag to indicate whether previous funders of the input organization identified by the `nonprofit_id`
154
+ should be excluded. If the requester would like to reconsider previous funding organizations then set this to
155
+ `True`, but the requester MUST be prompted to indicate this preference. Using the default value will help the
156
+ requester discover new, potentially relevant funders, by default False
157
+
158
+ Examples
159
+ --------
160
+ >>> recommend_funders(nonprofit_id=9981881)
161
+ >>> recommend_funders(
162
+ nonprofit_id=9173173,
163
+ subject_codes_of_program='SS050000, SS000000,SB050000',
164
+ populations_served_codes_of_program='PJ050100',
165
+ geonameids_of_geographies_served='4094212,4094212'
166
+ )
167
+
168
+ Returns
169
+ -------
170
+ tuple[dict[str, Any], list[dict[str, Any]]] | str
171
+ (Inferred data used to generate recommendations, array of funders being recommended)
172
+ If output is a string then that means there was some error, and retry should be considered
173
+ """
174
+
175
+ payload = {
176
+ "candid_entity_id": nonprofit_id,
177
+ "use_programs": True,
178
+ "top_k": 5,
179
+ "include_past_funders": include_past_funders
180
+ }
181
+
182
+ if subject_codes_of_program is not None:
183
+ payload["subjects"] = subject_codes_of_program
184
+ if populations_served_codes_of_program is not None:
185
+ payload["populations"] = populations_served_codes_of_program
186
+ if geonameids_of_geographies_served:
187
+ payload["geos"] = geonameids_of_geographies_served
188
+
189
+ r = get_with_retries(
190
+ url=FUNDER_RECOMMENDATION.endpoint("funder/pcs-v3"),
191
+ payload=payload,
192
+ headers={**FUNDER_RECOMMENDATION.header}
193
+ )
194
+ assert isinstance(r, httpx.Response)
195
+ if r.status_code != 200:
196
+ logger.error("Error calling funder recommendations API %s. Error: %s", str(r.request.url), r.reason_phrase)
197
+ return f"Error calling funder recommendations. Error: {r.reason_phrase}"
198
+
199
+ data: dict = r.json()
200
+ return (
201
+ data.get("meta") or {},
202
+ [{
203
+ **r,
204
+ "candid_profile_url": format_candid_profile_link(r['funder_id'])
205
+ } for r in (data.get("recommendations") or [])]
206
+ )
207
+
208
+
209
+ @tool
210
+ def recommend_funding_opportunities(
211
+ nonprofit_id: int,
212
+ subject_codes_of_program: str | None = None,
213
+ populations_served_codes_of_program: str | None = None,
214
+ geonameids_of_geographies_served: str | None = None
215
+ ) -> tuple[dict[str, Any], list[dict[str, Any]]] | str:
216
+ """Recommend active funding opportunities (RFPs) to a nonprofit seeking a grant.
217
+
218
+ These recommendations are built using machine learning over a heterogeneous knowledge graph representing the work of
219
+ the requesting organization, and the contextual recent activities of potential funders, and their grant recipients.
220
+
221
+ While extra subject codes, populations served codes, and geography IDs for where the program takes place is not
222
+ required, recommendations tend to improve and become more specific the more information can be provided.
223
+
224
+ Subjects and populations can be determined using the `autocode` tool if the requester can supply a description of
225
+ the program they are seeking funding for.
226
+
227
+ Key Usage Requirements:
228
+ - Always incorporate returned profile URLs directly into the response text
229
+ - Replace funding organization name mentions with hyperlinked Candid profile URLs
230
+ - Prioritize creating a seamless user experience by making URLs contextually relevant
231
+ - Use inferred metadata to provide explanations about recommendation relevance
232
+
233
+ Parameters
234
+ ----------
235
+ nonprofit_id : int
236
+ The unique identifier of the requesting organization. This will need to be found from a search using inputs
237
+ elicited from the requester
238
+ subject_codes_of_program : str | None, optional
239
+ Subject codes from Candid's PCS taxonomy, comma separated, by default None
240
+ populations_served_codes_of_program : str | None, optional
241
+ Population groups served codes from Candid's PCS taxonomy, comma separated, by default None
242
+ geonameids_of_geographies_served : str | None, optional
243
+ Geonames ID values for geographies served by the requester's program, comma separated, by default None
244
+
245
+ Examples
246
+ --------
247
+ >>> recommend_funding_opportunities(nonprofit_id=9981881)
248
+ >>> recommend_funding_opportunities(
249
+ nonprofit_id=9173173,
250
+ subject_codes_of_program='SS050000, SS000000,SB050000',
251
+ populations_served_codes_of_program='PJ050100',
252
+ geonameids_of_geographies_served='4094212,4094212'
253
+ )
254
+
255
+ Returns
256
+ -------
257
+ tuple[dict[str, Any], list[dict[str, Any]]] | str
258
+ (Inferred data used to generate recommendations, array of active funding opportunities being recommended)
259
+ If output is a string then that means there was some error, and retry should be considered
260
+ """
261
+
262
+ payload = {"candid_entity_id": nonprofit_id, "use_programs": True, "top_k": 5}
263
+ if subject_codes_of_program is not None:
264
+ payload["subjects"] = subject_codes_of_program
265
+ if populations_served_codes_of_program is not None:
266
+ payload["populations"] = populations_served_codes_of_program
267
+ if geonameids_of_geographies_served:
268
+ payload["geos"] = geonameids_of_geographies_served
269
+
270
+ r = get_with_retries(
271
+ url=FUNDER_RECOMMENDATION.endpoint("rfp/pcs-v3"),
272
+ payload=payload,
273
+ headers={**FUNDER_RECOMMENDATION.header}
274
+ )
275
+ assert isinstance(r, httpx.Response)
276
+ if r.status_code != 200:
277
+ logger.error("Error calling RFP recommendation API %s. Error: %s", str(r.request.url), r.reason_phrase)
278
+ return f"Error calling RFP recommendations. Error: {r.reason_phrase}"
279
+
280
+ data: dict = r.json()
281
+ return (
282
+ data.get("meta") or {},
283
+ [{
284
+ **r,
285
+ "candid_profile_url": format_candid_profile_link(r['funder_id'])
286
+ } for r in (data.get("recommendations") or [])]
287
+ )
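
A sketch of the intended chain: resolve the nonprofit first, then ask for funder recommendations; the query text is illustrative and a string return from either tool signals an error.

from ask_candid.tools.recommendations import organization_search, recommend_funders

orgs = organization_search.invoke({"query": "'Candid'", "search_mode": "organization_only"})
if not isinstance(orgs, str) and orgs:
    result = recommend_funders.invoke({"nonprofit_id": orgs[0]["nonprofit_id"]})
    if not isinstance(result, str):
        meta, recommendations = result
        for rec in recommendations:
            print(rec["candid_profile_url"])
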
ask_candid/tools/search.py CHANGED
@@ -1,7 +1,7 @@
1
  from langchain_core.documents import Document
2
  from langchain_core.tools import tool
3
 
4
- from ask_candid.base.retrieval.knowledge_base import (
5
  SourceNames,
6
  generate_queries,
7
  run_search,
 
1
  from langchain_core.documents import Document
2
  from langchain_core.tools import tool
3
 
4
+ from ask_candid.services.knowledge_base import (
5
  SourceNames,
6
  generate_queries,
7
  run_search,
ask_candid/utils.py CHANGED
@@ -1,47 +1,15 @@
1
- from typing import List, Dict, Union, Any
2
  from uuid import uuid4
3
 
4
  from langchain_core.documents import Document
5
 
6
- from ask_candid.retrieval.sources import (
7
- candid_blog,
8
- candid_help,
9
- candid_learning,
10
- issuelab,
11
- youtube
12
- )
13
 
14
-
15
- def html_format_doc(doc: Dict[str, Any], source: str, show_chunks=False) -> str:
16
- height_px = 200
17
- html = ""
18
-
19
- if source == "news":
20
- # html = news.article_card_html(doc, height_px, show_chunks)
21
- pass
22
- elif source == "transactions":
23
- pass
24
- elif source == "organizations":
25
- pass
26
- elif source == "issuelab":
27
- html = issuelab.issuelab_card_html(doc, height_px, show_chunks)
28
- elif source == "youtube":
29
- html = youtube.build_card_html(doc, 400, show_chunks)
30
- elif source == "candid_blog":
31
- html = candid_blog.build_card_html(doc, height_px, show_chunks)
32
- elif source == "candid_learning":
33
- html = candid_learning.build_card_html(doc, height_px, show_chunks)
34
- elif source == "candid_help":
35
- html = candid_help.build_card_html(doc, height_px, show_chunks)
36
- return html
37
-
38
-
39
- def html_format_docs_chat(docs: List[Document]) -> str:
40
  """Formats Candid sources
41
 
42
  Parameters
43
  ----------
44
- docs : List[Document]
45
  Retrieved documents for context
46
 
47
  Returns
@@ -69,7 +37,7 @@ def html_format_docs_chat(docs: List[Document]) -> str:
69
  return html
70
 
71
 
72
- def format_chat_ag_response(chatbot: List[Any]) -> List[Any]:
73
  """If we called retriever, we appended sources as as one more message. Here we concatinate HTML of sources
74
  with the AI response
75
  Returns:
@@ -89,7 +57,7 @@ def valid_inputs(*args) -> bool:
89
  return any(a is not None or (isinstance(a, str) and a.strip() != '') for a in args)
90
 
91
 
92
- def get_session_id(thread_id: Union[str, None]) -> str:
93
  if not thread_id:
94
  thread_id = uuid4().hex
95
  return thread_id
 
1
+ from typing import Any
2
  from uuid import uuid4
3
 
4
  from langchain_core.documents import Document
5
 
6
 
7
+ def html_format_docs_chat(docs: list[Document]) -> str:
8
  """Formats Candid sources
9
 
10
  Parameters
11
  ----------
12
+ docs : list[Document]
13
  Retrieved documents for context
14
 
15
  Returns
 
37
  return html
38
 
39
 
40
+ def format_chat_ag_response(chatbot: list[Any]) -> list[Any]:
41
  """If we called retriever, we appended sources as as one more message. Here we concatinate HTML of sources
42
  with the AI response
43
  Returns:
 
57
  return any(a is not None or (isinstance(a, str) and a.strip() != '') for a in args)
58
 
59
 
60
+ def get_session_id(thread_id: str | None) -> str:
61
  if not thread_id:
62
  thread_id = uuid4().hex
63
  return thread_id
requirements.txt CHANGED
@@ -9,7 +9,7 @@ langgraph-prebuilt==0.6.4
9
  pydantic==2.10.6
10
  pyopenssl>22.0.0
11
  python-dotenv
12
- transformers
13
 
14
  --find-links https://download.pytorch.org/whl/cpu
15
  torch
 
9
  pydantic==2.10.6
10
  pyopenssl>22.0.0
11
  python-dotenv
12
+ transformers>=4.56.1
13
 
14
  --find-links https://download.pytorch.org/whl/cpu
15
  torch