Update issuelab query technique

* quasi-sparse vector approach for issuelab (sketched below the file list)
* summarize issuelab articles prior to LLM handoff
* pin transformers version to avoid major bug
- ask_candid/base/retrieval/elastic.py +76 -8
- ask_candid/base/retrieval/sources.py +9 -2
- ask_candid/services/knowledge_base.py +420 -0
- ask_candid/services/small_lm.py +8 -2
- ask_candid/tools/grants.py +113 -0
- ask_candid/tools/letter_gen.py +230 -0
- ask_candid/tools/nlp.py +83 -0
- ask_candid/tools/recommendations.py +287 -0
- ask_candid/tools/search.py +1 -1
- ask_candid/utils.py +5 -37
- requirements.txt +1 -1
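The "quasi-sparse vector" approach in the first bullet simulates an ELSER-style sparse vector search on an index that has no vector fields: a SPLADE-style encoder expands the query into weighted tokens, and each token above a cutoff becomes a boosted keyword clause. A minimal self-contained sketch of the idea (the token weights and threshold are illustrative, not the project's actual values):

from typing import Any

# Illustrative token weights, shaped like a SPLADE encoder's query expansion.
token_weights = {"homeless": 2.1, "shelter": 1.8, "housing": 0.9, "the": 0.01}
SCORE_THRESHOLD = 0.5  # assumed cutoff, standing in for SPARSE_ENCODING_SCORE_THRESHOLD


def quasi_sparse_query(tokens: dict[str, float], fields: tuple[str, ...]) -> dict[str, Any]:
    """Turn token weights into boosted multi_match clauses for a non-vector index."""
    should = [
        {"multi_match": {"query": tok, "fields": list(fields), "boost": score}}
        for tok, score in tokens.items()
        if score > SCORE_THRESHOLD
    ]
    return {"query": {"bool": {"should": should}}}


print(quasi_sparse_query(token_weights, ("title", "description", "content^0.3")))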
ask_candid/base/retrieval/elastic.py  CHANGED

@@ -21,14 +21,14 @@ def build_sparse_vector_query(
     ----------
     query : str
         Search context string
-    fields :
+    fields : tuple[str, ...]
         Semantic text field names
     inference_id : str, optional
         ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"
 
     Returns
     -------
-
+    dict[str, Any]
     """
 
     output = []
@@ -70,20 +70,20 @@ def build_sparse_vector_and_text_query(
     ----------
     query : str
         Search context string
-    semantic_fields :
+    semantic_fields : tuple[str]
         Semantic text field names
-    highlight_fields:
+    highlight_fields: tuple[str]
         Fields which relevant chunks will be helpful for the agent to read
-    text_fields :
+    text_fields : tuple[str]
         Regular text fields
-    excluded_fields :
+    excluded_fields : tuple[str]
         Fields to exclude from the source
     inference_id : str, optional
         ID of model deployed in Elasticsearch, by default ".elser-2-elasticsearch"
 
     Returns
     -------
-
+    dict[str, Any]
     """
 
     output = []
@@ -149,7 +149,7 @@ def news_query_builder(
 
     Returns
     -------
-
+    dict[str, Any]
     """
 
     tokens = encoder.token_expand(query)
@@ -180,11 +180,79 @@ def news_query_builder(
     return elastic_query
 
 
+def issuelab_query_builder(
+    query: str,
+    fields: tuple[str, ...],
+    highlight_fields: tuple[str, ...] | None,
+    encoder: SpladeEncoder,
+) -> dict[str, Any]:
+    """Builds a valid Elasticsearch query against IssueLab, simulating a token expansion.
+
+    Parameters
+    ----------
+    query : str
+        Search context string
+
+    Returns
+    -------
+    dict[str, Any]
+    """
+
+    tokens = encoder.token_expand(query)
+
+    elastic_query = {
+        "_source": ["issuelab_id", "issuelab_url", "title", "description", "content"],
+        "query": {
+            "bool": {
+                # "filter": [
+                #     # {"range": {"event_date": {"gte": f"now-{days_ago}d/d"}}},
+                #     # {"range": {"insert_date": {"gte": f"now-{days_ago}d/d"}}},
+                #     # {"range": {"article_trust_worthiness": {"gt": NEWS_TRUST_SCORE_THRESHOLD}}}
+                # ],
+                "should": []
+            }
+        },
+        "highlight": {
+            "fields": dict.fromkeys(highlight_fields or ("content", "description"), {})
+        }
+    }
+
+    for token, score in tokens.items():
+        if score > SPARSE_ENCODING_SCORE_THRESHOLD:
+            elastic_query["query"]["bool"]["should"].append({
+                "multi_match": {
+                    "query": token,
+                    "fields": fields,
+                    "boost": score
+                }
+            })
+    return elastic_query
+
+
 def multi_search_base(
     queries: list[dict[str, Any]],
     credentials: BaseElasticSearchConnection | BaseElasticAPIKeyCredential,
     timeout: int = 180
 ) -> Iterator[dict[str, Any]]:
+    """Handles multi-search queries on a single cluster given the relevant credentials object.
+
+    Parameters
+    ----------
+    queries : list[dict[str, Any]]
+        `msearch` query object (see: https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-msearch)
+    credentials : BaseElasticSearchConnection | BaseElasticAPIKeyCredential
+    timeout : int, optional, by default 180
+
+    Yields
+    ------
+    Iterator[dict[str, Any]]
+
+    Raises
+    ------
+    TypeError
+        Raised if invalid credentials are passed
+    """
+
     if isinstance(credentials, BaseElasticAPIKeyCredential):
         es = Elasticsearch(
             cloud_id=credentials.cloud_id,
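A sketch of how the new builder might be exercised against the cluster; per the `msearch` convention the query list alternates a header object with each query body. The index name, encoder, and credentials object are taken from elsewhere in this commit, so treat the wiring as an assumption:

from ask_candid.base.retrieval.elastic import issuelab_query_builder, multi_search_base
from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
from ask_candid.base.config.connections import NEWS_ELASTIC

q = issuelab_query_builder(
    query="youth homelessness interventions",
    fields=("title", "description", "content^0.3"),
    highlight_fields=("description", "content"),
    encoder=SpladeEncoder(),
)
q["size"] = 1
# msearch bodies alternate {"index": ...} headers with query objects
for response in multi_search_base(queries=[{"index": "issuelab_prod_data"}, q], credentials=NEWS_ELASTIC):
    print(response.get("hits", {}).get("total"))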
ask_candid/base/retrieval/sources.py  CHANGED

@@ -25,9 +25,16 @@ CandidNewsConfig = ElasticSourceConfig(
 )
 
 
+# IssueLabConfig = ElasticSourceConfig(
+#     index_name="search-semantic-issuelab-elser_ve2",
+#     semantic_fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
+# )
+
 IssueLabConfig = ElasticSourceConfig(
-    index_name="search-semantic-issuelab-elser_ve2",
-    semantic_fields=("description", "content", "combined_issuelab_findings", "combined_item_description")
+    index_name="issuelab_prod_data",
+    # semantic_fields=("title", "description", "content"),
+    semantic_fields=("title", "description", "content^0.3"),
+    highlight_fields=("description", "content")
 )
 
 
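The `^0.3` suffix is Elasticsearch's standard per-field boost syntax for `multi_match` field lists: matches on the long `content` body contribute at roughly 0.3x the weight of `title` and `description` matches, so body text cannot drown out strong header matches. Illustration only:

# "content^0.3" down-weights body matches relative to title/description
fields = ("title", "description", "content^0.3")
clause = {"multi_match": {"query": "homeless", "fields": list(fields)}}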
ask_candid/services/knowledge_base.py  ADDED (new file, 420 lines)

from typing import Literal, Any
from collections.abc import Iterator, Iterable
from itertools import groupby
import logging

from langchain_core.documents import Document

from ask_candid.base.retrieval.elastic import (
    build_sparse_vector_query,
    build_sparse_vector_and_text_query,
    news_query_builder,
    issuelab_query_builder,
    multi_search_base
)
from ask_candid.base.retrieval.sparse_lexical import SpladeEncoder
from ask_candid.base.retrieval.schemas import ElasticHitsResult
import ask_candid.base.retrieval.sources as S
from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA, NEWS_ELASTIC
from ask_candid.services.small_lm import CandidSmallLanguageModel

SourceNames = Literal[
    "Candid Blog",
    "Candid Help",
    "Candid Learning",
    "Candid News",
    "IssueLab Research Reports",
    "YouTube Training"
]
sparse_encoder = SpladeEncoder()
logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


# TODO remove
def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
    """Pads the relevant chunk of text with context before and after

    Parameters
    ----------
    field_name : str
        a field with the long text that was chunked into pieces
    hit : ElasticHitsResult
    context_length : int, optional
        length of text to add before and after the chunk, by default 1024
    add_context : bool, optional
        Set to `True` to expand the text context by searching for the Elastic inner hit inside the larger
        document, by default True

    Returns
    -------
    str
        longer chunks stuffed together
    """

    chunks = []
    # NOTE chunks have tokens, long text is a string, but may contain html which affects tokenization
    long_text = hit.source.get(field_name) or ""
    long_text = long_text.lower()

    inner_hits_field = f"embeddings.{field_name}.chunks"
    found_chunks = hit.inner_hits.get(inner_hits_field, {}) if hit.inner_hits else None
    if found_chunks:
        for h in found_chunks.get("hits", {}).get("hits") or []:
            chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]

            # cutting the middle because we may have tokenizing artifacts there
            chunk = chunk[3:-3]

            if add_context:
                # Find the start and end indices of the chunk in the large text
                start_index = long_text.find(chunk[:20])

                # Chunk is found
                if start_index != -1:
                    end_index = start_index + len(chunk)
                    pre_start_index = max(0, start_index - context_length)
                    post_end_index = min(len(long_text), end_index + context_length)
                    chunks.append(long_text[pre_start_index:post_end_index])
            else:
                chunks.append(chunk)
    return '\n\n'.join(chunks)


def generate_queries(
    query: str,
    sources: list[SourceNames],
    news_days_ago: int = 60
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Builds Elastic queries against indices which do or do not support sparse vector queries.

    Parameters
    ----------
    query : str
        Text describing a user's question or a description of investigative work which requires support from Candid's
        knowledge base
    sources : list[SourceNames]
        One or more sources of knowledge from different areas at Candid.
        * Candid Blog: Blog posts from Candid staff and trusted partners intended to help those in the sector or
          illuminate ongoing work
        * Candid Help: Candid FAQs to help users get started with Candid's product platform and learning resources
        * Candid Learning: Training documents from Candid's subject matter experts
        * Candid News: News articles and press releases about real-time activity in the philanthropic sector
        * IssueLab Research Reports: Academic research reports about the social/philanthropic sector
        * YouTube Training: Transcripts from video-based training seminars from Candid's subject matter experts
    news_days_ago : int, optional
        How many days in the past to search for news articles; if a user is asking for recent trends then this value
        should be set lower, >~ 10, by default 60

    Returns
    -------
    tuple[list[dict[str, Any]], list[dict[str, Any]]]
        (sparse vector queries, queries for indices which do not support sparse vectors)
    """

    vector_queries = []
    quasi_vector_queries = []

    for source_name in sources:
        if source_name == "Candid Blog":
            q = build_sparse_vector_query(query=query, fields=S.CandidBlogConfig.semantic_fields)
            q["_source"] = {"excludes": ["embeddings"]}
            q["size"] = 5
            vector_queries.extend([{"index": S.CandidBlogConfig.index_name}, q])
        elif source_name == "Candid Help":
            q = build_sparse_vector_query(query=query, fields=S.CandidHelpConfig.semantic_fields)
            q["_source"] = {"excludes": ["embeddings"]}
            q["size"] = 5
            vector_queries.extend([{"index": S.CandidHelpConfig.index_name}, q])
        elif source_name == "Candid Learning":
            q = build_sparse_vector_query(query=query, fields=S.CandidLearningConfig.semantic_fields)
            q["_source"] = {"excludes": ["embeddings"]}
            q["size"] = 5
            vector_queries.extend([{"index": S.CandidLearningConfig.index_name}, q])
        elif source_name == "Candid News":
            q = news_query_builder(
                query=query,
                fields=S.CandidNewsConfig.semantic_fields,
                encoder=sparse_encoder,
                days_ago=news_days_ago
            )
            q["size"] = 5
            quasi_vector_queries.extend([{"index": S.CandidNewsConfig.index_name}, q])
        elif source_name == "IssueLab Research Reports":
            # q = build_sparse_vector_query(query=query, fields=S.IssueLabConfig.semantic_fields)
            # q["_source"] = {"excludes": ["embeddings"]}
            # q["size"] = 1
            # vector_queries.extend([{"index": S.IssueLabConfig.index_name}, q])

            q = issuelab_query_builder(
                query=query,
                fields=S.IssueLabConfig.semantic_fields,
                highlight_fields=S.IssueLabConfig.highlight_fields,
                encoder=sparse_encoder,
            )
            q["size"] = 1
            quasi_vector_queries.extend([{"index": S.IssueLabConfig.index_name}, q])
        elif source_name == "YouTube Training":
            q = build_sparse_vector_and_text_query(
                query=query,
                semantic_fields=S.YoutubeConfig.semantic_fields,
                text_fields=S.YoutubeConfig.text_fields,
                highlight_fields=S.YoutubeConfig.highlight_fields,
                excluded_fields=S.YoutubeConfig.excluded_fields
            )
            q["size"] = 5
            vector_queries.extend([{"index": S.YoutubeConfig.index_name}, q])

    return vector_queries, quasi_vector_queries


def run_search(
    vector_searches: list[dict[str, Any]] | None = None,
    non_vector_searches: list[dict[str, Any]] | None = None,
) -> list[ElasticHitsResult]:
    """Elastic query runner which executes both sparse vector and quasi-sparse vector queries and concatenates
    results. This does not include re-ranking.

    Parameters
    ----------
    vector_searches : list[dict[str, Any]] | None, optional
        Sparse vector multi-search queries, by default None
    non_vector_searches : list[dict[str, Any]] | None, optional
        Keyword-based multi-search queries, by default None

    Returns
    -------
    list[ElasticHitsResult]
        Concatenated results
    """

    def _msearch_response_generator(responses: Iterable[dict[str, Any]]) -> Iterator[ElasticHitsResult]:
        for query_group in responses:
            for h in query_group.get("hits", {}).get("hits", []):
                inner_hits = h.get("inner_hits", {})

                if not inner_hits and "news" in h.get("_index"):
                    inner_hits = {"text": h.get("_source", {}).get("content")}
                if not inner_hits and "issuelab" in h.get("_index"):
                    inner_hits = {"text": h.get("_source", {}).get("content")}

                yield ElasticHitsResult(
                    index=h["_index"],
                    id=h["_id"],
                    score=h["_score"],
                    source=h["_source"],
                    inner_hits=inner_hits,
                    highlight=h.get("highlight", {})
                )

    results = []
    if vector_searches is not None and len(vector_searches) > 0:
        hits = multi_search_base(queries=vector_searches, credentials=SEMANTIC_ELASTIC_QA)
        for hit in _msearch_response_generator(responses=hits):
            results.append(hit)
    if non_vector_searches is not None and len(non_vector_searches) > 0:
        hits = multi_search_base(queries=non_vector_searches, credentials=NEWS_ELASTIC)
        for hit in _msearch_response_generator(responses=hits):
            results.append(hit)
    return results


def retrieved_text(hits: dict[str, Any]) -> str:
    """Extracts retrieved sub-texts from documents which are strong hits from semantic queries for the purpose of
    re-scoring by a secondary language model.

    Parameters
    ----------
    hits : dict[str, Any]

    Returns
    -------
    str
    """

    nlp = CandidSmallLanguageModel()

    text = []
    for key, v in hits.items():
        if key == "text":
            s = nlp.summarize(v, top_k=3)
            text.append(s.summary)
            # text.append(v)
            continue

        for h in (v.get("hits", {}).get("hits") or []):
            for field in h.get("fields", {}).values():
                for chunk in field:
                    if chunk.get("chunk"):
                        text.extend(chunk["chunk"])
    return '\n'.join(text)


def reranker(
    query_results: Iterable[ElasticHitsResult],
    search_text: str | None = None,
    max_num_results: int = 5
) -> Iterator[ElasticHitsResult]:
    """Reranks Elasticsearch hits coming from multiple indices/queries which may have scores on different scales.
    This will shuffle results.

    Parameters
    ----------
    query_results : Iterable[ElasticHitsResult]

    Yields
    ------
    Iterator[ElasticHitsResult]
    """

    results: list[ElasticHitsResult] = []
    texts: list[str] = []
    for _, data in groupby(query_results, key=lambda x: x.index):
        data = list(data)  # noqa: PLW2901
        max_score = max(data, key=lambda x: x.score).score
        min_score = min(data, key=lambda x: x.score).score

        for d in data:
            d.score = (d.score - min_score) / (max_score - min_score + 1e-9)
            results.append(d)

            if search_text:
                text = ""  # a hit may have neither inner hits nor highlights
                if d.inner_hits:
                    text = retrieved_text(d.inner_hits)
                if d.highlight:
                    highlight_texts = []
                    for v in d.highlight.values():
                        highlight_texts.append('\n'.join(v))
                    text = '\n'.join(highlight_texts)
                texts.append(text)

    if search_text and len(texts) == len(results) and len(texts) > 1:
        logger.info("Re-ranking %d retrieval results", len(results))
        scores = sparse_encoder.query_reranking(query=search_text, documents=texts)
        for r, s in zip(results, scores):
            r.score = s

    yield from sorted(results, key=lambda x: x.score, reverse=True)[:max_num_results]


def process_hit(hit: ElasticHitsResult) -> Document:
    """Process a raw Elasticsearch document into a structured langchain `Document` object.

    Parameters
    ----------
    hit : ElasticHitsResult

    Returns
    -------
    Document

    Raises
    ------
    ValueError
        Raised if a result from an unknown index is passed in
    """

    nlp = CandidSmallLanguageModel()

    if "issuelab-elser" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("combined_item_description", ""),
                hit.source.get("description", ""),
                hit.source.get("combined_issuelab_findings", ""),
                get_context("content", hit, context_length=12)
            ]),
            metadata={
                "title": hit.source["title"],
                "source": "IssueLab",
                "source_id": hit.source["resource_id"],
                "url": hit.source.get("permalink", "")
            }
        )
    elif "issuelab" in hit.index:
        content_summary = ""
        if hit.source.get("content", ""):
            content_summary = nlp.summarize(hit.source.get("content", ""), top_k=20).summary

        doc = Document(
            page_content='\n\n'.join([hit.source.get("description", ""), content_summary]),
            metadata={
                "title": hit.source["title"],
                "source": "IssueLab",
                "source_id": hit.source["issuelab_id"],
                "url": hit.source.get("issuelab_url", "")
            }
        )
    elif "youtube" in hit.index:
        highlight = hit.highlight or {}
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("title", ""),
                hit.source.get("semantic_description", ""),
                ' '.join(highlight.get("semantic_cc_text", []))
            ]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": "Candid YouTube",
                "source_id": hit.source['video_id'],
                "url": f"https://www.youtube.com/watch?v={hit.source['video_id']}"
            }
        )
    elif "candid-blog" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("title", ""),
                hit.source.get("excerpt", ""),
                get_context("content", hit, context_length=12, add_context=False),
                get_context("authors_text", hit, context_length=12, add_context=False),
                hit.source.get("title_summary_tags", "")
            ]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": "Candid Blog",
                "source_id": hit.source["id"],
                "url": hit.source["link"]
            }
        )
    elif "candid-learning" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("title", ""),
                hit.source.get("staff_recommendations", ""),
                hit.source.get("training_topics", ""),
                get_context("content", hit, context_length=12)
            ]),
            metadata={
                "title": hit.source["title"],
                "source": "Candid Learning",
                "source_id": hit.source["post_id"],
                "url": hit.source.get("url", "")
            }
        )
    elif "candid-help" in hit.index:
        doc = Document(
            page_content='\n\n'.join([
                hit.source.get("combined_article_description", ""),
                get_context("content", hit, context_length=12)
            ]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": "Candid Help",
                "source_id": hit.source["id"],
                "url": hit.source.get("link", "")
            }
        )
    elif "news" in hit.index:
        doc = Document(
            page_content='\n\n'.join([hit.source.get("title", ""), hit.source.get("content", "")]),
            metadata={
                "title": hit.source.get("title", ""),
                "source": hit.source.get("site_name") or "Candid News",
                "source_id": hit.source["id"],
                "url": hit.source.get("link", "")
            }
        )
    else:
        raise ValueError(f"Unknown source result from index {hit.index}")
    return doc
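A sketch of how the pieces added in this file compose end to end; the query text and source choices are illustrative:

from ask_candid.services.knowledge_base import generate_queries, run_search, reranker, process_hit

user_query = "trends in rural library funding"
vector_qs, quasi_qs = generate_queries(
    query=user_query,
    sources=["Candid News", "IssueLab Research Reports"]
)
hits = run_search(vector_searches=vector_qs, non_vector_searches=quasi_qs)
for hit in reranker(hits, search_text=user_query, max_num_results=5):
    doc = process_hit(hit)
    print(doc.metadata["source"], "-", doc.metadata["title"])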
ask_candid/services/small_lm.py  CHANGED

@@ -8,12 +8,16 @@ from ask_candid.base.lambda_base import LambdaInvokeBase
 
 @dataclass(slots=True)
 class Encoding:
+    """Text encoding vector response
+    """
     inputs: list[str]
     vectors: torch.Tensor
 
 
 @dataclass(slots=True)
 class SummaryItem:
+    """A single summary object
+    """
     rank: int
     score: float
     text: str
@@ -21,6 +25,8 @@ class SummaryItem:
 
 @dataclass(slots=True)
 class TextSummary:
+    """Text summarization response
+    """
     snippets: list[SummaryItem]
 
     @property
@@ -28,7 +34,7 @@ class TextSummary:
         return ' '.join([_.text for _ in self.snippets])
 
 
-class CandidSLM(LambdaInvokeBase):
+class CandidSmallLanguageModel(LambdaInvokeBase):
     """Wrapper around Candid's custom small language model.
     For more details see https://dev.azure.com/guidestar/DataScience/_git/graph-ai?path=/releases/language.
     This services includes:
@@ -44,7 +50,7 @@ class CandidSLM(LambdaInvokeBase):
         AWS secret key, by default None
     """
 
-    class Tasks(Enum): #
+    class Tasks(Enum):  # noqa: D106
         ENCODE = "/encode"
         DOCUMENT_SUMMARIZE = "/document/summarize"
         DOCUMENT_NER_SALIENCE = "/document/entitySalience"
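With the rename, call sites read as below; `summarize` returns a `TextSummary` whose `summary` property joins the top-k snippets (the input string and `top_k` value are illustrative, mirroring usage in knowledge_base.py):

from ask_candid.services.small_lm import CandidSmallLanguageModel

nlp = CandidSmallLanguageModel()
result = nlp.summarize("...long IssueLab article content...", top_k=3)
print(result.summary)  # space-joined text of the selected snippets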
ask_candid/tools/grants.py  ADDED (new file, 113 lines)

import logging

from langchain_core.tools import tool
import httpx

from ask_candid.tools.utils import format_candid_profile_link
from ask_candid.base.config.rest import SEARCH

logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)


@tool
def grants_search(
    query: str,
    subject_codes: str | None = None,
    populations_served_codes: str | None = None,
    geonameids_of_geographies_served: str | None = None
) -> list[dict[str, str | int | float | None]] | str:
    """Search for historical grants to find context about what is happening in the sector, and what organizations are
    involved with it. This is intended for historical research purposes and contextualization. If trying to recommend
    funders then please use the dedicated funder recommendation tool instead of this. Funder recommendations use
    grants and additional contexts, as well as a carefully trained graph neural network, to provide targeted
    recommendations.

    Another important note is that this tool only returns up to 25 top relevant grant results and should never be used
    to make broad generalizations.

    Queries are natural text, and the retrieval mechanism is a hybrid approach of keyword and sparse vector searches
    over fields which describe the activity and purpose of the grant.

    While extra subject codes, populations served codes, and geography IDs for where the grant is serving are not
    required, grants may become more specific the more information can be provided.

    Subjects and populations can be determined using the `autocode` tool if the requester can supply a description of
    the program they are seeking funding for.

    Geographies can be determined using the geo detection tool if the requester can supply a description of the
    program they are seeking funding for.

    Parameters
    ----------
    query : str
        Text describing a user's question or a description of investigative work which requires support from Candid's
        grants knowledge base
    subject_codes : str | None, optional
        Subject codes from Candid's PCS taxonomy, comma separated, by default None
    populations_served_codes : str | None, optional
        Population groups served codes from Candid's PCS taxonomy, comma separated, by default None
    geonameids_of_geographies_served : str | None, optional
        Geonames ID values for geographies served by the requester's program, comma separated, by default None

    Examples
    --------
    >>> grants_search(query='homeless shelters in new york')
    >>> grants_search(
        query='homeless shelters in new york',
        subject_codes='SS050000,SS000000,SB050000',
        populations_served_codes='PJ050100',
        geonameids_of_geographies_served='4094212,4094212'
    )

    Returns
    -------
    list[dict[str, str | int | float | None]] | str
        Array of relevant grants and information about the organizations involved
        If output is a string then that means there was some error, and retry should be considered
    """

    payload = {"query": query, "rowCount": 25}
    if subject_codes is not None:
        payload["SubjectArea"] = subject_codes.split(',')
    if populations_served_codes is not None:
        payload["PopulationServed"] = populations_served_codes.split(',')
    if geonameids_of_geographies_served:
        payload["GeographicArea"] = geonameids_of_geographies_served.split(',')

    with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
        r = client.get(
            url=SEARCH.endpoint("v1/grants/discovery"),
            params=payload,
            headers={**SEARCH.header}  # type: ignore
        )

    if r.status_code != 200:
        logger.error("Error calling grants search API %s. Error: %s", str(r.request.url), r.reason_phrase)
        return f"Error calling grants search. Error: {r.reason_phrase}"

    data: dict = r.json()

    output = []
    for grant in data.get("grants") or []:
        working_on, serving = [], []
        for facet, facet_data in grant["pcsV3"].items():
            if facet == "subject":
                working_on.extend([code["name"].lower() for code in facet_data["value"]])
            elif facet == "population":
                serving.extend([code["name"].lower() for code in facet_data["value"]])

        output.append({
            "funder_id": grant["grantmakerId"],
            "funder_profile_link": format_candid_profile_link(grant["grantmakerId"]),
            "funder_name": grant["grantmakerName"],
            "recipient_id": grant["recipientId"],
            "recipient_profile_link": format_candid_profile_link(grant["recipientId"]),
            "recipient_name": grant["recipientName"],
            "fiscal_year": grant["fiscalYear"],
            "amount_usd": grant["amountUsd"],
            "description": grant["text"],
            "working_on": f"Working on {', '.join(working_on)}",
            "serving": f"Serving population groups {', '.join(serving)}",
        })
    return output
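Since this is a langchain `@tool`, it can be exercised directly through the generic `.invoke` entry point; the PCS codes below are copied from the docstring example rather than freshly derived:

from ask_candid.tools.grants import grants_search

results = grants_search.invoke({
    "query": "homeless shelters in new york",
    "subject_codes": "SS050000,SS000000,SB050000",
    "populations_served_codes": "PJ050100",
})
if isinstance(results, str):
    print("retry:", results)  # a string return signals an API error
else:
    for grant in results[:3]:
        print(grant["fiscal_year"], grant["funder_name"], "->", grant["recipient_name"])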
ask_candid/tools/letter_gen.py  ADDED (new file, 230 lines)

from dataclasses import dataclass, field

from langchain_core.tools import tool
import httpx

from ask_candid.base.config.rest import GOLDEN_ORG, LOI_WRITER


@dataclass
class LetterOfInterest:
    opening: str = field(default="")
    org_desc: str = field(default="")
    need: str = field(default="")
    project: str = field(default="")
    request: str = field(default="")
    conclusion: str = field(default="")

    @property
    def letter(self):
        return f"""{self.opening}

{self.org_desc}

{self.need}

{self.project}

{self.request}

{self.conclusion}
"""


@tool
def estimate_budget(
    nonprofit_id: int,
    funder_id: int,
    project_description: str,
    # ctx: Context
) -> str:
    """This is an optional tool for estimating project budgets. Some users will already know what their budget is, or
    know how much money they are seeking from a grant, in which case this tool should not be used.

    This tool also provides guidance on setting a budget, and ultimately the user needs to decide based on the output
    from this tool.

    Parameters
    ----------
    nonprofit_id : int
        The unique identifier of the requesting organization. This will need to be found from a search using inputs
        elicited from the requester
    funder_id : int
        The unique identifier of the funding organization which may be awarding a grant to the requester.
        This will need to be found from a search using inputs elicited from the requester, or from recommendations
    project_description : str
        Natural language text describing the project/program that the user is requesting funding for

    Returns
    -------
    str
        Budget guidance, including context on the funder's ability to provide the budget in question
    """

    recip_data = httpx.get(
        url=GOLDEN_ORG.endpoint("v1/organization"),
        params={"id": nonprofit_id},
        headers={**GOLDEN_ORG.header},  # type: ignore
        timeout=30
    ).json().get("document_data", {}).get("preferred_data", {}).get("data", {})
    funder_data = httpx.get(
        url=GOLDEN_ORG.endpoint("v1/organization"),
        params={"id": funder_id},
        headers={**GOLDEN_ORG.header},  # type: ignore
        timeout=30
    ).json().get("document_data", {}).get("preferred_data", {}).get("data", {})
    return httpx.post(
        url=LOI_WRITER.endpoint("budget"),
        json={
            "recipient_candid_entity_id": nonprofit_id,
            "program_description": project_description,
            "recipient_data": recip_data,
            "funder_data": funder_data
        },
        headers={**LOI_WRITER.header},  # type: ignore
        timeout=30
    ).json().get("response", "No budget could be estimated")


@tool
def draft_loi(
    nonprofit_id: int,
    funder_id: int,
    project_description: str,
    budget: int,
) -> str:
    """Generate a letter of interest/intent from a chain-of-thought prompt chain using Candid's golden data and any
    inputs specified by the user, and/or recommended content.

    The output of this tool is the actual letter draft, please do not make changes to it other than adding headers
    and/or footers.

    Parameters
    ----------
    nonprofit_id : int
        The unique identifier of the requesting organization. This will need to be found from a search using inputs
        elicited from the requester
    funder_id : int
        The unique identifier of the funding organization which may be awarding a grant to the requester.
        This will need to be found from a search using inputs elicited from the requester, or from recommendations
    project_description : str
        Natural language text describing the project/program that the user is requesting funding for
    budget : int
        The dollar amount (in USD) that the user is requesting for funding. This should be specified by the user,
        prompt for this if needed.

    Returns
    -------
    str
        Formatted letter of interest
    """

    client = httpx.Client(headers={**LOI_WRITER.header}, timeout=30, base_url=LOI_WRITER.url)  # type: ignore

    def _make_request(ept: str, payload: dict):
        # return httpx.get(
        #     url=LOI_WRITER.endpoint(ept),
        #     params=payload,
        #     headers={**LOI_WRITER.header},  # type: ignore
        #     timeout=30
        # ).json().get("response", "")

        return client.get(url=LOI_WRITER.endpoint(ept), params=payload).json().get("response", "")

    data = _make_request(
        ept="organization/autofill",
        payload={"recipient_candid_entity_id": nonprofit_id, "funder_candid_entity_id": funder_id}
    )

    recip: dict = data.get("recipient_data", {})
    funder: dict = data.get("funder_data", {})
    pair_history: str = data.get("funding_history_text", "")

    sections = (
        ("opening", "writer/opening"),
        ("organization description", "writer/org"),
        ("need statement", "writer/need"),
        ("project description", "writer/project"),
        ("funding request", "writer/fund"),
        ("conclusion", "writer/conclusion")
    )

    output = LetterOfInterest()
    for section, endpoint in sections:
        if section == "opening":
            output.opening = _make_request(
                ept=endpoint,
                payload={
                    "funder_name": [
                        n["name"] for n in funder.get("org_data", {}).get("names", [])
                        if n["name_type"] == "main"
                    ][0],
                    "recipient_name": [
                        n["name"] for n in recip.get("org_data", {}).get("names", [])
                        if n["name_type"] == "main"
                    ][0],
                    "project_purpose": project_description,
                    "amount": budget,
                    "prior_contact": None,
                    "connection": None
                }
            )
        elif section == "organization description":
            output.org_desc = _make_request(
                ept=endpoint,
                payload={
                    "opening": output.opening,
                    "history": pair_history,
                    "recipient_mission_statement": recip.get("mission_statement_text", ""),
                    "capacity": recip.get("capacity_text", ""),
                    "path": None,
                    "accomplishment": recip.get("data_text", "")
                }
            )
        elif section == "need statement":
            output.need = httpx.get(
                url=GOLDEN_ORG.endpoint(endpoint),
                params={
                    "recipient_desc": output.org_desc,
                    "funder_mission_statement": funder.get("mission_statement_text", ""),
                    "target": None,
                    "data": None,
                },
                headers={**GOLDEN_ORG.header},  # type: ignore
                timeout=30
            ).json().get("response", "")
        elif section == "project description":
            output.project = _make_request(
                ept=endpoint,
                payload={
                    "need": output.need,
                    "projects": project_description,
                    "desired_objectives": None,
                    "major_activities": None,
                    "key_staff": None,
                    "stand_out": None,
                    "success": None
                }
            )
        elif section == "funding request":
            output.request = _make_request(
                ept=endpoint,
                payload={
                    "project_desc": output.project,
                    "amount": budget,
                    "funding_history": pair_history,
                    "other_funding": None,
                }
            )
        elif section == "conclusion":
            output.conclusion = _make_request(
                ept=endpoint,
                payload={
                    "funding_request": output.request,
                    "project_desc": output.project,
                    "follow_up": recip.get("contact_text", ""),
                }
            )

    client.close()
    return output.letter
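The intended call sequence for the two letter tools, with placeholder IDs (a real flow would resolve them via the organization search and funder recommendation tools first):

from ask_candid.tools.letter_gen import estimate_budget, draft_loi

args = {
    "nonprofit_id": 1234567,   # placeholder; resolve via organization search
    "funder_id": 7654321,      # placeholder; resolve via funder recommendations
    "project_description": "after-school STEM programming for middle schoolers",
}
guidance = estimate_budget.invoke(args)  # optional step; user may already know the budget
letter = draft_loi.invoke({**args, "budget": 50000})
print(letter)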
ask_candid/tools/nlp.py  ADDED (new file, 83 lines)

from typing import Any
import logging

from langchain_core.tools import tool
import httpx

from ask_candid.base.utils import retry_on_status
from ask_candid.base.config.rest import AUTOCODING, DOCUMENT

logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)


@retry_on_status(num_retries=3)
def get_with_retries(url: str, payload: dict[str, Any] | None, headers: dict[str, str] | None) -> httpx.Response:
    with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
        return client.get(url=url, params=payload, headers=headers)


@tool
def autocode(text: str) -> dict[str, list] | str:
    """Uses natural language processing to align some input text to Candid's taxonomy: https://taxonomy.candid.org.
    The taxonomy describes activity in the social and philanthropic sectors.

    Parameters
    ----------
    text : str
        Text describing work in the social sector. This should be related to the social and/or philanthropic sector.

    Returns
    -------
    dict[str, list] | str
        Taxonomy responses. The keys of the dictionary are individual taxonomy facets, and the items in the dictionary
        are each term which the NLP model has determined is relevant given the input text. This also includes a
        confidence score.
    """

    r = httpx.get(
        url=AUTOCODING.endpoint("predict"),
        params={"text": text},
        headers={**AUTOCODING.header}  # type: ignore
    )

    if r.status_code != 200:
        logger.error("Error calling autocoding API %s. Error: %s", str(r.request.url), r.reason_phrase)
        return f"Error calling autocoding. Error: {r.reason_phrase}"

    data: dict = r.json().get("data", {})
    return {k: v for k, v in data.items() if k in {"subject", "population"}}


@tool
def geo_detect(text: str) -> list[dict[str, Any]] | str:
    """Uses natural language processing to find and match named geographies found in the supplied text. The output
    will supply identified geographies from [Geonames](https://www.geonames.org/).

    Parameters
    ----------
    text : str
        Text describing work in the social sector. This should be related to the social and/or philanthropic sector.

    Returns
    -------
    list[dict[str, Any]] | str
        Matched geography responses. This is an array of JSON objects which contain the `name` of the geography as it
        appeared in the supplied text, and the best match to a Geonames geography. For many Candid knowledge tools the
        `geonames_id` value will be most useful.
        If output is a string then that means there was some error, and retry should be considered
    """

    r = get_with_retries(
        url=DOCUMENT.endpoint("entities/geographies"),
        payload={"text": text, "only_best_match": True},
        headers={**DOCUMENT.header}
    )
    assert isinstance(r, httpx.Response)
    if r.status_code != 200:
        logger.error("Error calling geo detection API %s. Error: %s", str(r.request.url), r.reason_phrase)
        return f"Error calling geo detection. Error: {r.reason_phrase}"

    data: list = r.json().get("entities", [])
    return [{"name": entity["name"], "match": entity["match"][:1]} for entity in data if entity.get("type") == "geo"]
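These two tools feed the code and geography parameters of the grants and recommendation tools; a sketch of that chaining (the exact response payload shapes are an assumption based on the docstrings):

from ask_candid.tools.nlp import autocode, geo_detect

desc = "emergency shelter beds for homeless youth in Buffalo, NY"
codes = autocode.invoke({"text": desc})   # expected shape: {"subject": [...], "population": [...]}
geos = geo_detect.invoke({"text": desc})  # expected shape: [{"name": ..., "match": [...]}]
print(codes if isinstance(codes, dict) else f"retry: {codes}")
print(geos)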
ask_candid/tools/recommendations.py  ADDED (new file, 287 lines; the diff view below is truncated)

from typing import Literal, Any
import logging

from langchain_core.tools import tool
import httpx

from ask_candid.tools.utils import format_candid_profile_link
from ask_candid.base.utils import retry_on_status
from ask_candid.base.config.rest import FUNDER_RECOMMENDATION, SEARCH

logging.basicConfig(format="[%(levelname)s] (%(asctime)s) :: %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)


@retry_on_status(num_retries=3)
def get_with_retries(url: str, payload: dict[str, Any] | None, headers: dict[str, str] | None) -> httpx.Response:
    with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
        return client.get(url=url, params=payload, headers=headers)


@tool
def organization_search(
    query: str,
    located_postal_code: str | None = None,
    located_admin1: str | None = None,
    search_mode: Literal["organization_only", "organization_and_grants"] | None = "organization_only"
) -> list[dict[str, str | None]] | str:
    """Search for organizations by name, description of work, program descriptions and locations. Here are some
    guidelines:
    * `query` controls hybrid searching involving both vector search and keyword search
    * `query` can be used to find organizations based on a description of work
    * if the query is intended to be a lookup of an organization by name, then adding quotes around the `query` string
      circumvents vector search, and prioritizes keyword matching on names (eg. `query=Candid` --> `query='Candid'`)
    * if the query is an EIN (eg. 12-3456789) then keyword searching is prioritized to get exact matches
    * adding location information such as postal codes and/or admin1 (state/province abbreviations) will filter results

    This tool should be used as a first step in any downstream task which requires identifying the nonprofit that the
    user is identifying with. Often, the `nonprofit_id` is required, and that can be found via a search.

    Parameters
    ----------
    query : str
        Free text query which drives the search functionality. This uses a hybrid approach of vector and keyword
        searching, but under certain conditions expressed in the 'guidelines' this may disable vector search.
    located_postal_code : str | None, optional
        Postal code of the organization to be searched, if provided, by default None
    located_admin1 : str | None, optional
        Admin1 code (state/province abbreviation) of the organization to be searched, if provided, by default None
    search_mode : Literal["organization_only", "organization_and_grants"] | None, optional
        Choose how to search for organizations; if `None` or "organization_and_grants" then this will examine evidence
        at the organization level as well as at the historical grant transaction level capturing activity evidence.
        For name lookups it is best to use the "organization_only" default value, by default "organization_only"

    Returns
    -------
    list[dict[str, str | None]] | str
        List of the top organization search results
        If output is a string then that means there was some error, and retry should be considered
    """

    payload = {"query": query, "searchMode": search_mode, "rowCount": 5}
    if located_postal_code is not None:
        payload["postalCode"] = located_postal_code
    if located_admin1 is not None:
        payload["admin1"] = located_admin1

    with httpx.Client(transport=httpx.HTTPTransport(retries=3), timeout=30) as client:
        r = client.get(
            url=SEARCH.endpoint("v1/search"),
            params=payload,
            headers={**SEARCH.header}  # type: ignore
        )

    if r.status_code != 200:
        logger.error("Error calling organization search API %s. Error: %s", str(r.request.url), r.reason_phrase)
        return f"Error calling organization search. Error: {r.reason_phrase}"

    data: dict = r.json()

    output = []
    for org in data.get("returnedOrgs") or []:
        working_on, serving = [], []
        for code, description in org["taxonomy"].items():
            code: str
            description: str

            if code.startswith('P') and len(code) > 2:
                serving.append(description.lower())
            elif code.startswith('S'):
                working_on.append(description.lower())

        output.append({
            "nonprofit_id": org["candidEntityID"],
            "name": org["orgName"],
            "aka_name": org["akaName"],
            "acronym": org["acronymName"],
            "city": org["city"],
            "admin1": org["admin1"],
            "country": org["countryName"],
            "EIN": org["ein"],
            "profile_link": format_candid_profile_link(org['candidEntityID']),
            "working_on": f"Working on {', '.join(working_on)}",
            "serving": f"Serving population groups {', '.join(serving)}",
            "transparency_level": org["seal"].get("description"),
            "organization_roles": ', '.join(org["roles"]),
            "grants_awarded": ', '.join([f"{k}: {v}" for k, v in org["transactionsGiven"].items()]),
            "grants_received": ', '.join([f"{k}: {v}" for k, v in org["transactionsReceived"].items()])
        })
    return output


@tool
def recommend_funders(
    nonprofit_id: int,
    subject_codes_of_program: str | None = None,
    populations_served_codes_of_program: str | None = None,
    geonameids_of_geographies_served: str | None = None,
    include_past_funders: bool = False
) -> tuple[dict[str, Any], list[dict[str, Any]]] | str:
    """Recommend potential funding organizations to a nonprofit seeking a grant.

    These recommendations are built using machine learning over a heterogeneous knowledge graph representing the work
    of the requesting organization, and the contextual recent activities of potential funders and their grant
    recipients.

    While extra subject codes, populations served codes, and geography IDs for where the program takes place are not
    required, recommendations tend to improve and become more specific the more information can be provided.

    Subjects and populations can be determined using the `autocode` tool if the requester can supply a description of
    the program they are seeking funding for.

    Geographies can be determined using the geo detection tool if the requester can supply a description of the
    program they are seeking funding for.

    Key Usage Requirements:
    - Always incorporate returned profile URLs directly into the response text
    - Replace funding organization name mentions with hyperlinked Candid profile URLs
    - Prioritize creating a seamless user experience by making URLs contextually relevant
    - Use relevant recipient data as well as inferred metadata to provide explanations about recommendation relevance

    Parameters
    ----------
    nonprofit_id : int
        The unique identifier of the requesting organization. This will need to be found from a search using inputs
        elicited from the requester
    subject_codes_of_program : str | None, optional
        Subject codes from Candid's PCS taxonomy, comma separated, by default None
    populations_served_codes_of_program : str | None, optional
        Population groups served codes from Candid's PCS taxonomy, comma separated, by default None
    geonameids_of_geographies_served : str | None, optional
        Geonames ID values for geographies served by the requester's program, comma separated, by default None
    include_past_funders : bool, optional
        Boolean flag to indicate whether previous funders of the input organization identified by the `nonprofit_id`
        should be included. If the requester would like to reconsider previous funding organizations then set this to
        `True`, but the requester MUST be prompted to indicate this preference. Using the default value will help the
        requester discover new, potentially relevant funders, by default False

    Examples
    --------
    >>> recommend_funders(nonprofit_id=9981881)
    >>> recommend_funders(
        nonprofit_id=9173173,
        subject_codes_of_program='SS050000,SS000000,SB050000',
        populations_served_codes_of_program='PJ050100',
        geonameids_of_geographies_served='4094212,4094212'
    )

    Returns
    -------
    tuple[dict[str, Any], list[dict[str, Any]]] | str
        (Inferred data used to generate recommendations, array of funders being recommended)
        If output is a string then that means there was some error, and retry should be considered
    """

    payload = {
        "candid_entity_id": nonprofit_id,
        "use_programs": True,
        "top_k": 5,
        "include_past_funders": include_past_funders
    }

    if subject_codes_of_program is not None:
        payload["subjects"] = subject_codes_of_program
    if populations_served_codes_of_program is not None:
        payload["populations"] = populations_served_codes_of_program
    if geonameids_of_geographies_served:
        payload["geos"] = geonameids_of_geographies_served

    r = get_with_retries(
        url=FUNDER_RECOMMENDATION.endpoint("funder/pcs-v3"),
        payload=payload,
        headers={**FUNDER_RECOMMENDATION.header}
    )
    assert isinstance(r, httpx.Response)
    if r.status_code != 200:
        logger.error("Error calling funder recommendations API %s. Error: %s", str(r.request.url), r.reason_phrase)
        return f"Error calling funder recommendations. Error: {r.reason_phrase}"

    data: dict = r.json()
    return (
        data.get("meta") or {},
        [{
            **rec,
            "candid_profile_url": format_candid_profile_link(rec['funder_id'])
        } for rec in (data.get("recommendations") or [])]
    )


@tool
def recommend_funding_opportunities(
    nonprofit_id: int,
    subject_codes_of_program: str | None = None,
    populations_served_codes_of_program: str | None = None,
    geonameids_of_geographies_served: str | None = None
) -> tuple[dict[str, Any], list[dict[str, Any]]] | str:
    """Recommend active funding opportunities (RFPs) to a nonprofit seeking a grant.

    These recommendations are built using machine learning over a heterogeneous knowledge graph representing the work
    of the requesting organization, and the contextual recent activities of potential funders and their grant
    recipients.

    While extra subject codes, populations served codes, and geography IDs for where the program takes place are not
    required, recommendations tend to improve and become more specific the more information can be provided.

    Subjects and populations can be determined using the `autocode` tool if the requester can supply a description of
    the program they are seeking funding for.
Subjects and populations can be determined using the `autocode` tool if the requester can supply a description of
|
225 |
+
the program they are seeking funding for.
|
226 |
+
|
227 |
+
Key Usage Requirements:
|
228 |
+
- Always incorporate returned profile URLs directly into the response text
|
229 |
+
- Replace funding organization name mentions with hyperlinked Candid profile URLs
|
230 |
+
- Prioritize creating a seamless user experience by making URLs contextually relevant
|
231 |
+
- Use inferred metadata to provide explanations about recommendation relevance
|
232 |
+
|
233 |
+
Parameters
|
234 |
+
----------
|
235 |
+
nonprofit_id : int
|
236 |
+
The unique identifier of the requesting organization. This will need to be found from a search using inputs
|
237 |
+
elicited from the requeter
|
238 |
+
subject_codes_of_program : str | None, optional
|
239 |
+
Subject codes from Candid's PCS taxonomy, comma separated, by default None
|
240 |
+
populations_served_codes_of_program : str | None, optional
|
241 |
+
Population groups served codes from Candid's PCS taxonomy, comma separated, by default None
|
242 |
+
geonameids_of_geographies_served : str | None, optional
|
243 |
+
Geonames ID values for geographies served by the requester's program, comma separted, by default None
|
244 |
+
|
245 |
+
Examples
|
246 |
+
--------
|
247 |
+
>>> recommend_funding_opportunities(nonprofit_id=9981881)
|
248 |
+
>>> recommend_funding_opportunities(
|
249 |
+
nonprofit_id=9173173,
|
250 |
+
subject_codes_of_program='SS050000, SS000000,SB050000',
|
251 |
+
populations_served_codes_of_program='PJ050100',
|
252 |
+
geonameids_of_geographies_served='4094212,4094212'
|
253 |
+
)
|
254 |
+
|
255 |
+
Returns
|
256 |
+
-------
|
257 |
+
tuple[dict[str, Any], list[dict[str, Any]]] | str
|
258 |
+
(Inferred data used to generate recommendations, array of active funding opportunities being recommended)
|
259 |
+
If output is a string then that means there was some error, and retry should be considered
|
260 |
+
"""
|
261 |
+
|
262 |
+
payload = {"candid_entity_id": nonprofit_id, "use_programs": True, "top_k": 5}
|
263 |
+
if subject_codes_of_program is not None:
|
264 |
+
payload["subjects"] = subject_codes_of_program
|
265 |
+
if populations_served_codes_of_program is not None:
|
266 |
+
payload["populations"] = populations_served_codes_of_program
|
267 |
+
if geonameids_of_geographies_served:
|
268 |
+
payload["geos"] = geonameids_of_geographies_served
|
269 |
+
|
270 |
+
r = get_with_retries(
|
271 |
+
url=FUNDER_RECOMMENDATION.endpoint("rfp/pcs-v3"),
|
272 |
+
payload=payload,
|
273 |
+
headers={**FUNDER_RECOMMENDATION.header}
|
274 |
+
)
|
275 |
+
assert isinstance(r, httpx.Response)
|
276 |
+
if r.status_code != 200:
|
277 |
+
logger.error("Error calling RFP recommendation API %s. Error: %s", str(r.request.url), r.reason_phrase)
|
278 |
+
return f"Error calling RFP recommendations. Error: {r.reason_phrase}"
|
279 |
+
|
280 |
+
data: dict = r.json()
|
281 |
+
return (
|
282 |
+
data.get("meta") or {},
|
283 |
+
[{
|
284 |
+
**r,
|
285 |
+
"candid_profile_url": format_candid_profile_link(r['funder_id'])
|
286 |
+
} for r in (data.get("recommendations") or [])]
|
287 |
+
)
|
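Both new tools are standard LangChain `@tool` objects, so they can be smoke-tested outside the agent loop via `.invoke`. A minimal sketch, assuming this Space's environment and network access to the recommendation API; the entity ID and subject code are the hypothetical values from the docstrings above, not verified Candid IDs:

    from ask_candid.tools.recommendations import recommend_funders

    # LangChain tools take a dict of arguments through .invoke()
    result = recommend_funders.invoke({
        "nonprofit_id": 9981881,               # hypothetical ID from the docstring example
        "subject_codes_of_program": "SS050000",
        "include_past_funders": False
    })

    if isinstance(result, str):
        # A string return signals an upstream API failure; a retry is reasonable
        print("recommendation call failed:", result)
    else:
        meta, funders = result
        for funder in funders:
            print(funder["funder_id"], funder["candid_profile_url"])

Returning a plain string on failure, rather than raising, keeps the error legible to the calling LLM, which cannot catch exceptions.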
ask_candid/tools/search.py
CHANGED

@@ -1,7 +1,7 @@
 from langchain_core.documents import Document
 from langchain_core.tools import tool

-from ask_candid.
+from ask_candid.services.knowledge_base import (
     SourceNames,
     generate_queries,
     run_search,
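The only change here is the import path: the search helpers now come from the new knowledge-base service. A quick import-level check of the rename (the three names are taken from the hunk above; anything beyond them is an assumption):

    # Fails with ImportError if the move missed a symbol search.py depends on
    from ask_candid.services.knowledge_base import SourceNames, generate_queries, run_search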
ask_candid/utils.py
CHANGED

@@ -1,47 +1,15 @@
-from typing import
+from typing import Any
 from uuid import uuid4

 from langchain_core.documents import Document

-from ask_candid.retrieval.sources import (
-    candid_blog,
-    candid_help,
-    candid_learning,
-    issuelab,
-    youtube
-)
-
-
-def html_format_doc(doc: Dict[str, Any], source: str, show_chunks=False) -> str:
-    height_px = 200
-    html = ""
-
-    if source == "news":
-        # html = news.article_card_html(doc, height_px, show_chunks)
-        pass
-    elif source == "transactions":
-        pass
-    elif source == "organizations":
-        pass
-    elif source == "issuelab":
-        html = issuelab.issuelab_card_html(doc, height_px, show_chunks)
-    elif source == "youtube":
-        html = youtube.build_card_html(doc, 400, show_chunks)
-    elif source == "candid_blog":
-        html = candid_blog.build_card_html(doc, height_px, show_chunks)
-    elif source == "candid_learning":
-        html = candid_learning.build_card_html(doc, height_px, show_chunks)
-    elif source == "candid_help":
-        html = candid_help.build_card_html(doc, height_px, show_chunks)
-    return html
-
-
-def html_format_docs_chat(docs: List[Document]) -> str:
+def html_format_docs_chat(docs: list[Document]) -> str:
     """Formats Candid sources

     Parameters
     ----------
-    docs :
+    docs : list[Document]
         Retrieved documents for context

     Returns
@@ -69,7 +37,7 @@ def html_format_docs_chat(docs: List[Document]) -> str:
     return html


-def format_chat_ag_response(chatbot:
+def format_chat_ag_response(chatbot: list[Any]) -> list[Any]:
     """If we called retriever, we appended sources as one more message. Here we concatenate HTML of sources
     with the AI response
     Returns:
@@ -89,7 +57,7 @@ def valid_inputs(*args) -> bool:
     return any(a is not None or (isinstance(a, str) and a.strip() != '') for a in args)


-def get_session_id(thread_id:
+def get_session_id(thread_id: str | None) -> str:
     if not thread_id:
         thread_id = uuid4().hex
     return thread_id
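The surviving helpers keep their behavior: annotations move from `typing` aliases to builtin generics, and `html_format_doc` goes away along with the per-source card imports it needed. A small sketch of the `get_session_id` contract as now annotated, using a hypothetical caller-supplied ID:

    from ask_candid.utils import get_session_id

    # Falsy thread IDs mint a fresh 32-character hex UUID; real IDs pass through unchanged
    assert len(get_session_id(None)) == 32
    assert get_session_id("thread-abc") == "thread-abc"   # "thread-abc" is a hypothetical ID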
requirements.txt
CHANGED

@@ -9,7 +9,7 @@ langgraph-prebuilt==0.6.4
 pydantic==2.10.6
 pyopenssl>22.0.0
 python-dotenv
-transformers
+transformers>=4.56.1

 --find-links https://download.pytorch.org/whl/cpu
 torch
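With `transformers` floored instead of left unpinned, a stale environment can be caught at startup rather than deep inside a model call. A sketch of a runtime sanity check, assuming the `packaging` library is available (it usually ships as a transitive dependency of this stack, but that is an assumption):

    import transformers
    from packaging.version import Version

    # requirements.txt now demands transformers>=4.56.1; fail fast on a cached old wheel
    assert Version(transformers.__version__) >= Version("4.56.1"), transformers.__version__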