mgbam committed on
Commit
db31f82
Β·
verified Β·
1 Parent(s): e5581c3

Update genesis/tools.py

Browse files
Files changed (1) hide show
  1. genesis/tools.py +27 -151
genesis/tools.py CHANGED
@@ -1,151 +1,27 @@
1
- from __future__ import annotations
2
- import os, json
3
- import httpx
4
- from typing import Any, Dict, List
5
-
6
- class ToolBase:
7
- name: str = "tool"
8
- description: str = ""
9
- async def call(self, *args, **kwargs) -> Dict[str, Any]:
10
- raise NotImplementedError
11
-
12
- # β€” Ontology normalization (BioPortal)
13
- class OntologyTool(ToolBase):
14
- name = "ontology_normalize"
15
- description = "Normalize biomedical terms via BioPortal; returns concept info (no protocols)."
16
-
17
- def __init__(self, timeout: float = 20.0):
18
- self.http = httpx.AsyncClient(timeout=timeout)
19
- self.bioportal_key = os.getenv("BIOPORTAL_API_KEY")
20
-
21
- async def call(self, term: str) -> dict:
22
- out = {"term": term, "bioportal": None}
23
- try:
24
- if self.bioportal_key:
25
- r = await self.http.get(
26
- "https://data.bioontology.org/search",
27
- params={"q": term, "pagesize": 5},
28
- headers={"Authorization": f"apikey token={self.bioportal_key}"},
29
- )
30
- out["bioportal"] = r.json()
31
- except Exception as e:
32
- out["bioportal_error"] = str(e)
33
- return out
34
-
35
- # β€” PubMed search (NCBI E-utilities)
36
- class PubMedTool(ToolBase):
37
- name = "pubmed_search"
38
- description = "Search PubMed via NCBI; return metadata with citations."
39
-
40
- def __init__(self, timeout: float = 20.0):
41
- self.http = httpx.AsyncClient(timeout=timeout)
42
- self.key = os.getenv("NCBI_API_KEY")
43
- self.email = os.getenv("NCBI_EMAIL")
44
-
45
- async def call(self, query: str) -> dict:
46
- base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
47
- try:
48
- es = await self.http.get(
49
- base + "esearch.fcgi",
50
- params={"db":"pubmed","term":query,"retmode":"json","retmax":20,"api_key":self.key,"email":self.email},
51
- )
52
- ids = es.json().get("esearchresult", {}).get("idlist", [])
53
- if not ids:
54
- return {"query": query, "results": []}
55
- su = await self.http.get(
56
- base + "esummary.fcgi",
57
- params={"db":"pubmed","id":",".join(ids),"retmode":"json","api_key":self.key,"email":self.email},
58
- )
59
- recs = su.json().get("result", {})
60
- items = []
61
- for pmid in ids:
62
- r = recs.get(pmid, {})
63
- items.append({
64
- "pmid": pmid,
65
- "title": r.get("title"),
66
- "journal": r.get("fulljournalname"),
67
- "year": (r.get("pubdate") or "")[:4],
68
- "authors": [a.get("name") for a in r.get("authors", [])],
69
- })
70
- return {"query": query, "results": items}
71
- except Exception as e:
72
- return {"query": query, "error": str(e)}
73
-
74
- # β€” RCSB structure metadata
75
- class StructureTool(ToolBase):
76
- name = "structure_info"
77
- description = "Query RCSB structure metadata (no lab steps)."
78
-
79
- def __init__(self, timeout: float = 20.0):
80
- self.http = httpx.AsyncClient(timeout=timeout)
81
-
82
- async def call(self, pdb_id: str) -> dict:
83
- out = {"pdb_id": pdb_id}
84
- try:
85
- r = await self.http.get(f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}")
86
- r.raise_for_status()
87
- out["rcsb_core"] = r.json()
88
- except Exception as e:
89
- out["error"] = str(e)
90
- return out
91
-
92
- # β€” Crossref DOIs
93
- class CrossrefTool(ToolBase):
94
- name = "crossref_search"
95
- description = "Crossref search for DOIs; titles, years, authors."
96
-
97
- def __init__(self, timeout: float = 20.0):
98
- self.http = httpx.AsyncClient(timeout=timeout)
99
-
100
- async def call(self, query: str) -> dict:
101
- try:
102
- r = await self.http.get("https://api.crossref.org/works", params={"query": query, "rows": 10})
103
- items = r.json().get("message", {}).get("items", [])
104
- papers = []
105
- for it in items:
106
- papers.append({
107
- "title": (it.get("title") or [None])[0],
108
- "doi": it.get("DOI"),
109
- "year": (it.get("issued") or {}).get("date-parts", [[None]])[0][0],
110
- "authors": [f"{a.get('given','')} {a.get('family','')}".strip() for a in it.get("author", [])],
111
- })
112
- return {"query": query, "results": papers}
113
- except Exception as e:
114
- return {"query": query, "error": str(e)}
115
-
116
- # β€” HF Inference API Reranker (optional)
117
- class HFRerankTool(ToolBase):
118
- name = "hf_rerank"
119
- description = "Rerank documents using a Hugging Face reranker model (API)."
120
-
121
- def __init__(self, model_id: str):
122
- self.model = model_id
123
- self.hf_token = os.getenv("HF_TOKEN")
124
- self.http = httpx.AsyncClient(timeout=30.0)
125
-
126
- async def call(self, query: str, documents: List[str]) -> dict:
127
- if not self.hf_token:
128
- return {"error": "HF_TOKEN not set"}
129
- try:
130
- # Generic payload; different models may expect different schemas β€” keep robust.
131
- payload = {"inputs": {"query": query, "texts": documents}}
132
- r = await self.http.post(
133
- f"https://api-inference.huggingface.co/models/{self.model}",
134
- headers={"Authorization": f"Bearer {self.hf_token}"},
135
- json=payload,
136
- )
137
- data = r.json()
138
- # Try to interpret scores
139
- scores = []
140
- if isinstance(data, dict) and "scores" in data:
141
- scores = data["scores"]
142
- elif isinstance(data, list) and data and isinstance(data[0], dict) and "score" in data[0]:
143
- scores = [x.get("score", 0.0) for x in data]
144
- else:
145
- # Fallback: equal scores
146
- scores = [1.0 for _ in documents]
147
- # Sort indices by score desc
148
- order = sorted(range(len(documents)), key=lambda i: scores[i], reverse=True)
149
- return {"order": order, "scores": scores, "raw": data}
150
- except Exception as e:
151
- return {"error": str(e)}
 
1
+ # genesis/tools.py
2
+ """
3
+ Utility functions for GENESIS-AI
4
+ """
5
+
6
import re
from datetime import datetime, timezone
8
+
9
def extract_citations(text):
    """Extract DOI, PMID, and URL citations from *text*.

    Scans the text with three regexes (bare DOI, ``PMID: nnn``, and
    http(s) URL) and returns citation records in order of first
    appearance.  Each resolved URL is reported at most once, so a DOI
    or PMID mentioned several times (or also present as an explicit
    doi.org / pubmed link) yields a single entry instead of duplicates.

    Args:
        text: Free text that may contain citations.

    Returns:
        list[dict]: records with keys ``type`` ("DOI" | "PMID" | "URL"),
        ``id`` (identifier, empty string for bare URLs), and ``url``.
    """
    citations = []
    seen_urls = set()  # dedupe across ALL citation types by resolved URL

    def _add(ctype, cid, url):
        # Keep only the first occurrence of any resolved URL.
        if url not in seen_urls:
            seen_urls.add(url)
            citations.append({"type": ctype, "id": cid, "url": url})

    doi_pattern = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+)"
    pmid_pattern = r"PMID:\s*(\d+)"
    url_pattern = r"(https?://[^\s)]+)"

    for match in re.finditer(doi_pattern, text, re.IGNORECASE):
        doi = match.group(1)
        _add("DOI", doi, f"https://doi.org/{doi}")
    for match in re.finditer(pmid_pattern, text, re.IGNORECASE):
        pmid = match.group(1)
        _add("PMID", pmid, f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/")
    for match in re.finditer(url_pattern, text, re.IGNORECASE):
        _add("URL", "", match.group(1))
    return citations
24
+
25
def timestamp():
    """Return the current UTC time as a naive ISO-8601 string.

    Uses the timezone-aware ``datetime.now(timezone.utc)`` instead of the
    deprecated ``datetime.utcnow()`` (deprecated since Python 3.12), then
    drops the tzinfo so the output format (no ``+00:00`` offset) stays
    identical for existing callers.

    Returns:
        str: e.g. ``"2024-01-31T12:34:56.789012"``.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()