|
|
|
"""MedGenesis – knowledge‑graph builder for Streamlit‑Agraph. |
|
|
|
This version recognises **all new enrichment layers** introduced in the |
|
latest orchestrator: |
|
• UMLS concepts → green nodes |
|
• MyGene / NCBI gene hits → purple nodes |
|
• openFDA / DrugCentral drugs → orange nodes |
|
• ClinicalTrials.gov studies → pink nodes |
|
• Open Targets associations → red gene–disease edges (disease nodes are green)
|
• Literature papers → blue nodes (tooltip = title) |
|
|
|
The entry‑point `build_agraph` now receives a richer payload and returns |
|
*(nodes, edges, config)* ready for `streamlit_agraph.agraph`. |
|
""" |
|
from __future__ import annotations |
|
|
|
import re |
|
from typing import List, Dict, Tuple |
|
|
|
from streamlit_agraph import Node, Edge, Config |
|
|
|
|
|
|
|
|
|
# Hex colours used by the graph builder, one per kind of node or edge.
C_PAPER = "#0984e3"    # literature papers
C_CONCEPT = "#00b894"  # UMLS concepts (also used for Open Targets disease nodes)
C_GENE = "#6c5ce7"     # genes
C_DRUG = "#d35400"     # drugs
C_TRIAL = "#fd79a8"    # clinical trials
C_OT_EDGE = "#c0392b"  # Open Targets association edges
|
|
|
|
|
|
|
|
|
|
|
|
|
def _add_node(nodes: List[Node], node_id: str, label: str, color: str, tooltip: str | None = None, size: int = 25):
    """Register a new ``Node`` in *nodes* unless its id is already taken.

    Parameters
    ----------
    nodes : list that is extended in place.
    node_id : identifier compared against the ``id`` of every existing node.
    label, color, size, tooltip : forwarded unchanged to ``Node``.

    Returns
    -------
    None.  When a node with ``node_id`` already exists the list is left
    untouched (the original author notes that duplicate ids crash agraph).
    """
    for existing in nodes:
        if existing.id == node_id:
            # Already registered: keep the first definition.
            return

    fresh = Node(id=node_id, label=label, color=color, size=size, tooltip=tooltip)
    nodes.append(fresh)
|
|
|
|
|
def _match(text: str, pattern: str) -> bool:
    """Tell whether *pattern* occurs in *text* as a literal, ignoring case.

    The pattern is escaped first, so regex metacharacters in it are matched
    verbatim.  An empty pattern matches any text.
    """
    literal = re.compile(re.escape(pattern), re.I)
    return literal.search(text) is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
def _drug_display_name(rec: Dict, fallback: str) -> str:
    """Return a printable drug name for one safety record, or *fallback*.

    Candidates are tried in the order ``drug_name``, ``patient.drug``,
    ``medicinalproduct``.  Only non-blank strings are accepted because the
    name is used both as a node label and as a search term.  When
    ``patient.drug`` is a list of dicts, each item's ``medicinalproduct`` is
    considered (presumably the openFDA adverse-event layout; confirm against
    the orchestrator's output).
    """
    patient = rec.get("patient")
    nested = patient.get("drug") if isinstance(patient, dict) else None
    if isinstance(nested, list):
        nested_names = [d.get("medicinalproduct") for d in nested if isinstance(d, dict)]
    else:
        nested_names = [nested]

    candidates = [rec.get("drug_name"), *nested_names, rec.get("medicinalproduct")]
    for value in candidates:
        if isinstance(value, str) and value.strip():
            return value
    return fallback


def _trial_identity(study: Dict) -> Tuple[str | None, str | None]:
    """Return ``(nct_id, brief_title)`` for one trial dict.

    Flat keys (``nctId`` / ``nctid`` / ``briefTitle``) take precedence.  If
    they are missing, ``protocolSection.identificationModule`` is consulted,
    which is where unflattened ClinicalTrials.gov v2 studies are expected to
    keep these fields (TODO confirm against the orchestrator's payload).
    """
    section = study.get("protocolSection")
    ident = section.get("identificationModule") if isinstance(section, dict) else None
    if not isinstance(ident, dict):
        ident = {}
    nct = study.get("nctId") or study.get("nctid") or ident.get("nctId")
    title = study.get("briefTitle") or ident.get("briefTitle")
    return nct, title


def build_agraph(
    papers: List[Dict],
    umls: List[Dict],
    drug_safety: List[Dict],
    genes: List[Dict] | None = None,
    trials: List[Dict] | None = None,
    ot_associations: List[Dict] | None = None,
) -> Tuple[List[Node], List[Edge], Config]:
    """Return (nodes, edges, config) for streamlit_agraph. Safe-duplicates.

    Nodes are created for concepts, genes, drugs, trials, papers and Open
    Targets diseases.  A "mentions" edge links a paper to every concept,
    gene or drug whose name occurs (case-insensitively) in the paper's title
    or summary.  Each Open Targets row adds a gene-to-disease edge labelled
    with its score.

    Parameters
    ----------
    papers : PubMed / arXiv merged list (dicts with title & summary).
    umls : List of UMLS concept dicts `{cui, name}`; entries missing
        either field are ignored.
    drug_safety : openFDA / DrugCentral outputs (mixed dict / list).
        Entries that are not dicts are skipped.
    genes : Optional list with MyGene/NCBI dicts (symbol, name,...).
        ``symbol`` is preferred, ``name`` is the fallback; entries with
        neither are skipped.
    trials : Optional ClinicalTrials.gov v2 studies list.
    ot_associations : Optional list from Open Targets; rows are expected to
        carry ``target.symbol``, ``disease.name`` and ``score``.

    Returns
    -------
    tuple ``(nodes, edges, config)`` ready for ``streamlit_agraph.agraph``.
    """
    nodes: List[Node] = []
    edges: List[Edge] = []
    # (node id, search term) pairs that papers can "mention"; always refers
    # to nodes that really exist so no edge can dangle.
    mention_terms: List[Tuple[str, str]] = []

    # --- UMLS concepts ---------------------------------------------------
    for c in umls or []:
        cui, name = c.get("cui"), c.get("name", "")
        if cui and name:
            cid = f"concept_{cui}"
            _add_node(nodes, cid, name, C_CONCEPT)
            mention_terms.append((cid, name))

    # --- genes -----------------------------------------------------------
    for g in genes or []:
        sym = g.get("symbol") or g.get("name")
        if not sym:
            # Without an identifier this would become a "gene_None" node.
            continue
        gid = f"gene_{sym}"
        _add_node(nodes, gid, sym, C_GENE, tooltip=g.get("summary", ""))
        # Same key for node id and search term, so name-only genes link too.
        mention_terms.append((gid, sym))

    # --- drugs -----------------------------------------------------------
    for i, entry in enumerate(drug_safety or []):
        recs = entry if isinstance(entry, list) else [entry]
        for j, rec in enumerate(recs):
            if not isinstance(rec, dict):
                continue
            did = f"drug_{i}_{j}"
            name = _drug_display_name(rec, did)
            _add_node(nodes, did, name, C_DRUG)
            if name != did:
                # The placeholder label is not a real drug name; do not
                # search papers for it.
                mention_terms.append((did, name))

    # --- clinical trials -------------------------------------------------
    for t in trials or []:
        nct, title = _trial_identity(t)
        if not nct:
            continue
        _add_node(nodes, f"trial_{nct}", nct, C_TRIAL, tooltip=title or "Clinical trial", size=20)

    # --- papers and their "mentions" edges --------------------------------
    for idx, p in enumerate(papers or []):
        pid = f"paper_{idx}"
        # `or ""` keeps an explicit None from being rendered as "None".
        title = p.get("title") or ""
        summary = p.get("summary") or ""
        _add_node(nodes, pid, f"P{idx+1}", C_PAPER, tooltip=title, size=15)

        text_blob = f"{title} {summary}".lower()
        linked = set()  # targets already linked from this paper
        for target_id, term in mention_terms:
            if target_id in linked:
                continue
            if _match(text_blob, term):
                linked.add(target_id)
                edges.append(Edge(source=pid, target=target_id, label="mentions"))

    # --- Open Targets gene-disease associations --------------------------
    seen_pairs = set()
    for row in ot_associations or []:
        if not isinstance(row, dict):
            continue
        target, disease = row.get("target"), row.get("disease")
        gsym = target.get("symbol") if isinstance(target, dict) else None
        dis = disease.get("name") if isinstance(disease, dict) else None
        if not (gsym and dis):
            continue

        try:
            score = float(row.get("score") or 0)
        except (TypeError, ValueError):
            score = 0.0  # unparseable score: still draw the edge

        gid = f"gene_{gsym}"
        did = f"disease_{dis}"
        if (gid, did) in seen_pairs:
            continue
        seen_pairs.add((gid, did))

        # The gene may be absent from `genes`; make sure the edge has a source.
        _add_node(nodes, gid, gsym, C_GENE)
        _add_node(nodes, did, dis, C_CONCEPT, size=20)
        edges.append(Edge(source=gid, target=did, color=C_OT_EDGE, label=f"OT {score:.2f}"))

    cfg = Config(
        directed=False,
        width="100%",
        height="600",
        nodeHighlightBehavior=True,
        highlightColor="#f1c40f",
        collapsible=True,
        showLegend=False,
        node={"labelProperty": "label"},
    )

    return nodes, edges, cfg
|
|