File size: 6,085 Bytes
633ba95 3d1def9 f3dd8bc 633ba95 3d1def9 633ba95 3d1def9 f3dd8bc 633ba95 a392df0 633ba95 a392df0 633ba95 f3dd8bc 633ba95 f3dd8bc 633ba95 f3dd8bc 633ba95 f3dd8bc 633ba95 f3dd8bc 633ba95 f3dd8bc 633ba95 f3dd8bc 633ba95 a392df0 633ba95 f3dd8bc 633ba95 f3dd8bc 633ba95 a392df0 633ba95 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
#!/usr/bin/env python3
"""MedGenesis – knowledge‑graph builder for Streamlit‑Agraph.
This version recognises **all new enrichment layers** introduced in the
latest orchestrator:
• UMLS concepts → green nodes
• MyGene / NCBI gene hits → purple nodes
• openFDA / DrugCentral drugs → orange nodes
• ClinicalTrials.gov studies → pink nodes
• Open Targets associations → red drug–gene / gene–disease edges
• Literature papers → blue nodes (tooltip = title)
The entry‑point `build_agraph` now receives a richer payload and returns
*(nodes, edges, config)* ready for `streamlit_agraph.agraph`.
"""
from __future__ import annotations
import re
from typing import List, Dict, Tuple
from streamlit_agraph import Node, Edge, Config
# ---------------------------------------------------------------------
# Colour palette (flat‑UI)
# ---------------------------------------------------------------------
# Node colour for literature papers (blue).
C_PAPER = "#0984e3"
# Node colour for UMLS concepts (green); build_agraph also reuses it for
# the disease nodes created from Open Targets rows.
C_CONCEPT = "#00b894"
# Node colour for MyGene / NCBI gene hits (purple).
C_GENE = "#6c5ce7"
# Node colour for openFDA / DrugCentral drugs (orange).
C_DRUG = "#d35400"
# Node colour for ClinicalTrials.gov studies (pink).
C_TRIAL = "#fd79a8"
# Edge colour for Open Targets association edges (red).
C_OT_EDGE = "#c0392b"
# ---------------------------------------------------------------------
# Helper builders
# ---------------------------------------------------------------------
def _add_node(nodes: List[Node], node_id: str, label: str, color: str, tooltip: str | None = None, size: int = 25):
"""Append Node only if id not yet present (agraph duplicates crash)."""
if any(n.id == node_id for n in nodes):
return
nodes.append(Node(id=node_id, label=label, color=color, size=size, tooltip=tooltip))
def _match(text: str, pattern: str) -> bool:
return bool(re.search(re.escape(pattern), text, flags=re.I))
# ---------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------
def _dict_items(items) -> List[Dict]:
    """Return only the dict entries of *items*; ``None`` yields an empty list."""
    if items is None:
        return []
    return [item for item in items if isinstance(item, dict)]


def _drug_label(rec: Dict, fallback: str) -> str:
    """Pick a printable drug name out of one drug-safety record.

    Candidates are tried in the order ``drug_name``, ``patient.drug``,
    ``medicinalproduct``; the first non-blank string wins, otherwise
    *fallback* is returned.
    """
    patient = rec.get("patient")
    nested = patient.get("drug") if isinstance(patient, dict) else None
    # NOTE(review): openFDA adverse-event payloads appear to carry
    # ``patient.drug`` as a list of dicts keyed by ``medicinalproduct``;
    # unwrap that shape so a list never reaches the regex matcher.
    if isinstance(nested, list):
        nested = nested[0] if nested else None
    if isinstance(nested, dict):
        nested = nested.get("medicinalproduct")
    for candidate in (rec.get("drug_name"), nested, rec.get("medicinalproduct")):
        if isinstance(candidate, str) and candidate.strip():
            return candidate
    return fallback


def build_agraph(
    papers: List[Dict],
    umls: List[Dict],
    drug_safety: List[Dict],
    genes: List[Dict] | None = None,
    trials: List[Dict] | None = None,
    ot_associations: List[Dict] | None = None,
) -> Tuple[List[Node], List[Edge], Config]:
    """Return ``(nodes, edges, config)`` for ``streamlit_agraph.agraph``.

    Nodes are de-duplicated by id.  Malformed entries (non-dict records,
    records lacking the identifying keys) are skipped instead of raising.

    Parameters
    ----------
    papers : PubMed / arXiv merged list (dicts with ``title`` & ``summary``).
    umls : List of UMLS concept dicts ``{cui, name}``.
    drug_safety : openFDA / DrugCentral outputs; each entry is a dict or a
        list of dicts.
    genes : Optional list of MyGene/NCBI dicts (``symbol``, ``name``,
        ``summary``).
    trials : Optional ClinicalTrials.gov studies list.
    ot_associations : Optional list of Open Targets rows with
        ``target.symbol``, ``disease.name`` and ``score``.

    Returns
    -------
    tuple
        ``(nodes, edges, config)``: the ``Node`` list, the ``Edge`` list
        and the ``Config`` object.
    """
    nodes: List[Node] = []
    edges: List[Edge] = []

    # 1. Concepts ------------------------------------------------------
    # node id -> search term; only concepts that really got a node are
    # linkable, so a missing ``cui`` can no longer yield a dangling edge.
    concept_terms: Dict[str, str] = {}
    for c in _dict_items(umls):
        cui, name = c.get("cui"), c.get("name")
        if cui and isinstance(name, str) and name:
            cid = f"concept_{cui}"
            _add_node(nodes, cid, name, C_CONCEPT)
            concept_terms.setdefault(cid, name)

    # 2. Genes ---------------------------------------------------------
    gene_terms: Dict[str, str] = {}
    for g in _dict_items(genes):
        sym = g.get("symbol") or g.get("name")
        if not (isinstance(sym, str) and sym):
            # No usable identifier: skip rather than create "gene_None".
            continue
        gid = f"gene_{sym}"
        _add_node(nodes, gid, sym, C_GENE, tooltip=g.get("summary", ""))
        gene_terms.setdefault(gid, sym)

    # 3. Drugs (normalize mixed structures) ----------------------------
    drug_terms: List[Tuple[str, str]] = []  # (node_id, drug_name)
    for i, entry in enumerate(drug_safety or []):
        recs = entry if isinstance(entry, list) else [entry]
        for j, rec in enumerate(recs):
            if not isinstance(rec, dict):
                continue
            did = f"drug_{i}_{j}"
            name = _drug_label(rec, did)
            _add_node(nodes, did, name, C_DRUG)
            drug_terms.append((did, name))

    # 4. Trials --------------------------------------------------------
    for t in _dict_items(trials):
        # NOTE(review): raw ClinicalTrials.gov v2 studies appear to keep the
        # id and title under protocolSection.identificationModule; accept
        # that shape as a fallback to the flattened keys. Confirm against
        # the orchestrator output.
        section = t.get("protocolSection")
        ident = section.get("identificationModule") if isinstance(section, dict) else None
        if not isinstance(ident, dict):
            ident = {}
        nct = t.get("nctId") or t.get("nctid") or ident.get("nctId")
        if not nct:
            continue
        tooltip = t.get("briefTitle") or ident.get("briefTitle") or "Clinical trial"
        _add_node(nodes, f"trial_{nct}", nct, C_TRIAL, tooltip=tooltip, size=20)

    # 5. Papers & mention edges ----------------------------------------
    for idx, p in enumerate(_dict_items(papers)):
        pid = f"paper_{idx}"
        _add_node(nodes, pid, f"P{idx+1}", C_PAPER, tooltip=p.get("title", ""), size=15)
        # ``or ''`` keeps an explicit None from becoming the text "None";
        # no lower-casing is needed because _match ignores case.
        text_blob = f"{p.get('title') or ''} {p.get('summary') or ''}"
        for cid, term in concept_terms.items():
            if _match(text_blob, term):
                edges.append(Edge(source=pid, target=cid, label="mentions"))
        for gid, term in gene_terms.items():
            if _match(text_blob, term):
                edges.append(Edge(source=pid, target=gid, label="mentions"))
        for did, term in drug_terms:
            if _match(text_blob, term):
                edges.append(Edge(source=pid, target=did, label="mentions"))

    # 6. Open Targets edges (gene–disease) ------------------------------
    for row in _dict_items(ot_associations):
        target = row.get("target")
        disease = row.get("disease")
        gsym = target.get("symbol") if isinstance(target, dict) else None
        dis = disease.get("name") if isinstance(disease, dict) else None
        if not (gsym and dis):
            continue
        try:
            score = float(row.get("score") or 0)
        except (TypeError, ValueError):
            score = 0.0
        gid = f"gene_{gsym}"
        did = f"disease_{dis}"
        # Make sure both endpoints exist; _add_node ignores ids already
        # present, so genes added in step 2 are left untouched.
        _add_node(nodes, gid, gsym, C_GENE)
        _add_node(nodes, did, dis, C_CONCEPT, size=20)
        edges.append(Edge(source=gid, target=did, color=C_OT_EDGE, label=f"OT {score:.2f}"))

    # 7. Config --------------------------------------------------------
    cfg = Config(
        directed=False,
        width="100%",
        height="600",
        nodeHighlightBehavior=True,
        highlightColor="#f1c40f",
        collapsible=True,
        showLegend=False,
        node={"labelProperty": "label"},
    )
    return nodes, edges, cfg
|