#!/usr/bin/env python3
"""MedGenesis - knowledge-graph builder for Streamlit-Agraph.

This version recognises the enrichment layers produced by the orchestrator:

• UMLS concepts               → green nodes
• MyGene / NCBI gene hits     → purple nodes
• openFDA / DrugCentral drugs → orange nodes
• ClinicalTrials.gov studies  → pink nodes
• Open Targets associations   → red gene–disease edges
• Literature papers           → blue nodes (tooltip = title)

The entry-point `build_agraph` receives this payload and returns
*(nodes, edges, config)* ready for `streamlit_agraph.agraph`.
"""
from __future__ import annotations

import re
from typing import List, Dict, Tuple

from streamlit_agraph import Node, Edge, Config

# ---------------------------------------------------------------------
# Colour palette (flat-UI)
# ---------------------------------------------------------------------
C_PAPER = "#0984e3"
C_CONCEPT = "#00b894"
C_GENE = "#6c5ce7"
C_DRUG = "#d35400"
C_TRIAL = "#fd79a8"
C_OT_EDGE = "#c0392b"


# ---------------------------------------------------------------------
# Helper builders
# ---------------------------------------------------------------------
def _add_node(
    nodes: List[Node],
    node_id: str,
    label: str,
    color: str,
    tooltip: str | None = None,
    size: int = 25,
    seen: set[str] | None = None,
) -> None:
    """Append a Node to *nodes* unless *node_id* is already present.

    Parameters
    ----------
    nodes   : list that receives the new Node (mutated in place).
    node_id : unique graph id of the node.
    label   : text rendered for the node.
    color   : fill colour.
    tooltip : optional hover text, forwarded to ``Node`` unchanged.
    size    : node size, forwarded to ``Node`` unchanged.
    seen    : optional set of ids already in *nodes*. When supplied it is
              used (and updated) for an O(1) duplicate check; when omitted
              the list itself is scanned, as before.
    """
    if seen is None:
        if any(n.id == node_id for n in nodes):
            return
    elif node_id in seen:
        return
    else:
        seen.add(node_id)
    nodes.append(Node(id=node_id, label=label, color=color, size=size, tooltip=tooltip))


def _add_edge(
    edges: List[Edge],
    seen: set[Tuple[str, str]],
    source: str,
    target: str,
    label: str,
    color: str | None = None,
) -> None:
    """Append an Edge unless one between *source* and *target* already exists."""
    key = (source, target)
    if key in seen:
        return
    seen.add(key)
    if color is None:
        # Leave the colour to the Edge default, as the mention edges always did.
        edges.append(Edge(source=source, target=target, label=label))
    else:
        edges.append(Edge(source=source, target=target, color=color, label=label))


def _match(text: str, pattern: str, whole_word: bool = False) -> bool:
    """Return True when *pattern* occurs literally in *text* (case-insensitive).

    With ``whole_word=True`` the hit must not be embedded in a longer word,
    so a short token such as "AR" no longer matches inside "heart".
    Empty or non-string patterns never match.
    """
    if not isinstance(pattern, str) or not pattern:
        return False
    needle = re.escape(pattern)
    if whole_word:
        needle = rf"(?<!\w){needle}(?!\w)"
    return re.search(needle, text, flags=re.I) is not None


def _as_dict(value: object) -> Dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _drug_name(rec: Dict) -> str | None:
    """Extract a drug name from one drug-safety record, or None if absent.

    Lookup order matches the historical behaviour: ``drug_name``, then
    ``patient.drug``, then ``medicinalproduct``. ``patient.drug`` may be a
    plain string, a dict, or a list of dicts carrying ``medicinalproduct``;
    for a list the first entry is used.
    """
    nested = _as_dict(rec.get("patient")).get("drug")
    if isinstance(nested, list):
        nested = nested[0] if nested else None
    if isinstance(nested, dict):
        nested = nested.get("medicinalproduct")

    for candidate in (rec.get("drug_name"), nested, rec.get("medicinalproduct")):
        if isinstance(candidate, str) and candidate.strip():
            return candidate.strip()
    return None


# ---------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------
def build_agraph(
    papers: List[Dict],
    umls: List[Dict],
    drug_safety: List[Dict],
    genes: List[Dict] | None = None,
    trials: List[Dict] | None = None,
    ot_associations: List[Dict] | None = None,
) -> Tuple[List[Node], List[Edge], Config]:
    """Return (nodes, edges, config) for streamlit_agraph. Safe against duplicates.

    Parameters
    ----------
    papers          : PubMed / arXiv merged list (dicts with title & summary).
    umls            : List of UMLS concept dicts `{cui, name}`.
    drug_safety     : openFDA / DrugCentral outputs (mixed dict / list).
    genes           : Optional list with MyGene/NCBI dicts (symbol, name,...).
    trials          : Optional ClinicalTrials.gov v2 studies list; ids are read
                      from a top-level `nctId`/`nctid` or from
                      `protocolSection.identificationModule`.
    ot_associations : Optional list from Open Targets; each row contributes a
                      gene–disease edge.

    Returns
    -------
    Tuple of the node list, the edge list and a `Config` instance.
    """
    nodes: List[Node] = []
    edges: List[Edge] = []
    node_ids: set[str] = set()
    edge_keys: set[Tuple[str, str]] = set()

    # 1. Concepts ------------------------------------------------------
    # Only concepts that actually get a node are eligible for edges later,
    # so a record without a CUI can never produce a dangling edge.
    concept_terms: List[Tuple[str, str]] = []  # (node_id, concept name)
    for concept in umls or []:
        cui, name = concept.get("cui"), concept.get("name")
        if not (cui and name):
            continue
        cid = f"concept_{cui}"
        _add_node(nodes, cid, name, C_CONCEPT, seen=node_ids)
        concept_terms.append((cid, name))

    # 2. Genes ---------------------------------------------------------
    gene_terms: List[Tuple[str, str]] = []  # (node_id, gene symbol)
    for gene in genes or []:
        symbol = gene.get("symbol")
        label = symbol or gene.get("name")
        if not label:
            # Nothing to identify the gene by; skip rather than emit "gene_None".
            continue
        gid = f"gene_{label}"
        _add_node(nodes, gid, label, C_GENE, tooltip=gene.get("summary", ""), seen=node_ids)
        if symbol:
            # Text matching is done on symbols only.
            gene_terms.append((gid, symbol))

    # 3. Drugs (normalize mixed structures) ----------------------------
    drug_terms: List[Tuple[str, str]] = []  # (node_id, drug name)
    for i, entry in enumerate(drug_safety or []):
        records = entry if isinstance(entry, list) else [entry]
        for j, rec in enumerate(records):
            if not isinstance(rec, dict):
                continue
            did = f"drug_{i}_{j}"
            name = _drug_name(rec)
            if name is None:
                # Keep the record visible under a placeholder label, but do not
                # try to find that placeholder in paper text.
                _add_node(nodes, did, did, C_DRUG, seen=node_ids)
                continue
            _add_node(nodes, did, name, C_DRUG, seen=node_ids)
            drug_terms.append((did, name))

    # 4. Trials --------------------------------------------------------
    for trial in trials or []:
        ident = _as_dict(_as_dict(trial.get("protocolSection")).get("identificationModule"))
        nct = trial.get("nctId") or trial.get("nctid") or ident.get("nctId")
        if not nct:
            continue
        tooltip = trial.get("briefTitle") or ident.get("briefTitle") or "Clinical trial"
        _add_node(nodes, f"trial_{nct}", nct, C_TRIAL, tooltip=tooltip, size=20, seen=node_ids)

    # 5. Papers & mention edges ----------------------------------------
    for idx, paper in enumerate(papers or []):
        pid = f"paper_{idx}"
        _add_node(nodes, pid, f"P{idx+1}", C_PAPER, tooltip=paper.get("title", ""),
                  size=15, seen=node_ids)
        # _match is case-insensitive, so the text needs no lower-casing.
        text_blob = f"{paper.get('title') or ''} {paper.get('summary') or ''}"

        for cid, cname in concept_terms:
            if _match(text_blob, cname):
                _add_edge(edges, edge_keys, pid, cid, "mentions")

        for gid, symbol in gene_terms:
            # Gene symbols are short; require a whole-word hit to avoid
            # linking e.g. "AR" to every paper containing "heart".
            if _match(text_blob, symbol, whole_word=True):
                _add_edge(edges, edge_keys, pid, gid, "mentions")

        for did, dname in drug_terms:
            if _match(text_blob, dname):
                _add_edge(edges, edge_keys, pid, did, "mentions")

    # 6. Open Targets edges (gene–disease) -----------------------------
    for row in ot_associations or []:
        row = _as_dict(row)
        gsym = _as_dict(row.get("target")).get("symbol")
        dis = _as_dict(row.get("disease")).get("name")
        if not (gsym and dis):
            continue
        gid = f"gene_{gsym}"
        did = f"disease_{dis}"
        # The gene may not have been part of *genes*; make sure both edge
        # endpoints exist before linking them.
        _add_node(nodes, gid, gsym, C_GENE, seen=node_ids)
        _add_node(nodes, did, dis, C_CONCEPT, size=20, seen=node_ids)
        try:
            label = f"OT {float(row.get('score', 0)):.2f}"
        except (TypeError, ValueError):
            label = "OT"
        _add_edge(edges, edge_keys, gid, did, label, color=C_OT_EDGE)

    # 7. Config --------------------------------------------------------
    cfg = Config(
        directed=False,
        width="100%",
        height="600",
        nodeHighlightBehavior=True,
        highlightColor="#f1c40f",
        collapsible=True,
        showLegend=False,
        node={"labelProperty": "label"},
    )

    return nodes, edges, cfg