File size: 6,085 Bytes
633ba95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d1def9
f3dd8bc
633ba95
3d1def9
633ba95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d1def9
f3dd8bc
633ba95
 
a392df0
633ba95
a392df0
633ba95
f3dd8bc
633ba95
 
f3dd8bc
633ba95
 
 
 
 
 
 
f3dd8bc
633ba95
 
 
f3dd8bc
 
633ba95
 
 
 
 
 
f3dd8bc
633ba95
 
f3dd8bc
633ba95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3dd8bc
633ba95
a392df0
633ba95
 
 
 
 
 
 
 
 
f3dd8bc
 
633ba95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3dd8bc
633ba95
 
a392df0
633ba95
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env python3
"""MedGenesis – knowledge‑graph builder for Streamlit‑Agraph.

This version recognises **all new enrichment layers** introduced in the
latest orchestrator:
    • UMLS concepts                → green nodes
    • MyGene / NCBI gene hits      → purple nodes
    • openFDA / DrugCentral drugs  → orange nodes
    • ClinicalTrials.gov studies   → pink nodes
    • Open Targets associations    → red gene–disease edges
    • Literature papers            → blue nodes (tooltip = title)

The entry‑point `build_agraph` now receives a richer payload and returns
*(nodes, edges, config)* ready for `streamlit_agraph.agraph`.
"""
from __future__ import annotations

import re
from typing import List, Dict, Tuple

from streamlit_agraph import Node, Edge, Config

# ---------------------------------------------------------------------
# Colour palette (flat‑UI)
# ---------------------------------------------------------------------
# One hex colour per graph element kind (roles follow the module docstring
# and the way build_agraph uses each constant).
C_PAPER = "#0984e3"  # blue: literature paper nodes
C_CONCEPT = "#00b894"  # green: UMLS concept nodes (also Open Targets disease nodes)
C_GENE = "#6c5ce7"  # purple: gene nodes
C_DRUG = "#d35400"  # orange: drug nodes
C_TRIAL = "#fd79a8"  # pink: clinical trial nodes
C_OT_EDGE = "#c0392b"  # red: Open Targets association edges


# ---------------------------------------------------------------------
# Helper builders
# ---------------------------------------------------------------------

def _add_node(nodes: List[Node], node_id: str, label: str, color: str, tooltip: str | None = None, size: int = 25):
    """Register a node under *node_id* unless that id is already in *nodes*.

    The list is mutated in place and nothing is returned. A repeated id is
    ignored, so the first node created for an id wins (the original note
    states that duplicate ids crash agraph).
    """
    for existing in nodes:
        if existing.id == node_id:
            # Already registered: leave the list untouched.
            return
    fresh = Node(id=node_id, label=label, color=color, size=size, tooltip=tooltip)
    nodes.append(fresh)


def _match(text: str, pattern: str) -> bool:
    """Return True if *pattern* occurs in *text*, ignoring case.

    The pattern is escaped before searching, so it is matched literally:
    regex metacharacters such as ``(`` or ``.`` carry no special meaning.
    Matching is a plain substring search; no word boundaries are enforced.

    Empty or non-string arguments never match. Without this guard an empty
    pattern would match every text, and a non-string value would raise
    ``TypeError`` inside ``re.escape`` / ``re.search``.
    """
    if not isinstance(text, str) or not isinstance(pattern, str) or not pattern:
        return False
    return re.search(re.escape(pattern), text, flags=re.IGNORECASE) is not None


# ---------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------

def _drug_display_name(rec: Dict, fallback: str) -> str:
    """Return the first usable drug name found in *rec*, else *fallback*.

    Candidates are tried in the same order as before: ``drug_name``,
    ``patient.drug``, ``medicinalproduct``. Only non-blank strings are
    accepted; any other value (``None``, a nested list, ...) is skipped so
    the result can always be used as a node label and as a search term.
    """
    patient = rec.get("patient")
    candidates = (
        rec.get("drug_name"),
        patient.get("drug") if isinstance(patient, dict) else None,
        rec.get("medicinalproduct"),
    )
    for cand in candidates:
        if isinstance(cand, str) and cand.strip():
            return cand.strip()
    return fallback


def build_agraph(
    papers: List[Dict],
    umls: List[Dict],
    drug_safety: List[Dict],
    genes: List[Dict] | None = None,
    trials: List[Dict] | None = None,
    ot_associations: List[Dict] | None = None,
) -> Tuple[List[Node], List[Edge], Config]:
    """Return (nodes, edges, config) for streamlit_agraph. Duplicate-safe.

    Nodes are de-duplicated by id and edges by (source, target, label).
    Every edge produced here points at a node that was actually created.
    Any list argument may be ``None`` or empty.

    Parameters
    ----------
    papers          : PubMed / arXiv merged list (dicts with title & summary).
    umls            : List of UMLS concept dicts `{cui, name}`.
    drug_safety     : openFDA / DrugCentral outputs (mixed dict / list).
    genes           : Optional list with MyGene/NCBI dicts (symbol, name,...).
    trials          : Optional ClinicalTrials.gov v2 studies list.
    ot_associations : Optional list from Open Targets.

    Returns
    -------
    (nodes, edges, config) : the node list, the edge list and the `Config`
        object built at the end of this function.
    """
    papers = papers or []
    umls = umls or []
    drug_safety = drug_safety or []
    genes = genes or []
    trials = trials or []
    ot_associations = ot_associations or []

    nodes: List[Node] = []
    edges: List[Edge] = []
    seen_edges = set()  # (source, target, label) triples already emitted

    def _link(source: str, target: str, **attrs) -> None:
        """Append an edge once per (source, target, label) triple."""
        key = (source, target, attrs.get("label"))
        if key in seen_edges:
            return
        seen_edges.add(key)
        edges.append(Edge(source=source, target=target, **attrs))

    # 1) Concepts -----------------------------------------------------
    # Remember (node_id, search term) only for concepts that really became
    # nodes, so that mention edges can never reference a missing concept.
    concept_targets: List[Tuple[str, str]] = []
    for c in umls:
        cui, name = c.get("cui"), c.get("name", "")
        if cui and name:
            cid = f"concept_{cui}"
            _add_node(nodes, cid, name, C_CONCEPT)
            concept_targets.append((cid, name))

    # 2) Genes --------------------------------------------------------
    gene_targets: List[Tuple[str, str]] = []
    for g in genes:
        sym = g.get("symbol") or g.get("name")
        if not sym:
            # No usable identifier: skip instead of creating "gene_None".
            continue
        gid = f"gene_{sym}"
        _add_node(nodes, gid, sym, C_GENE, tooltip=g.get("summary", ""))
        # The same term keys the node and the mention search, so genes that
        # only carry a name are linked consistently.
        gene_targets.append((gid, sym))

    # 3) Drugs (normalize mixed structures) ---------------------------
    drug_targets: List[Tuple[str, str]] = []
    for i, dr in enumerate(drug_safety):
        recs = dr if isinstance(dr, list) else [dr]
        for j, rec in enumerate(recs):
            if not isinstance(rec, dict):
                # Unusable entry (None, str, ...): nothing to read a name from.
                continue
            did = f"drug_{i}_{j}"
            name = _drug_display_name(rec, did)
            _add_node(nodes, did, name, C_DRUG)
            drug_targets.append((did, name))

    # 4) Trials -------------------------------------------------------
    for t in trials:
        nct = t.get("nctId") or t.get("nctid")
        if not nct:
            continue
        tooltip = t.get("briefTitle") or "Clinical trial"
        _add_node(nodes, f"trial_{nct}", nct, C_TRIAL, tooltip=tooltip, size=20)

    # 5) Papers & mention edges ---------------------------------------
    # Concept, gene and drug links are emitted in that order for each paper.
    mention_targets = concept_targets + gene_targets + drug_targets
    for idx, p in enumerate(papers):
        pid = f"paper_{idx}"
        _add_node(nodes, pid, f"P{idx+1}", C_PAPER, tooltip=p.get("title", ""), size=15)

        # `or ''` keeps a None title/summary from becoming the text "None".
        text_blob = f"{p.get('title') or ''} {p.get('summary') or ''}".lower()

        for target_id, term in mention_targets:
            if _match(text_blob, term):
                _link(pid, target_id, label="mentions")

    # 6) Open Targets edges (gene–disease) ----------------------------
    for row in ot_associations:
        target = row.get("target")
        disease = row.get("disease")
        gsym = target.get("symbol") if isinstance(target, dict) else None
        dis = disease.get("name") if isinstance(disease, dict) else None
        if not (gsym and dis):
            continue
        try:
            score = float(row.get("score") or 0)
        except (TypeError, ValueError):
            score = 0.0
        gid = f"gene_{gsym}"
        did = f"disease_{dis}"
        # Make sure both endpoints exist; the gene may be absent from `genes`.
        _add_node(nodes, gid, gsym, C_GENE)
        _add_node(nodes, did, dis, C_CONCEPT, size=20)
        _link(gid, did, color=C_OT_EDGE, label=f"OT {score:.2f}")

    # 7) Config -------------------------------------------------------
    cfg = Config(
        directed=False,
        width="100%",
        height="600",
        nodeHighlightBehavior=True,
        highlightColor="#f1c40f",
        collapsible=True,
        showLegend=False,
        node={"labelProperty": "label"},
    )

    return nodes, edges, cfg