File size: 7,090 Bytes
16d37fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt

# ─── 1. BENCHMARK DATA ──────────────────────────────────────────────────────────
# Nested dict: Domain β†’ { Model Name β†’ {metric_name: value, …, "SOTA_<metric>": value } }
benchmark_data = {
    "Protein Folding": {
        "Nexa Bio1 (Secondary)": {
            "Accuracy (%)": 71,
            "Q3 (%)": 65,
            "Q8 (%)": 55,
            "TM-score": 0.60,
            "SOTA_Accuracy (%)": 85,
            "SOTA_TM-score": 0.75
        },
        "Nexa Bio2 (Tertiary)": {
            "Confidence (%)": 90,
            "GDT_TS": 0.82,
            "Entropy Threshold (%)": 80,
            "SOTA_Confidence (%)": 92,
            "SOTA_GDT_TS": 0.85
        },
    },
    "Astrophysics": {
        "Nexa Astro": {
            "Accuracy (%)": 97,
            "Macro-F1 (%)": 96,
            "ROC-AUC": 0.98,
            "SOTA_Accuracy (%)": 96,
            "SOTA_ROC-AUC": 0.97
        },
    },
    "Materials Science": {
        "Nexa MatSci": {
            "MAE (eV)": 0.02,
            "RMSE (eV)": 0.03,
            "Bandgap Accuracy (%)": 98,
            "SOTA_MAE (eV)": 0.03,
            "SOTA_Bandgap Accuracy (%)": 95
        },
    },
    "Quantum State Tomography": {
        "Nexa QST": {
            "Fidelity": 0.80,
            "Purity": 1.00,
            "Trace Distance": 0.15,
            "SOTA_Fidelity": 0.83,
            "SOTA_Trace Distance": 0.12
        },
    },
    "Computational Fluid Dynamics": {
        "Nexa CFD": {
            "Relative L2 Error": 0.015,
            "Energy Conservation Loss": 0.005,
            "PSNR": 30,
            "SSIM": 0.88,
            "SOTA_Relative L2 Error": 0.020,
            "SOTA_SSIM": 0.85
        },
    },
    "High-Energy Physics": {
        "Nexa HEP": {
            "ROC-AUC": 0.92,
            "Event Accuracy (%)": 90,
            "Jet Tagging (%)": 88,
            "SOTA_ROC-AUC": 0.93,
            "SOTA_Event Accuracy (%)": 89
        },
    },
    "LLM Hypothesis & Methodology": {
        "Nexa MOE": {
            "Coherence (1–10)": 9.1,
            "Novelty (1–10)": 8.6,
            "Utility (1–10)": 8.8,
            "Expert-Rated SOTA (1–10)": 9.0
        },
    },
}

# ─── 2. SECTION DESCRIPTIONS ───────────────────────────────────────────────────
section_descriptions = {
    "Protein Folding": """**Protein Folding**  
Benchmarks for secondary (Q3/Q8) and tertiary (TM-score) structure prediction.  
Nexa Bio1 handles sequence→secondary, Nexa Bio2 handles full 3D fold confidence.""",
    "Astrophysics": """**Astrophysics**  
Stellar classification and redshift estimation.  
Metrics: Accuracy, F1, ROC-AUC against SDSS-Net and astroML baselines.""",
    "Materials Science": """**Materials Science**  
Property prediction for novel materials (e.g., bandgap, formation energy).  
Metrics: MAE/RMSE, bandgap‐prediction accuracy vs. CGCNN, ALIGNN.""",
    "Quantum State Tomography": """**Quantum State Tomography**  
Reconstruct quantum states from measurement data.  
Metrics: Fidelity, Purity, Trace Distance against PINNs and QuNet.""",
    "Computational Fluid Dynamics": """**CFD**  
Flow field prediction (Navier–Stokes).  
Metrics: Relative L2 Error, PSNR/SSIM, Energy Conservation Loss vs. FNO.""",
    "High-Energy Physics": """**High-Energy Physics**  
Particle classification and signal/background separation.  
Metrics: ROC-AUC, event reconstruction accuracy, jet-tagging efficiency.""",
    "LLM Hypothesis & Methodology": """**LLM-Based Scientific Reasoning**  
Hypothesis and methodology generation.  
Metrics scored 1–10 by expert rubric on Coherence, Novelty, and Utility; compared to top academic LLM baselines."""
}

# ─── 3. PLOTTING FUNCTION ────────────────────────────────────────────────────────
def plot_comparison(category):
    data = benchmark_data[category]
    fig, ax = plt.subplots(figsize=(7, 4))
    bar_width = 0.4
    indices = list(range(len(data)))
    labels = list(data.keys())

    # collect metrics that aren’t SOTA
    for i, model in enumerate(labels):
        metrics = data[model]
        # extract non-SOTA metrics
        non_sota = {k: v for k, v in metrics.items() if not k.startswith("SOTA")}
        sota = {k.replace("SOTA_", ""): v for k, v in metrics.items() if k.startswith("SOTA")}

        # bar positions
        pos = i * 2
        ax.bar([pos + j*bar_width for j in range(len(non_sota))],
               list(non_sota.values()),
               width=bar_width, label=f"{model} Metrics")
        if sota:
            ax.bar([pos + bar_width*len(non_sota) + j*bar_width for j in range(len(sota))],
                   list(sota.values()),
                   width=bar_width, alpha=0.7, label=f"{model} SOTA")

    # formatting
    ax.set_xticks([i * (2) + bar_width*(len(non_sota)/2) for i in indices])
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.set_ylabel("Value / Score")
    ax.set_title(f"{category} β€” Nexa vs. SOTA")
    ax.legend(loc="upper right")
    plt.tight_layout()
    return fig

# ─── 4. CALLBACK TO RENDER SECTION ─────────────────────────────────────────────
def show_eval(category):
    desc = section_descriptions[category]
    df = pd.DataFrame(benchmark_data[category]).T
    fig = plot_comparison(category)
    return desc, df, fig

# ─── 5. BUILD GRADIO APP ───────────────────────────────────────────────────────
with gr.Blocks(css="""
    body { background-color: #f7f9fc; font-family: Arial, sans-serif; }
    .gradio-container { max-width: 900px; margin: auto; }
    h1, h2, h3 { color: #333; }
""") as app:
    gr.Markdown("# πŸ”¬ Nexa Evals Dashboard")
    gr.Markdown("A **comprehensive** SciML benchmark framework. Select a domain to view metrics, compare with SOTA, and explore detailed plots and tables.")

    with gr.Row():
        with gr.Column(scale=1):
            category = gr.Radio(
                choices=list(benchmark_data.keys()),
                value="Protein Folding",
                label="Select Domain / Model Group"
            )
        with gr.Column(scale=3):
            description = gr.Markdown("")
            table = gr.Dataframe(headers=["Metric", "Value"], interactive=False)
            plot = gr.Plot()

    category.change(
        fn=show_eval,
        inputs=category,
        outputs=[description, table, plot]
    )

    # initialize
    description.value, table.value, _ = show_eval("Protein Folding")

# Launch (on Hugging Face the config flags will be auto-managed)
app.launch()