Commit 5ed4bca · Aaron Mueller committed (1 parent: 3b802b7)
update HF url handling

Files changed:
- app.py +67 -43
- src/about.py +40 -12
- src/display/utils.py +2 -3
- src/leaderboard/read_evals.py +27 -11
- src/populate.py +10 -7
- src/submission/check_validity.py +10 -6
- src/submission/submit.py +9 -7
app.py
CHANGED
@@ -17,7 +17,7 @@ from copy import deepcopy
 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
-    …
+    EVALUATION_QUEUE_TEXT_SUBGRAPH, EVALUATION_QUEUE_TEXT_CAUSALVARIABLE,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
@@ -38,7 +38,7 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH, QUEUE_REPO_SUBGRAPH, QUEUE_REPO_CAUSALGRAPH, REPO_ID, TOKEN, RESULTS_REPO_MIB_SUBGRAPH, EVAL_RESULTS_MIB_SUBGRAPH_PATH, RESULTS_REPO_MIB_CAUSALGRAPH, EVAL_RESULTS_MIB_CAUSALGRAPH_PATH
 from src.populate import get_evaluation_queue_df, get_leaderboard_df_mib_subgraph, get_leaderboard_df_mib_causalgraph
 from src.submission.submit import upload_to_queue, remove_submission
-from src.submission.check_validity import verify_circuit_submission, verify_causal_variable_submission, check_rate_limit
+from src.submission.check_validity import verify_circuit_submission, verify_causal_variable_submission, check_rate_limit, parse_huggingface_url
 
 from src.about import TasksMib_Subgraph, TasksMib_Causalgraph
 
@@ -288,7 +288,7 @@ def _sigmoid(x):
 
 LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
 LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
-                                                                  metric_type="…
+                                                                  metric_type="CMD")
 
 # LEADERBOARD_DF_MIB_CAUSALGRAPH = get_leaderboard_df_mib_causalgraph(EVAL_RESULTS_MIB_CAUSALGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_CAUSALGRAPH, BENCHMARK_COLS_MIB_CAUSALGRAPH)
 # In app.py, modify the LEADERBOARD initialization
@@ -300,14 +300,15 @@ LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGAT
 # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 # LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS_MULTIMODAL, BENCHMARK_COLS_MULTIMODAL)
 
-…
-# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-…
+(
+    finished_eval_queue_df_subgraph,
+    pending_eval_queue_df_subgraph,
+) = get_evaluation_queue_df(EVAL_REQUESTS_SUBGRAPH, EVAL_COLS)
+
+# (
+#     finished_eval_queue_df_causalvariable,
+#     pending_eval_queue_df_causalvariable,
+# ) = get_evaluation_queue_df(EVAL_REQUESTS_CAUSALGRAPH, EVAL_COLS)
 
 
 def init_leaderboard_mib_subgraph(dataframe, track):
@@ -577,7 +578,7 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("Circuit Localization", elem_id="subgraph", id=0):
             with gr.Tabs() as subgraph_tabs:
-                with gr.TabItem("…
+                with gr.TabItem("CPR", id=0):
                     # Add description for filters
                     gr.Markdown("""
                     ### Filtering Options
@@ -610,7 +611,7 @@ with demo:
                         outputs=leaderboard
                     )
                     print(f"Leaderboard is {leaderboard}")
-                with gr.TabItem("…
+                with gr.TabItem("CMD", id=1):
                     # Add description for filters
                     gr.Markdown("""
                     ### Filtering Options
@@ -690,9 +691,7 @@ with demo:
                 "Causal Graph"
             )
 
-        with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown("## π Submission Portal")
-…
+        with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
             # Track selection
             track = gr.Radio(
                 choices=[
@@ -704,28 +703,30 @@
            )
 
            with gr.Group(visible=False) as circuit_ui:
-                gr.…
-…
-                "…
-…
+                with gr.Column():
+                    with gr.Row():
+                        gr.Markdown(EVALUATION_QUEUE_TEXT_SUBGRAPH, elem_classes="markdown-text")
+
+                    with gr.Row():
+                        hf_repo_circ = gr.Textbox(
+                            label="HuggingFace Repository URL",
+                            placeholder="https://huggingface.co/username/repo/path",
+                            info="Must be a valid HuggingFace URL pointing to folders containing either 1 importance score file per task/model, or " \
+                                 "9 circuit files per task/model (.json or .pt)."
+                        )
+                        level = gr.Radio(
+                            choices=[
+                                "Edge",
+                                "Node (submodule)",
+                                "Node (neuron)"
+                            ],
+                            label="Level of granularity",
+                            info="Is your circuit defined by its inclusion/exclusion of certain edges (e.g., MLP1 to H10L12), of certain submodules (e.g., MLP1), or of neurons " \
+                                 "within those submodules (e.g., MLP1 neuron 295)?"
+                        )
 
            with gr.Group(visible=False) as causal_ui:
-                gr.Markdown(…
+                gr.Markdown(EVALUATION_QUEUE_TEXT_CAUSALVARIABLE, elem_classes="markdown-text")
                 with gr.Row():
                     layer = gr.Number(
                         label="Layer Number",
@@ -743,9 +744,7 @@
                     hf_repo_cg = gr.Textbox(
                         label="HuggingFace Repository URL",
                         placeholder="https://huggingface.co/username/repo/path",
-                        info="Must be a valid HuggingFace URL pointing to a file containing the trained featurizer (.pt). "
-                             "Remove 'tree', 'resolve', and the branch name (e.g., '/tree/main/') from URL if present."
-                    )
+                        info="Must be a valid HuggingFace URL pointing to a file containing the trained featurizer (.pt). " )
                     code_upload = gr.File(
                         label="Upload Python file implementing your featurization function",
                         file_types=[".py"],
@@ -791,12 +790,12 @@
                 errors.append(f"Invalid HuggingFace URL - must start with https://huggingface.co/")
                 breaking_error = True
             else:
-…
-                if …
+                repo_id, subfolder, revision = parse_huggingface_url(hf_repo)
+                if repo_id is None:
                     errors.append("Could not read username or repo name from HF URL")
                     breaking_error = True
                 else:
-                    user_name, repo_name = …
+                    user_name, repo_name = repo_id.split("/")
                     under_rate_limit, time_left = check_rate_limit(track, user_name, contact_email)
                     if not under_rate_limit:
                         errors.append(f"Rate limit exceeded (max 2 submissions per week). Please try again in {time_left}. " \
@@ -841,8 +840,8 @@
             with warning_modal:
                 gr.Markdown("### ⚠️ Submission Warnings")
                 warning_display = gr.Markdown()
-                proceed_btn = gr.Button("Proceed Anyway", variant="…
-                cancel_btn = gr.Button("Cancel Submission", variant="…
+                proceed_btn = gr.Button("Proceed Anyway", variant="secondary")
+                cancel_btn = gr.Button("Cancel Submission", variant="primary")
 
             # Store submission data between callbacks
             pending_submission = gr.State()
@@ -865,6 +864,31 @@
                 outputs=[status, warning_display, pending_submission, warning_modal]
             )
 
+            with gr.Column():
+                with gr.Accordion(
+                    f"✅ Finished Evaluations ({len(finished_eval_queue_df_subgraph)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        finished_eval_table = gr.components.Dataframe(
+                            value=finished_eval_queue_df_subgraph,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+
+                with gr.Accordion(
+                    f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df_subgraph)})",
+                    open=False,
+                ):
+                    with gr.Row():
+                        pending_eval_table = gr.components.Dataframe(
+                            value=pending_eval_queue_df_subgraph,
+                            headers=EVAL_COLS,
+                            datatype=EVAL_TYPES,
+                            row_count=5,
+                        )
+
             with gr.Group():
                 gr.Markdown("### Remove Submission from Queue")
                 with gr.Row():
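Note on the two subgraph DataFrames built above: as the src/populate.py hunk later in this diff shows, CPR tables are sorted descending and CMD tables ascending, which is why the leaderboard is materialized once per metric. A toy illustration of that difference (the scores are made up; only the sort direction comes from this commit):

import pandas as pd

df = pd.DataFrame({"Method": ["A", "B"], "Average": [61.2, 48.7]})  # made-up scores
cpr_view = df.sort_values("Average", ascending=False)  # metric_type="CPR": sort descending
cmd_view = df.sort_values("Average", ascending=True)   # metric_type="CMD": sort ascending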
src/about.py
CHANGED
@@ -105,7 +105,7 @@ TITLE = """<h1 align="center" id="space-title"> Mechanistic Interpretability Ben
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-The leaderboards for each track of the …
+The leaderboards for each track of the Mechanistic Interpretability Benchmark.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
@@ -113,24 +113,52 @@ LLM_BENCHMARKS_TEXT = f"""
 This leaderboard displays scores on the private test set for the Mechanistic Interpretability Benchmark. Each track has its own tab.
 """
 
-…
-## Circuit localization track
+EVALUATION_QUEUE_TEXT_SUBGRAPH = """
+## Circuit localization track
 
+### 1. Collect your circuits
 You'll need either (i) 1 circuit per task/model combinaton with floating-point importance scores for each edge or node, or (ii) 9 circuits per model/task with binary membership scores for each edge or node.
-…
+For specifications about the file formats we accept, see the README on [our project GitHub](https://github.com/hannamw/MIB-subgraph-track).
+
+### 2. Upload your circuits
+Create a HuggingFace repository, and create a folder in that repository that will hold all of your circuit folders.
+At the URL you provide, there should be one folder per task/model combination; these folders
+should contain your circuit(s). As long as the folder names contain the model and task names, you do not need to worry about the circuit filenames.
-If you provide more circuits than needed, our evaluation script will take the first 9 lexicographically.
+If you provide more circuits than needed, our evaluation script will take the first 9 lexicographically in a given folder. We provide examples of valid
+submissions: see [here](https://huggingface.co/mib-bench/mib-circuits-example/tree/main/importances/json) for a submission using importance scores and
+[here](https://huggingface.co/mib-bench/mib-circuits-example/tree/main/multiple_circuits/pt) for a submission uploading multiple circuits.
 
-…
+### 3. Manage your submission in the queue
+If your submission passes all checks, it will be added to the queue. You will receive a submission ID here when you do this; be sure to save it!
+This will allow you to remove your submission from the queue (e.g., if you find a bug in your circuits). This will prevent you from needing to wait until
+next week to resubmit.
 
-…
+Before your submission has been validated by our backend, it will have the "PREVALIDATION" status in the queue. Once it has been validated, it will have the "PENDING" status.
+It will keep the PENDING status until it has been run on the private test set.
+"""
 
-…
+EVALUATION_QUEUE_TEXT_CAUSALVARIABLE = """
+## Causal variable localization track
 
-…
-…
+### 1. Collect your materials
+You'll need the following:
+* A trained featurizer saved as a .pt object.
+* A python function that can load and run forward passes with your featurizer.
+* A dynamic token alignment function.
+* A hypothesized feature location.
+
+### 2. Upload your materials
+Create a HuggingFace repository, and create a folder in that repository that will hold all of your materials.
+At the URL you provide, each of the above materials should be present. We will take the first python script lexicographically
+as your featurizer function, and the first .pt file lexicographically as your featurizer.
+
+### 3. Manage your submission in the queue
+If your submission passes all checks, it will be added to the queue. You will receive a submission ID here when you do this; be sure to save it!
+This will allow you to remove your submission from the queue (e.g., if you find a bug in your circuits). This will prevent you from needing to wait until
+next week to resubmit.
+
+Before your submission has been validated by our backend, it will have the "PREVALIDATION" status in the queue. Once it has been validated, it will have the "PENDING" status.
+It will keep the PENDING status until it has been run on the private test set.
 """
 
 CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the MIB paper, as well as the author(s) of the method(s) whose results you cite!"
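The upload steps described in EVALUATION_QUEUE_TEXT_SUBGRAPH can be carried out with the huggingface_hub client. A minimal sketch, assuming a logged-in user; the repo name and local folder layout here are illustrative, not prescribed by the text above:

from huggingface_hub import HfApi

api = HfApi()  # assumes `huggingface-cli login` has been run

repo_id = "your-username/my-mib-circuits"  # illustrative name
api.create_repo(repo_id, repo_type="model", exist_ok=True)

# Local directory with one subfolder per task/model combination,
# each holding that combination's circuit file(s).
api.upload_folder(
    repo_id=repo_id,
    repo_type="model",
    folder_path="circuits/",
    path_in_repo="circuits",
)
# The URL to submit would then be:
# https://huggingface.co/your-username/my-mib-circuits/tree/main/circuits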
src/display/utils.py
CHANGED
@@ -192,10 +192,9 @@ AutoEvalColumn_mib_causalgraph = make_dataclass(
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
-    …
-    …
+    method_name = ColumnContent("method_name", "str", True)
+    repo_id = ColumnContent("hf_repo", "markdown", True)
     revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
     status = ColumnContent("status", "str", True)
 
 ## All the model information that we might need
src/leaderboard/read_evals.py
CHANGED
@@ -18,11 +18,11 @@ import pandas as pd
 
 
 
-def compute_area(edge_counts, faithfulnesses…
+def compute_area(edge_counts, faithfulnesses):
     # Return None if either list is empty
     if not edge_counts or not faithfulnesses:
         return None, None, None
-    …
+
     percentages = [e / max(edge_counts) for e in edge_counts]
     area_under = 0.
     area_from_100 = 0.
@@ -72,13 +72,23 @@ class EvalResult_MIB_SUBGRAPH:
         # Keep exact scores structure from JSON
         scores = model_result.get("scores", {})
 
-        # for task in ["ioi", "mcqa", "arithmetic_addition", "arithmetic_subtraction", "arc_easy", "arc_challenge"]:
         for task in TasksMib_Subgraph.get_all_tasks():
             if task in scores:
-…
+                if "CPR" in scores[task]:
+                    results[task][model_name] = {"CPR": {}, "CMD": {}}
+                    results[task][model_name]["CPR"] = {
+                        "edge_counts": scores[task]["CPR"]["edge_counts"],
+                        "faithfulness": scores[task]["CPR"]["faithfulness"]
+                    }
+                    results[task][model_name]["CMD"] = {
+                        "edge_counts": scores[task]["CMD"]["edge_counts"],
+                        "faithfulness": scores[task]["CMD"]["faithfulness"]
+                    }
+                else:
+                    results[task][model_name] = {
+                        "edge_counts": scores[task]["edge_counts"],
+                        "faithfulness": scores[task]["faithfulness"]
+                    }
 
         return EvalResult_MIB_SUBGRAPH(
             eval_name=method_name,
@@ -100,7 +110,7 @@ class EvalResult_MIB_SUBGRAPH:
             df_transformed.loc[i] = row.apply(lambda x: self._sigmoid(x) if isinstance(x, (float, int)) else x)
         return df_transformed
 
-    def to_dict(self, metric_type="…
+    def to_dict(self, metric_type="CPR"):
         """Converts the Eval Result to a dict for dataframe display"""
         data_dict = {
             "eval_name": self.eval_name,
@@ -122,9 +132,15 @@ class EvalResult_MIB_SUBGRAPH:
             for model, metrics in task_results.items():
                 col_name = f"{task}_{model}"
 
-                if not metrics…
+                if not metrics:
                     continue
-…
+
+                if not metrics[metric_type] and (not metrics["edge_counts"] or not metrics["faithfulness"]):
+                    continue
+
+                if metric_type in metrics:
+                    metrics = metrics[metric_type]
+
                 faithfulness = metrics["faithfulness"]
                 if isinstance(faithfulness[0], list):
                     faithfulness = faithfulness[0]
@@ -134,7 +150,7 @@ class EvalResult_MIB_SUBGRAPH:
                     continue
 
                 area_under, area_from_100, _ = result
-                score = area_under if metric_type == "…
+                score = area_under if metric_type == "CPR" else area_from_100
                 data_dict[col_name] = round(score, 2)
                 all_scores.append(score)
                 transformed_scores.append(self._sigmoid(score))
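compute_area's body is only partly visible in this diff; what is visible is the empty-input guard, the normalization by max(edge_counts), and that its first two return values feed CPR and CMD respectively. A sketch of a trapezoidal version consistent with those pieces (the integration scheme and the exact meaning of area_from_100 are assumptions, not taken from the repo):

def compute_area(edge_counts, faithfulnesses):
    """Sketch: integrate faithfulness against the fraction of edges kept."""
    if not edge_counts or not faithfulnesses:
        return None, None, None
    percentages = [e / max(edge_counts) for e in edge_counts]
    area_under = 0.0     # used when metric_type == "CPR"
    area_from_100 = 0.0  # used when metric_type == "CMD"
    for i in range(1, len(percentages)):
        width = percentages[i] - percentages[i - 1]
        mid = (faithfulnesses[i] + faithfulnesses[i - 1]) / 2
        area_under += width * mid              # area under the faithfulness curve
        area_from_100 += width * abs(1 - mid)  # area between the curve and faithfulness = 1
    return area_under, area_from_100, percentages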
src/populate.py
CHANGED
@@ -7,9 +7,10 @@ from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, AutoEvalColumnMultimodal, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results, get_raw_eval_results_mib_subgraph, get_raw_eval_results_mib_causalgraph
 from src.about import TasksMib_Causalgraph
+from src.submission.check_validity import parse_huggingface_url
 
 def get_leaderboard_df_mib_subgraph(results_path: str, cols: list, benchmark_cols: list,
-                                    metric_type = "…
+                                    metric_type = "CPR") -> pd.DataFrame:
     """Creates a dataframe from all the MIB experiment results"""
     # print(f"results_path is {results_path}, requests_path is {requests_path}")
     raw_data = get_raw_eval_results_mib_subgraph(results_path)
@@ -19,7 +20,7 @@ def get_leaderboard_df_mib_subgraph(results_path: str, cols: list, benchmark_col
 
     # Convert to dataframe
     df = pd.DataFrame.from_records(all_data_json)
-    ascending = False if metric_type == "…
+    ascending = False if metric_type == "CPR" else True
 
     # Sort by Average score descending
     if 'Average' in df.columns:
@@ -170,10 +171,12 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
         # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
         # all_evals.append(data)
 
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "…
-…
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "PREVALIDATION"]]
+    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL" or e["status"] == "FAILED"]
+    for list in (pending_list, finished_list):
+        for item in list:
+            item["track"] = "Circuit Localization"
+            item["hf_repo"] = parse_huggingface_url(item["hf_repo"])[0]
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], …
+    return df_finished[cols], df_pending[cols]
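The new status filter above matches the queue lifecycle described in src/about.py (PREVALIDATION, then PENDING, then FINISHED or FAILED), and the hf_repo column is collapsed to a bare repo_id for display. A small illustration; every field except status and hf_repo is illustrative:

entry = {
    "status": "PREVALIDATION",  # set when a submission first enters the queue
    "hf_repo": "https://huggingface.co/mib-bench/mib-circuits-example/tree/main/importances/json",
    "method_name": "my-method",  # illustrative
}
is_pending = entry["status"] in ["PENDING", "PREVALIDATION"]
is_finished = (entry["status"].startswith("FINISHED")
               or entry["status"] in ("PENDING_NEW_EVAL", "FAILED"))
# parse_huggingface_url(entry["hf_repo"])[0] -> "mib-bench/mib-circuits-example"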
src/submission/check_validity.py
CHANGED
@@ -238,23 +238,26 @@ def parse_huggingface_url(url: str):
 
     parsed = urlparse(url)
     path_parts = parsed.path.strip("/").split("/")
+    revision = "main"
 
     # Extract repo_id (username/repo_name)
     if len(path_parts) < 2:
-        …
-        …
+        return None, None, None  # Can't extract repo_id
+    else:
+        repo_id = f"{path_parts[0]}/{path_parts[1]}"
 
     # Extract folder path (if in /tree/ or /blob/)
     if "tree" in path_parts or "blob" in path_parts:
         try:
             branch_idx = path_parts.index("tree") if "tree" in path_parts else path_parts.index("blob")
             folder_path = "/".join(path_parts[branch_idx + 2:])  # Skip "tree/main" or "blob/main"
+            revision = path_parts[branch_idx + 1]
         except (ValueError, IndexError):
             folder_path = None
     else:
         folder_path = None
 
-    return repo_id, folder_path
+    return repo_id, folder_path, revision
 
 
 def validate_directory(fs: HfFileSystem, repo_id: str, dirname: str, curr_tm: str, circuit_level:Literal['edge', 'node','neuron']='edge'):
@@ -318,10 +321,11 @@ def verify_circuit_submission(hf_repo, level, progress=gr.Progress()):
     path = hf_repo
     level = level
 
-    folder_path = …
-…
+    repo_id, folder_path, revision = parse_huggingface_url(hf_repo)
+
+    folder_path = repo_id + "/" + folder_path
     try:
-        files = fs.listdir(folder_path)
+        files = fs.listdir(folder_path, revision=revision)
     except Exception as e:
         errors.append(f"Could not open Huggingface URL: {e}")
         return errors, warnings
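For reference, the URL handling this commit centralizes in parse_huggingface_url can be sketched as a standalone function. This is an approximation of the hunk above; the fallback behavior for URLs without /tree/ or /blob/ is inferred from the visible lines rather than copied from the repo:

from urllib.parse import urlparse

def parse_huggingface_url(url: str):
    """Return (repo_id, folder_path, revision) for a huggingface.co URL; sketch of the logic above."""
    parts = urlparse(url).path.strip("/").split("/")
    revision = "main"
    if len(parts) < 2:
        return None, None, None  # cannot extract username/repo_name
    repo_id = f"{parts[0]}/{parts[1]}"
    folder_path = None
    if "tree" in parts or "blob" in parts:
        idx = parts.index("tree") if "tree" in parts else parts.index("blob")
        try:
            revision = parts[idx + 1]                        # branch or commit after /tree/ or /blob/
            folder_path = "/".join(parts[idx + 2:]) or None  # remainder is the subfolder
        except IndexError:
            pass
    return repo_id, folder_path, revision

# https://huggingface.co/mib-bench/mib-circuits-example/tree/main/importances/json
#   -> ("mib-bench/mib-circuits-example", "importances/json", "main")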
src/submission/submit.py
CHANGED
@@ -20,15 +20,17 @@ USERS_TO_SUBMISSION_DATES = None
 def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email, _id):
     errors = []
     hf_repo = hf_repo_circ if "Circuit" in track else hf_repo_cg
+    repo_id, folder_path, revision = parse_huggingface_url(hf_repo)
+
     try:
-        …
-        user_name, repo_name = repo_info.split("/")[:2]
+        user_name, repo_name = repo_id.split("/")
     except Exception as e:
         errors.append("Error processing HF URL: could not get username and repo name")
-…
+    if revision is None or revision == "main":
+        try:
+            commit_hash = API.list_repo_commits(repo_id)[0].commit_id
+        except Exception as e:
+            errors.append("Could not get commit hash of provided Huggingface repo")
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
     if not errors:
@@ -84,7 +86,7 @@ def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_positio
     if errors:
         status = gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True)
     else:
-        status = gr.Textbox(f"✅ Submission received! Your ID is \"{_id}\". …
+        status = gr.Textbox(f"✅ Submission received! Your submission ID is \"{_id}\". Save this so that you can manage your submission on the queue.", visible=True)
     return [
         status,
         None, None,