Spaces: Running
Aaron Mueller committed
Commit · e1faa87
1 Parent(s): f59c752
updates for causal variable track
Browse files:
- app.py +28 -24
- src/about.py +8 -7
- src/display/utils.py +1 -0
- src/leaderboard/read_evals.py +28 -16
- src/populate.py +4 -3
- src/submission/check_validity.py +342 -11
- src/submission/submit.py +2 -6
app.py
CHANGED
@@ -303,13 +303,15 @@ LEADERBOARD_DF_MIB_CAUSALGRAPH_DETAILED, LEADERBOARD_DF_MIB_CAUSALGRAPH_AGGREGAT
 (
     finished_eval_queue_df_subgraph,
     pending_eval_queue_df_subgraph,
-) = get_evaluation_queue_df(EVAL_REQUESTS_SUBGRAPH, EVAL_COLS)
-
-
-
-
+) = get_evaluation_queue_df(EVAL_REQUESTS_SUBGRAPH, EVAL_COLS, "Circuit")
 
+(
+    finished_eval_queue_df_causalvariable,
+    pending_eval_queue_df_causalvariable,
+) = get_evaluation_queue_df(EVAL_REQUESTS_CAUSALGRAPH, EVAL_COLS, "Causal Variable")
 
+finished_eval_queue = pd.concat((finished_eval_queue_df_subgraph, finished_eval_queue_df_causalvariable))
+pending_eval_queue = pd.concat((pending_eval_queue_df_subgraph, pending_eval_queue_df_causalvariable))
 
 def init_leaderboard_mib_subgraph(dataframe, track):
     """Initialize the subgraph leaderboard with display names for better readability."""
@@ -800,6 +802,7 @@ with demo:
     # with gr.Group(visible=False) as causal_ui:
     with gr.Column(visible=False, elem_id="bordered-column") as causal_ui:
         gr.Markdown(EVALUATION_QUEUE_TEXT_CAUSALVARIABLE, elem_classes="markdown-text")
+        """
         with gr.Row():
             layer = gr.Number(
                 label="Layer Number",
@@ -813,15 +816,16 @@
                 minimum=0,
                 info="Integer specifying token position"
             )
+        """
        with gr.Row():
            hf_repo_cg = gr.Textbox(
                label="HuggingFace Repository URL",
                placeholder="https://huggingface.co/username/repo/path",
                info="Must be a valid HuggingFace URL pointing to a file containing the trained featurizer (.pt). " )
-        code_upload = gr.File(
-            label="Upload Python file implementing your featurization function",
-            file_types=[".py"],
-        )
+        # code_upload = gr.File(
+        #     label="Upload Python file implementing your featurization function",
+        #     file_types=[".py"],
+        # )
 
        # Common fields
        with gr.Group():
@@ -843,7 +847,7 @@
    # Submission handling
    status = gr.Textbox(label="Submission Status", visible=False)
 
-    def handle_submission(track, hf_repo_circ, hf_repo_cg, level,
+    def handle_submission(track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email):
        errors = []
        warnings = []
 
@@ -856,7 +860,7 @@
            errors.append("Method name is required")
        if "@" not in contact_email or "." not in contact_email:
            errors.append("Valid email address is required")
-        if not level:
+        if "Circuit" in track and not level:
            errors.append("Level of granularity is required")
 
        if not hf_repo.startswith("https://huggingface.co/") and not hf_repo.startswith("http://huggingface.co/"):
@@ -880,12 +884,12 @@
            submission_errors, submission_warnings = verify_circuit_submission(hf_repo, level)
 
        elif not breaking_error:
-            if not (isinstance(layer, int) and isinstance(token_position, int)):
-                errors.append("Layer and token position must be integers")
-            if not code_upload:
-                errors.append("Code file upload is required")
+            # if not (isinstance(layer, int) and isinstance(token_position, int)):
+            #     errors.append("Layer and token position must be integers")
+            # if not code_upload:
+            #     errors.append("Code file upload is required")
 
-            submission_errors, submission_warnings = verify_causal_variable_submission(hf_repo
+            submission_errors, submission_warnings = verify_causal_variable_submission(hf_repo)
 
        if not breaking_error:
            errors.extend(submission_errors)
@@ -901,12 +905,12 @@
        elif warnings:
            return [
                gr.Textbox("Warnings:", visible=True),
-                gr.Markdown("\n".join(f"• {w}" for w in warnings)),
-                (track, hf_repo_circ, hf_repo_cg, level,
+                gr.Markdown("\n\n".join(f"• {w}" for w in warnings)),
+                (track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email, _id),
                gr.Column(visible=True)
            ]
        else:
-            return upload_to_queue(track, hf_repo_circ, hf_repo_cg, level,
+            return upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email, _id)
 
    # New warning confirmation dialog
    warning_modal = gr.Column(visible=False, variant="panel")
@@ -922,7 +926,7 @@
        submit_btn = gr.Button("Submit Entry", variant="primary")
        submit_btn.click(
            handle_submission,
-            inputs=[track, hf_repo_circ, hf_repo_cg, level,
+            inputs=[track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email],
            outputs=[status, warning_display, pending_submission, warning_modal]
        )
@@ -939,24 +943,24 @@
 
    with gr.Column():
        with gr.Accordion(
-            f"✅ Finished Evaluations ({len(
+            f"✅ Finished Evaluations ({len(finished_eval_queue)})",
            open=False,
        ):
            with gr.Row():
                finished_eval_table = gr.components.Dataframe(
-                    value=
+                    value=finished_eval_queue,
                    headers=EVAL_COLS,
                    datatype=EVAL_TYPES,
                    row_count=5,
                )
 
        with gr.Accordion(
-            f"⏳ Pending Evaluation Queue ({len(
+            f"⏳ Pending Evaluation Queue ({len(pending_eval_queue)})",
            open=False,
        ):
            with gr.Row():
                pending_eval_table = gr.components.Dataframe(
-                    value=
+                    value=pending_eval_queue,
                    headers=EVAL_COLS,
                    datatype=EVAL_TYPES,
                    row_count=5,
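The reworked handler validates synchronously and returns one of three outcomes: errors fill the status textbox, warnings open the confirmation modal with the pending submission stashed for later, and a clean submission goes straight to upload_to_queue. A minimal sketch of that contract, with plain tuples standing in for the gr.Textbox/gr.Markdown/gr.Column updates the real handler returns:

def handle_submission_sketch(track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email):
    errors, warnings = [], []
    hf_repo = hf_repo_circ if "Circuit" in track else hf_repo_cg
    if "@" not in contact_email or "." not in contact_email:
        errors.append("Valid email address is required")
    if "Circuit" in track and not level:  # level now only applies to the circuit track
        errors.append("Level of granularity is required")
    if errors:
        return ("error", "\n\n".join(f"❌ {e}" for e in errors))
    if warnings:
        return ("confirm", warnings)  # real app: show warning_modal, stash pending_submission
    return ("queued", hf_repo)        # real app: upload_to_queue(track, ..., method_name, contact_email, _id)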
src/about.py
CHANGED
@@ -145,19 +145,20 @@ EVALUATION_QUEUE_TEXT_CAUSALVARIABLE = """
 
 ### 1. Collect your materials
 You'll need the following:
-*
-* A python
-*
-* A hypothesized feature location.
+* Trained featurizer, inverse featurizer, and indices objects.
+* A python file containing the implementation of your featurizer and inverse featurizer.
+* (Optional) Dynamic token alignment functions, provided in another python file.
 
 ### 2. Upload your materials
 Create a HuggingFace repository, and create a folder in that repository that will hold all of your materials.
-At the URL you provide, each of the above materials should be present.
-
+At the URL you provide (we'll call this the "root"), each of the above materials should be present. At the linked folder,
+we will take the first python script lexicographically at the root as the featurizer script. Within that folder, we expect
+one subfolder per model/task/causal variable triplet. Each subfolder should contain the trained featurizer, inverse featurizer,
+and indices.
 
 ### 3. Manage your submission in the queue
 If your submission passes all checks, it will be added to the queue. You will receive a submission ID here when you do this; be sure to save it!
-This will allow you to remove your submission from the queue (e.g., if you find a bug
+This will allow you to remove your submission from the queue (e.g., if you find a bug). This will prevent you from needing to wait until
 next week to resubmit.
 
 Before your submission has been validated by our backend, it will have the "PREVALIDATION" status in the queue. Once it has been validated, it will have the "PENDING" status.
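Concretely, the validator in src/submission/check_validity.py expects something like the following layout at the submitted URL; every name here is illustrative, not prescribed:

# root/                                        <- the URL you submit points here
# ├── featurizer.py                            <- first .py lexicographically: featurizer + inverse featurizer classes
# ├── token_positions.py                       <- optional dynamic token alignment functions
# └── ioi_task_GPT2LMHeadModel_output_token/   <- one subfolder per task/model/causal-variable triplet
#     ├── layer:7_token:last_residual_featurizer
#     ├── layer:7_token:last_residual_inverse_featurizer
#     └── layer:7_token:last_residual_indices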
src/display/utils.py
CHANGED
@@ -192,6 +192,7 @@ AutoEvalColumn_mib_causalgraph = make_dataclass(
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
+    track_name = ColumnContent("track", "str", True)
     method_name = ColumnContent("method_name", "str", True)
     repo_id = ColumnContent("hf_repo", "markdown", True)
     revision = ColumnContent("revision", "str", True)
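For context, ColumnContent is assumed here to be the leaderboard template's small (name, type, displayed-by-default) record; the added track_name field means each queue row now leads with the track it came from. A toy stand-in (not the real class in this file):

from dataclasses import dataclass

@dataclass(frozen=True)
class ColumnContent:  # hypothetical stand-in for the real ColumnContent
    name: str
    type: str
    displayed_by_default: bool = True

track_name = ColumnContent("track", "str", True)
print(track_name.name)  # "track" — the new leading column of the queue tables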
src/leaderboard/read_evals.py
CHANGED
@@ -202,16 +202,6 @@ def get_raw_eval_results_mib_subgraph(results_path: str) -> List[EvalResult_MIB_
     return eval_results
 
 
-
-
-
-
-
-
-
-
-
-
 def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     """
     Aggregates rows with the same base method name by taking the max value for each column.
@@ -462,7 +452,7 @@ def get_raw_eval_results_mib_causalgraph(results_path: str) -> Tuple[pd.DataFram
 
     # Create the detailed DataFrame for highest accuracy
     highest_records = list(highest_method_groups.values())
-    detailed_df_highest = pd.DataFrame(highest_records)
+    detailed_df_highest = pd.DataFrame(highest_records).round(3).fillna("-")
 
     # Process mean accuracy results
     # Group results by method
@@ -482,7 +472,7 @@ def get_raw_eval_results_mib_causalgraph(results_path: str) -> Tuple[pd.DataFram
 
     # Create the detailed DataFrame for mean accuracy
     mean_records = list(mean_method_groups.values())
-    detailed_df_mean = pd.DataFrame(mean_records)
+    detailed_df_mean = pd.DataFrame(mean_records).round(3).fillna("-")
 
     if detailed_df_highest.empty or detailed_df_mean.empty:
         return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
@@ -492,17 +482,39 @@ def get_raw_eval_results_mib_causalgraph(results_path: str) -> Tuple[pd.DataFram
     score_columns_mean = [col for col in detailed_df_mean.columns if col not in ["eval_name", "Method"]]
 
     if score_columns_highest:
-        detailed_df_highest["Average"] = detailed_df_highest[score_columns_highest].mean(axis=1).round(3)
+        # detailed_df_highest["Average"] = detailed_df_highest[score_columns_highest].mean(axis=1).round(3)
+        # Check if any cell in the row contains '-'
+        has_dash = detailed_df_highest[score_columns_highest].eq('-').any(axis=1)
+        detailed_df_highest['Average'] = '-'
+        mask = ~has_dash
+        if mask.any():
+            numeric_values = detailed_df_highest.loc[mask, score_columns_highest].apply(pd.to_numeric)
+            detailed_df_highest.loc[mask, 'Average'] = numeric_values.mean(axis=1).round(3)
 
     if score_columns_mean:
-        detailed_df_mean["Average"] = detailed_df_mean[score_columns_mean].mean(axis=1).round(3)
+        # detailed_df_mean["Average"] = detailed_df_mean[score_columns_mean].mean(axis=1).round(3)
+        has_dash = detailed_df_mean[score_columns_mean].eq('-').any(axis=1)
+        detailed_df_mean['Average'] = '-'
+        mask = ~has_dash
+        if mask.any():
+            numeric_values = detailed_df_mean.loc[mask, score_columns_mean].apply(pd.to_numeric)
+            detailed_df_mean.loc[mask, 'Average'] = numeric_values.mean(axis=1).round(3)
 
     # Sort by Average descending
     if "Average" in detailed_df_highest.columns:
-        detailed_df_highest = detailed_df_highest.sort_values("Average", ascending=False).round(3)
+        # Convert '-' to NaN for sorting purposes
+        detailed_df_highest['Average'] = pd.to_numeric(detailed_df_highest['Average'], errors='coerce')
+        detailed_df_highest = detailed_df_highest.sort_values(by=['Average'], ascending=False, na_position='last')
+        # Convert NaN back to '-'
+        detailed_df_highest['Average'] = detailed_df_highest['Average'].fillna('-')
+        # detailed_df_highest = detailed_df_highest.sort_values("Average", ascending=False).round(3)
 
     if "Average" in detailed_df_mean.columns:
-        detailed_df_mean = detailed_df_mean.sort_values("Average", ascending=False).round(3)
+        detailed_df_mean['Average'] = pd.to_numeric(detailed_df_mean['Average'], errors='coerce')
+        detailed_df_mean = detailed_df_mean.sort_values(by=['Average'], ascending=False, na_position='last')
+        # Convert NaN back to '-'
+        detailed_df_mean['Average'] = detailed_df_mean['Average'].fillna('-')
+        # detailed_df_mean = detailed_df_mean.sort_values("Average", ascending=False).round(3)
 
     # # Create intervention-averaged DataFrames for both metrics
     # intervention_averaged_highest_df = create_intervention_averaged_df(detailed_df_highest)
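The dash-aware averaging above is easy to check in isolation. A minimal sketch (column names and scores are invented for illustration):

import pandas as pd

# Toy leaderboard: method B is missing one score (shown as '-').
df = pd.DataFrame({
    "Method": ["A", "B"],
    "ioi_gpt2": [0.91, 0.88],
    "mcqa_qwen2.5": [0.75, "-"],
})
score_cols = ["ioi_gpt2", "mcqa_qwen2.5"]

# Rows containing any '-' keep '-' as their Average; the rest get a numeric mean.
has_dash = df[score_cols].eq("-").any(axis=1)
df["Average"] = "-"
mask = ~has_dash
if mask.any():
    numeric = df.loc[mask, score_cols].apply(pd.to_numeric)
    df.loc[mask, "Average"] = numeric.mean(axis=1).round(3)

print(df)  # A averages 0.83; B keeps '-' because one score is missing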
src/populate.py
CHANGED
@@ -34,7 +34,6 @@ def get_leaderboard_df_mib_subgraph(results_path: str, cols: list, benchmark_col
 
 
 
-
 def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     """Aggregates rows with the same base method name by taking the max value for each column"""
     df_copy = df.copy()
@@ -139,11 +138,13 @@ def get_leaderboard_df_mib_causalgraph(results_path: str) -> Tuple[pd.DataFrame,
     return detailed_df, aggregated_df, intervention_averaged_df
 
 
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
+def get_evaluation_queue_df(save_path: str, cols: list, track: str) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requests"""
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []
 
+    print(track)
+
     for entry in entries:
         if ".json" in entry:
             file_path = os.path.join(save_path, entry)
@@ -175,7 +176,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL" or e["status"] == "FAILED"]
     for list in (pending_list, finished_list):
         for item in list:
-            item["track"] =
+            item["track"] = track
             item["hf_repo"] = parse_huggingface_url(item["hf_repo"])[0]
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
src/submission/check_validity.py
CHANGED
@@ -8,7 +8,7 @@ import gradio as gr
 from urllib.parse import urlparse
 from collections import defaultdict
 from datetime import datetime, timedelta, timezone
-from typing import Literal
+from typing import Literal, Tuple, Union
 
 from huggingface_hub import HfApi, HfFileSystem, hf_hub_url, get_hf_file_metadata
 from huggingface_hub import ModelCard
@@ -19,6 +19,162 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer
 from src.display.utils import TEXT_TASKS, VISION_TASKS, NUM_EXPECTED_EXAMPLES
 from src.envs import EVAL_REQUESTS_SUBGRAPH, EVAL_REQUESTS_CAUSALGRAPH
 
+TASKS = ["ioi", "mcqa", "arithmetic-addition", "arithmetic-subtraction", "arc-easy", "arc-challenge"]
+MODELS = ["gpt2", "qwen2.5", "gemma2", "llama3", "interpbench"]
+
+class FeaturizerValidator:
+    def __init__(self, base_featurizer_class):
+        self.base_featurizer_class = base_featurizer_class
+        self.featurizer_class_name = None
+
+        # torch.nn.Module
+        self.module_value, self.module_attr = "torch", "Module"
+        self.featurizer_module_class_name_1 = None
+        self.featurizer_module_class_name_2 = None
+
+    def find_featurizer_subclass(self, module_path: str) -> Tuple[bool, Union[str, None], str]:
+        """
+        Finds the first class in the module that inherits from Featurizer.
+
+        Args:
+            module_path: Path to the uploaded Python file
+
+        Returns:
+            Tuple of (success, class_name, message)
+        """
+        # First try with AST for safety
+        try:
+            with open(module_path, 'r') as file:
+                tree = ast.parse(file.read(), filename=module_path)
+
+            for node in ast.walk(tree):
+                if isinstance(node, ast.ClassDef):
+                    for base in node.bases:
+                        if isinstance(base, ast.Name) and base.id == self.base_featurizer_class.__name__:
+                            return True, node.name, f"Found class '{node.name}' that inherits from {self.base_featurizer_class.__name__}"
+
+            return False, None, f"No class inheriting from {self.base_featurizer_class.__name__} found"
+
+        except Exception as e:
+            return False, None, f"Error during static analysis: {str(e)}"
+
+    def find_featurizer_module_classes(self, module_path: str) -> Tuple[bool, str]:
+        try:
+            with open(module_path, 'r') as file:
+                tree = ast.parse(file.read(), filename=module_path)
+
+            for node in ast.walk(tree):
+                if isinstance(node, ast.ClassDef):
+                    for base in node.bases:
+                        if (isinstance(base, ast.Attribute) and base.attr == self.module_attr):
+                            if self.featurizer_module_class_name_1 is None:
+                                self.featurizer_module_class_name_1 = node.name
+                            else:
+                                self.featurizer_module_class_name_2 = node.name
+                                return True, f"Found two featurizer modules: {self.featurizer_module_class_name_1}, {self.featurizer_module_class_name_2}"
+
+            if self.featurizer_module_class_name_1:
+                return True, f"Found one featurizer module: {self.featurizer_module_class_name_1}"
+            return False, f"Found no featurizer modules."
+
+        except Exception as e:
+            return False, f"Error during static analysis: {e}"
+
+    def validate_uploaded_module(self, module_path: str) -> Tuple[bool, str]:
+        """
+        Validates an uploaded module to ensure it properly extends the Featurizer class.
+
+        Args:
+            module_path: Path to the uploaded Python file
+
+        Returns:
+            Tuple of (is_valid, message)
+        """
+        # First, find the name of the featurizer class we're verifying
+        found, class_name, message = self.find_featurizer_subclass(module_path)
+        if not found:
+            return False, message
+        else:
+            print("Verified featurizer subclass.")
+
+        # Second, find the name of the featurizer and inverse featurizer modules
+        modules_found, modules_message = self.find_featurizer_module_classes(module_path)
+        if not modules_found:
+            return False, modules_message
+        else:
+            print(f"Verified featurizer module(s): {modules_message}")
+
+        # Then, perform static code analysis on the featurizer class for basic safety
+        inheritance_check, ast_message = self._verify_inheritance_with_ast(module_path, class_name)
+        if not inheritance_check:
+            return False, ast_message
+
+        # Then, try to load and validate the featurizer class
+        return self._verify_inheritance_with_import(module_path, class_name)
+
+        # TODO: try directly loading featurizer module and inverse featurizer module?
+
+    def _verify_inheritance_with_ast(self, module_path: str, class_name: str) -> Tuple[bool, str]:
+        """Verify inheritance using AST without executing code"""
+        try:
+            with open(module_path, 'r') as file:
+                tree = ast.parse(file.read(), filename=module_path)
+
+            # Look for class definitions that match the target class name
+            for node in ast.walk(tree):
+                if isinstance(node, ast.ClassDef) and node.name == class_name:
+                    # Check if any base class name matches 'Featurizer'
+                    for base in node.bases:
+                        if isinstance(base, ast.Name) and base.id == self.base_featurizer_class.__name__:
+                            return True, "Static analysis indicates proper inheritance"
+
+                    return False, f"Class '{class_name}' does not appear to inherit from {self.base_featurizer_class.__name__}"
+
+            return False, f"Class '{class_name}' not found in the uploaded module"
+
+        except Exception as e:
+            return False, f"Error during static analysis: {str(e)}"
+
+    def _verify_inheritance_with_import(self, module_path: str, class_name: str) -> Tuple[bool, str]:
+        """Safely import the module and verify inheritance using Python's introspection"""
+        try:
+            # Dynamically import the module
+            spec = importlib.util.spec_from_file_location("uploaded_module", module_path)
+            if spec is None or spec.loader is None:
+                return False, "Could not load the module specification"
+
+            uploaded_module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(uploaded_module)
+
+            # Get the class from the module
+            if not hasattr(uploaded_module, class_name):
+                return False, f"Class '{class_name}' not found in the uploaded module"
+
+            uploaded_class = getattr(uploaded_module, class_name)
+
+            # Check if it's a proper subclass
+            if not inspect.isclass(uploaded_class):
+                return False, f"'{class_name}' is not a class"
+
+            if not issubclass(uploaded_class, self.base_featurizer_class):
+                return False, f"'{class_name}' does not inherit from {self.base_featurizer_class.__name__}"
+
+            # Optional: Check method resolution order
+            mro = inspect.getmro(uploaded_class)
+            if self.base_featurizer_class not in mro:
+                return False, f"{self.base_featurizer_class.__name__} not in the method resolution order"
+
+            return True, f"Class '{class_name}' properly extends {self.base_featurizer_class.__name__}"
+
+        except Exception as e:
+            return False, f"Error during dynamic validation: {str(e)}"
+
 
 def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
@@ -260,7 +416,7 @@ def parse_huggingface_url(url: str):
     return repo_id, folder_path, revision
 
 
-def
+def validate_directory_circuit(fs: HfFileSystem, repo_id: str, dirname: str, curr_tm: str, circuit_level:Literal['edge', 'node','neuron']='edge'):
     errors = []
     warnings = []
@@ -307,9 +463,6 @@ def verify_circuit_submission(hf_repo, level, progress=gr.Progress()):
         "arc-challenge_llama3"
     ]
 
-    TASKS = ["ioi", "mcqa", "arithmetic-addition", "arithmetic-subtraction", "arc-easy", "arc-challenge"]
-    MODELS = ["gpt2", "qwen2.5", "gemma2", "llama3", "interpbench"]
-
     errors = []
     warnings = []
@@ -321,10 +474,9 @@ def verify_circuit_submission(hf_repo, level, progress=gr.Progress()):
     path = hf_repo
     level = level
 
-    repo_id, folder_path, revision = parse_huggingface_url(hf_repo)
-
-    folder_path = repo_id + "/" + folder_path
     try:
+        repo_id, folder_path, revision = parse_huggingface_url(hf_repo)
+        folder_path = repo_id + "/" + folder_path
         files = fs.listdir(folder_path, revision=revision)
     except Exception as e:
         errors.append(f"Could not open Huggingface URL: {e}")
@@ -361,7 +513,7 @@ def verify_circuit_submission(hf_repo, level, progress=gr.Progress()):
 
     # Parse circuits directory
     print(f"validating {circuit_dir}")
-    vd_errors, vd_warnings =
+    vd_errors, vd_warnings = validate_directory_circuit(fs, repo_id, circuit_dir, curr_tm, level)
     errors.extend(vd_errors)
     warnings.extend(vd_warnings)
@@ -390,5 +542,184 @@ def verify_circuit_submission(hf_repo, level, progress=gr.Progress()):
     return errors, warnings
 
 
-def
-
+def validate_directory_causalgraph(fs: HfFileSystem, repo_id: str, dirname: str):
+    errors = []
+    warnings = []
+
+    files = fs.ls(dirname)
+    files = [f["name"] for f in files if "_featurizer" in f["name"] or "_indices" in f["name"]]
+
+    valid_triplet = False
+
+    offset = 0
+    for idx, file in enumerate(files):
+        file_suffix = file.split(repo_id + "/")[1]
+        file_url = hf_hub_url(repo_id=repo_id, filename=file_suffix)
+        file_info = get_hf_file_metadata(file_url)
+        file_size_mb = file_info.size / (1024 * 1024)
+        if file_size_mb > 150:
+            warnings.append(f"Will skip file >150MB: {file}")
+            offset -= 1
+            continue
+
+        if idx + offset > 30:
+            warnings.append("Many files in directory; stopping at 30")
+            break
+
+        if file.endswith("_featurizer") or file.endswith("_indices"):
+            prefix = "_".join(file.split("_")[:-1])
+            this_suffix = "_" + file.split("_")[-1]
+            suffixes = ("_featurizer", "_inverse_featurizer", "_indices")
+            for suffix_idx, suffix in enumerate(suffixes):
+                if file.replace(this_suffix, suffix) not in files:
+                    warnings.append(f"For {prefix}, found a {this_suffix} file but no associated {suffix}")
+                    break
+                if suffix_idx == len(suffixes) - 1:
+                    valid_triplet = True
+            if valid_triplet:
+                found_submodule = False
+                found_layer = False
+                found_token = False
+                if "residual" in prefix.lower() or "attention" in prefix.lower():
+                    found_submodule = True
+                if "layer:" in prefix.lower():
+                    found_layer = True
+                if "token:" in prefix.lower():
+                    found_token = True
+                if not found_submodule or not found_layer or not found_token:
+                    errors.append("Could not derive where featurizer should be applied from featurizer filenames.")
+
+        if valid_triplet:
+            break
+
+    if not valid_triplet:
+        errors.append("No valid featurizer/inverse featurizer/indices triplets.")
+    return errors, warnings
+
+
+def verify_causal_variable_submission(hf_repo, progress=gr.Progress()):
+    CV_TASKS = set(["ioi_task", "4_answer_MCQA", "ARC_easy", "arithmetic", "ravel_task"])
+    CV_TASK_VARIABLES = {"ioi_task": ["output_token", "output_position"],
+                         "4_answer_MCQA": ["answer_pointer", "answer"],
+                         "arc": ["answer_pointer", "answer"],
+                         "arithmetic": ["ones_carry"],
+                         "ravel_task": ["Country", "Continent", "Language"]}
+    CV_MODELS = set(["GPT2LMHeadModel", "Qwen2ForCausalLM", "Gemma2ForCausalLM", "LlamaForCausalLM"])
+    # create pairs of valid task/model combinations
+    CV_VALID_TASK_MODELS = set([("ioi_task", "GPT2LMHeadModel"),
+                                ("ioi_task", "Qwen2ForCausalLM"),
+                                ("ioi_task", "Gemma2ForCausalLM"),
+                                ("ioi_task", "LlamaForCausalLM"),
+                                ("4_answer_MCQA", "Qwen2ForCausalLM"),
+                                ("4_answer_MCQA", "Gemma2ForCausalLM"),
+                                ("4_answer_MCQA", "LlamaForCausalLM"),
+                                ("ARC_easy", "Gemma2ForCausalLM"),
+                                ("ARC_easy", "LlamaForCausalLM"),
+                                ("arithmetic", "Gemma2ForCausalLM"),
+                                ("arithmetic", "LlamaForCausalLM"),
+                                ("ravel_task", "Gemma2ForCausalLM"),
+                                ("ravel_task", "LlamaForCausalLM")])
+
+    errors = []
+    warnings = []
+
+    num_py_files = 0
+    directories_present = {tm: False for tm in CV_VALID_TASK_MODELS}
+    directories_valid = {tm: False for tm in CV_VALID_TASK_MODELS}
+    variables_valid = {}
+
+    fs = HfFileSystem()
+
+    path = hf_repo
+
+    try:
+        repo_id, folder_path, revision = parse_huggingface_url(hf_repo)
+        folder_path = repo_id + "/" + folder_path
+        files = fs.listdir(folder_path, revision=revision)
+    except Exception as e:
+        errors.append(f"Could not open Huggingface URL: {e}")
+        return errors, warnings
+
+    file_counts = 0
+    for file in progress.tqdm(files, desc="Validating files in repo"):
+        filename = file["name"]
+        file_counts += 1
+        if file_counts >= 30:
+            warnings.append("Folder contains many files/directories; stopped at 30.")
+            break
+
+        if filename.endswith(".py"):
+            num_py_files += 1
+
+        causalgraph_dir = filename
+        dirname_proc = causalgraph_dir.lower().split("/")[-1]
+        if not fs.isdir(causalgraph_dir):
+            continue
+        curr_task = None
+        curr_model = None
+        curr_variable = None
+        # Look for task names in filename
+        for task in CV_TASKS:
+            if dirname_proc.startswith(task.lower()) or f"_{task.lower()}" in dirname_proc:
+                curr_task = task
+                if curr_task not in variables_valid:
+                    variables_valid[curr_task] = {v: False for v in CV_TASK_VARIABLES[curr_task]}
+                for variable in CV_TASK_VARIABLES[curr_task]:
+                    if dirname_proc.startswith(variable.lower()) or f"_{variable.lower()}" in dirname_proc or f"_{variable.lower().replace('_', '-')}" in dirname_proc:
+                        curr_variable = variable
+                        break
+        # Look for model names in filename
+        for model in CV_MODELS:
+            if dirname_proc.startswith(model.lower()) or f"_{model.lower()}" in dirname_proc:
+                curr_model = model
+        if curr_task is not None and curr_model is not None and curr_variable is not None:
+            curr_tm = (curr_task, curr_model)
+            if curr_tm in CV_VALID_TASK_MODELS:
+                directories_present[curr_tm] = True
+            else:
+                continue
+        else:
+            continue
+
+        print(f"validating {causalgraph_dir}")
+        vd_errors, vd_warnings = validate_directory_causalgraph(fs, repo_id, causalgraph_dir)
+        errors.extend(vd_errors)
+        warnings.extend(vd_warnings)
+
+        if len(vd_errors) == 0:
+            directories_valid[curr_tm] = True
+            variables_valid[curr_task][curr_variable] = True
+
+    if num_py_files == 0:
+        errors.append("No featurizer .py file detected in root of provided repo.")
+    elif num_py_files == 1:
+        errors.append("Found one .py script, but expected two: one for the featurizer, and another for the token position functions.")
+
+    task_set, model_set = set(), set()
+    for tm in directories_present:
+        if not directories_present[tm]:
+            continue
+        if not directories_valid[tm]:
+            warnings.append(f"Directory found for {tm[0]}/{tm[1]}, but contents not valid")
+            continue
+
+    for tm in directories_valid:
+        if directories_valid[tm]:
+            task, model = tm
+            task_set.add(task)
+            model_set.add(model)
+
+    if len(task_set) == 0 or len(model_set) == 0:
+        errors.append("No valid directories found for any task/model.")
+
+    # no_tm_display = [f"{tm[0]}/{tm[1]}" for tm in directories_valid if not directories_valid[tm]]
+    # if len(no_tm_display) > 0:
+    #     warnings.append(f"No valid submission found for the following tasks/models: {*no_tm_display,}")
+
+    for task in variables_valid:
+        found_variable_display = [v for v in variables_valid[task] if variables_valid[task][v]]
+        no_variable_display = [v for v in variables_valid[task] if not variables_valid[task][v]]
+        if no_variable_display:
+            warnings.append(f"For {task}, found variables {*found_variable_display,}, but not variables {*no_variable_display,}")
+
+    return errors, warnings
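A usage sketch for the new validator; the Featurizer base class and the file path are placeholders, and it assumes check_validity.py already imports ast, importlib.util, and inspect (used above but not added in these hunks):

import torch

class Featurizer(torch.nn.Module):  # placeholder for the real MIB base class
    pass

validator = FeaturizerValidator(base_featurizer_class=Featurizer)
is_valid, message = validator.validate_uploaded_module("featurizer.py")
print(is_valid, message)
# Pipeline: (1) AST scan for the first class inheriting from Featurizer,
# (2) AST scan for torch.nn.Module featurizer/inverse-featurizer classes,
# (3) AST inheritance re-check, then (4) a real import with issubclass()/MRO checks.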
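The core of validate_directory_causalgraph is the suffix-triplet check: every *_featurizer needs a sibling *_inverse_featurizer and *_indices, and the shared prefix must encode the layer, token position, and submodule. A standalone toy version with invented file names:

files = [
    "repo/ioi_task_gpt2_output_token/layer:7_token:last_residual_featurizer",
    "repo/ioi_task_gpt2_output_token/layer:7_token:last_residual_inverse_featurizer",
    "repo/ioi_task_gpt2_output_token/layer:7_token:last_residual_indices",
]
suffixes = ("_featurizer", "_inverse_featurizer", "_indices")
prefix = files[0].removesuffix("_featurizer")
valid_triplet = all(prefix + s in files for s in suffixes)
location_ok = all(k in prefix for k in ("layer:", "token:")) and (
    "residual" in prefix or "attention" in prefix)
print(valid_triplet, location_ok)  # True True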
src/submission/submit.py
CHANGED
@@ -17,7 +17,7 @@ import gradio as gr
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
-def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_position, code_upload, method_name, contact_email, _id):
+def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, method_name, contact_email, _id):
     errors = []
     hf_repo = hf_repo_circ if "Circuit" in track else hf_repo_cg
     repo_id, folder_path, revision = parse_huggingface_url(hf_repo)
@@ -53,9 +53,6 @@ def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_positio
         "hf_repo": hf_repo,
         "user_name": user_name,
         "revision": commit_hash,
-        "layer": layer,
-        "token_position": token_position,
-        "code_upload": code_upload,
         "method_name": method_name,
         "contact_email": contact_email.lower(),
         "submit_time": current_time,
@@ -65,7 +62,6 @@ def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_positio
         QUEUE_REPO = QUEUE_REPO_CAUSALGRAPH
         EVAL_REQUESTS = EVAL_REQUESTS_CAUSALGRAPH
 
-
     OUT_DIR = f"{EVAL_REQUESTS}/"
     os.makedirs(OUT_DIR, exist_ok=True)
     out_path = f"{OUT_DIR}/{method_name}_{_id}_{current_time}.json"
@@ -84,7 +80,7 @@ def upload_to_queue(track, hf_repo_circ, hf_repo_cg, level, layer, token_positio
         errors.append(f"Could not upload entry to eval queue: {e}")
 
     if errors:
-        status = gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True)
+        status = gr.Textbox("\n\n".join(f"❌ {e}" for e in errors), visible=True)
     else:
         status = gr.Textbox(f"✅ Submission received! Your submission ID is \"{_id}\". Save this so that you can manage your submission on the queue.", visible=True)
     return [
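For reference, a hypothetical queue-entry JSON as written after this change (values invented; fields set elsewhere in the function, such as status and the submission ID, are omitted). Note that layer, token_position, and code_upload no longer appear:

entry = {
    "hf_repo": "https://huggingface.co/user/my-featurizers",
    "user_name": "user",
    "revision": "abc1234",          # pinned commit hash
    "method_name": "DAS",
    "contact_email": "user@example.com",
    "submit_time": "2025-05-01T12-00-00",
}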