Aaron Mueller committed · Commit b624a39 · Parent(s): 44212b3
prettier leaderboard; draft of submission tab
Files changed:
- app.py +136 -7
- caulsal_metric.py +3 -3
- src/about.py +11 -11
- src/leaderboard/read_evals.py +21 -6
- src/populate.py +2 -2
app.py
CHANGED
@@ -122,6 +122,7 @@ from gradio_leaderboard import SelectColumns, Leaderboard
 import pandas as pd
 from typing import List, Dict, Optional
 from dataclasses import fields
+import math
 
 class SmartSelectColumns(SelectColumns):
     """
@@ -270,7 +271,11 @@ try:
 except Exception:
     restart_space()
 
-
+def _sigmoid(x):
+    try:
+        return 1 / (1 + math.exp(-2 * (x-1)))
+    except:
+        return "-"
 
 LEADERBOARD_DF_MIB_SUBGRAPH_FPL = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH)
 LEADERBOARD_DF_MIB_SUBGRAPH_FEQ = get_leaderboard_df_mib_subgraph(EVAL_RESULTS_MIB_SUBGRAPH_PATH, EVAL_REQUESTS_PATH, COLS_MIB_SUBGRAPH, BENCHMARK_COLS_MIB_SUBGRAPH,
@@ -486,6 +491,12 @@ def process_json(temp_file):
     gr.Markdown("Upload successful!")
     return data
 
+def get_hf_username(hf_repo):
+    hf_repo = hf_repo.rstrip("/")
+    parts = hf_repo.split("/")
+    username = parts[-2]
+    return username
+
 
 # Define the preset substrings for filtering
 PRESET_SUBSTRINGS = ["IOI", "MCQA", "Arithmetic", "ARC", "GPT-2", "Qwen-2.5", "Gemma-2", "Llama-3.1"]
@@ -546,13 +557,23 @@ def update_leaderboard(dataframe: pd.DataFrame, selected_task_substrings: List[s
     else:
         show_average = False
 
+    def _transform_floats(df):
+        df_transformed = df.copy()
+        # Apply transformation row by row
+        for i, row in df_transformed.iterrows():
+            # Apply sigmoid only to numeric values in the row
+            df_transformed.loc[i] = row.apply(lambda x: _sigmoid(x) if isinstance(x, (float, int)) else x)
+        return df_transformed
+
     if show_average:
         means = filtered_dataframe.replace("-", float("nan")).mean(axis=1, skipna=False)
+        s_filtered_dataframe = _transform_floats(filtered_dataframe)
+        s_means = s_filtered_dataframe.replace("-", float("nan")).mean(axis=1, skipna=False)
         filtered_dataframe["Average"] = means.round(2)
+        filtered_dataframe["Score"] = s_means.round(2)
         filtered_dataframe = filtered_dataframe.sort_values(by=["Average"], ascending=False, na_position='last')
         filtered_dataframe = filtered_dataframe.replace(float("nan"), "-")
 
-
     # if show_average:
     #     print([row for index, row in filtered_dataframe.iterrows()])
     #     filtered_dataframe["Average"] = [round(np.mean(row.values()), 2) if "-" not in row.values() else "-" for index, row in filtered_dataframe.iterrows()]
@@ -566,6 +587,10 @@ def update_leaderboard(dataframe: pd.DataFrame, selected_task_substrings: List[s
 
     return filtered_dataframe
 
+def process_url(url):
+    # Add your URL processing logic here
+    return f"You entered the URL: {url}"
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -581,11 +606,6 @@ with demo:
 
     # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
     #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-    # with gr.TabItem("👶 Submit", elem_id="llm-benchmark-tab-table", id=5):
-    #     with gr.Column():
-    #         with gr.Row():
-    #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
     # with gr.TabItem("Subgraph", elem_id="subgraph", id=0):
     #     leaderboard = init_leaderboard_mib_subgraph(LEADERBOARD_DF_MIB_SUBGRAPH, "Subgraph")
@@ -719,6 +739,115 @@ with demo:
                 "Causal Graph"
             )
 
+        with gr.TabItem("Submit", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown("## 🏆 Submission Portal")
+
+            # Track selection
+            track = gr.Radio(
+                choices=[
+                    "Circuit Localization Track",
+                    "Causal Variable Localization Track"
+                ],
+                label="Select Competition Track",
+                elem_id="track_selector"
+            )
+
+            with gr.Group(visible=False) as circuit_ui:
+                gr.Markdown("### Circuit Localization Requirements")
+                hf_repo = gr.Textbox(
+                    label="HuggingFace Repository URL",
+                    placeholder="https://huggingface.co/username/repo/tree/main/path",
+                    info="Must be a valid HuggingFace URL pointing to a folder with 10 circuit files (.json or .pt)"
+                )
+
+            with gr.Group(visible=False) as causal_ui:
+                gr.Markdown("### Causal Variable Localization Requirements")
+                with gr.Row():
+                    layer = gr.Number(
+                        label="Layer Number",
+                        precision=0,
+                        minimum=0,
+                        info="Integer specifying the model layer"
+                    )
+                    token_position = gr.Number(
+                        label="Token Position",
+                        precision=0,
+                        minimum=0,
+                        info="Integer specifying token position"
+                    )
+                code_upload = gr.File(
+                    label="Upload Python file implementing your featurization function",
+                    file_types=[".py"],
+                )
+
+            # Common fields
+            with gr.Group():
+                gr.Markdown("### Team Information")
+                team_name = gr.Textbox(label="Team Name")
+                contact_email = gr.Textbox(label="Contact Email")
+
+            # Dynamic UI logic
+            def toggle_ui(track):
+                circuit = track == "Circuit Localization Track"
+                causal = not circuit
+                return {
+                    circuit_ui: gr.Group(visible=circuit),
+                    causal_ui: gr.Group(visible=causal)
+                }
+
+            track.change(toggle_ui, track, [circuit_ui, causal_ui])
+
+            # Submission handling
+            status = gr.Textbox(label="Submission Status", visible=False)
+
+            def handle_submission(track, hf_repo, layer, token_position, code_upload, team_name, contact_email):
+                errors = []
+
+                # Validate common fields
+                if not team_name.strip():
+                    errors.append("Team name is required")
+                if "@" not in contact_email or "." not in contact_email:
+                    errors.append("Valid email address is required")
+
+                # Track-specific validation
+                if "Circuit" in track:
+                    if not hf_repo.startswith("https://huggingface.co/"):
+                        errors.append("Invalid HuggingFace URL - must start with https://huggingface.co/")
+                    else:
+                        # Check rate limit only for valid HF submissions
+                        username = get_hf_username(hf_repo)
+                        rate = 0  # TODO: check submissions queue for rates
+                        rate_limit = 2
+                        if rate > rate_limit:
+                            errors.append("Rate limit exceeded (max 2 submissions per week per HF account)")
+
+                else:
+                    if not (isinstance(layer, int) and isinstance(token_position, int)):
+                        errors.append("Layer and token position must be integers")
+                    if not code_upload:
+                        errors.append("Code file upload is required")
+
+                if errors:
+                    return gr.Textbox("\n".join(f"❌ {e}" for e in errors), visible=True)
+
+                # Process valid submission
+                return gr.Textbox("✅ Submission received! Thank you for your entry.", visible=True)
+
+            submit_btn = gr.Button("Submit Entry", variant="primary")
+            submit_btn.click(
+                handle_submission,
+                inputs=[track, hf_repo, layer, token_position, code_upload, team_name, contact_email],
+                outputs=status
+            )
+
+            # Add info about rate limits
+            gr.Markdown("""
+            ### Submission Policy
+            - Maximum 2 valid submissions per HuggingFace account per week
+            - Invalid submissions don't count toward your limit
+            - Rate limit tracked on a rolling basis: a submission no longer counts toward the limit as soon as 7 days have passed since the submission time
+            """)
+
     # with gr.Row():
     #     with gr.Accordion("📙 Citation", open=False):
     #         citation_button = gr.Textbox(
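The rate check in `handle_submission` above is still a stub (`rate = 0  # TODO: check submissions queue for rates`). A minimal sketch of the rolling seven-day count described in the Submission Policy text could look like the following; `count_recent_submissions`, and the idea that submission timestamps are read from the requests queue, are assumptions for illustration rather than part of this commit.

```python
from datetime import datetime, timedelta, timezone
from typing import List

def count_recent_submissions(submission_times: List[datetime],
                             window: timedelta = timedelta(days=7)) -> int:
    """Count submissions whose timestamps fall inside the trailing window.

    Assumes timezone-aware datetimes already pulled from the submissions
    queue for the submitting HF username (hypothetical helper, not defined
    in this commit).
    """
    now = datetime.now(timezone.utc)
    return sum(1 for t in submission_times if now - t < window)

# Illustrative use inside handle_submission (times_for is hypothetical):
#     rate = count_recent_submissions(times_for(username))
#     if rate >= rate_limit:
#         errors.append("Rate limit exceeded (max 2 submissions per week per HF account)")
```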
caulsal_metric.py
CHANGED
@@ -161,7 +161,7 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     numeric_df = df_copy.applymap(extract_score)
 
     # Group by base method name and take the mean
-    aggregated_df = numeric_df.groupby(level=0).max().round(
+    aggregated_df = numeric_df.groupby(level=0).max().round(2)
 
     # Convert back to string format
     aggregated_df = aggregated_df.applymap(lambda x: f"{x:.3f}")
@@ -198,12 +198,12 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
 
     # Create new DataFrame with averaged intervention scores
     averaged_df = pd.DataFrame({
-        model_task: numeric_df[cols].mean(axis=1).round(
+        model_task: numeric_df[cols].mean(axis=1).round(2)
         for model_task, cols in model_task_groups.items()
     })
 
     # Add overall average column
-    averaged_df['Average'] = averaged_df.mean(axis=1).round(
+    averaged_df['Average'] = averaged_df.mean(axis=1).round(2)
 
     # Sort by Average column
     averaged_df = averaged_df.sort_values('Average', ascending=False)
src/about.py
CHANGED
@@ -139,20 +139,20 @@ This leaderboard displays scores from the 2024 BabyLM Challenge. Each track has
 """
 
 EVALUATION_QUEUE_TEXT = """
-##
-
-
-
-
-
-```
-If this step fails, follow the error messages to debug your predictions before getting in touch. It's likely that either (i) some results are missing, or (ii) the results are incorrectly formatted.
-
-
-Once
-
+## Circuit localization track:
+
+You'll need 10 circuits per task/model combination. For each critical threshold k and previous threshold k_-1,
+the circuit should contain no fewer than k_-1% of components, and no more than k% of components. Create a HuggingFace
+dataset or model repository; this will house your circuits. Make a folder where the circuits (and *only* the circuits)
+are contained. Do not worry about the ordering of the files; our evaluation script will read the circuits and sort them
+by size. Provide a link to this folder below.
+
+For specifications about the file format for a circuit, see the README on our project GitHub: TODO
+
+Once your model makes it to the front of the evaluation queue, we'll submit your model for evaluation on the private test set.
+The evaluations are handled by the National Deep Inference Framework (NDIF).
+
+## Causal variable localization track:
 """
 
 CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
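As a concrete illustration of the submission folder described in the new EVALUATION_QUEUE_TEXT, the sketch below lists a local folder of circuit files and orders them by file size. It is only an assumption of how the described evaluation step might read a submission ("sort them by size" may equally refer to the number of components per circuit); the real evaluation script is not part of this commit, and `sorted_circuit_files` is a hypothetical name.

```python
from pathlib import Path

def sorted_circuit_files(folder: str):
    """Return the 10 circuit files in a submission folder, smallest first.

    Hypothetical sketch: the extensions and the expected count of 10 files
    come from the submission instructions above; everything else is assumed.
    """
    files = [p for p in Path(folder).iterdir() if p.suffix in {".json", ".pt"}]
    if len(files) != 10:
        raise ValueError(f"expected 10 circuit files, found {len(files)}")
    # Filenames are ignored; ordering comes from size alone.
    return sorted(files, key=lambda p: p.stat().st_size)
```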
src/leaderboard/read_evals.py
CHANGED
@@ -86,7 +86,19 @@ class EvalResult_MIB_SUBGRAPH:
             results=results
         )
 
-
+    def _sigmoid(self, x):
+        try:
+            return 1 / (1 + math.exp(-2 * (x-1)))
+        except:
+            return "-"
+
+    def _transform_floats(self, df):
+        df_transformed = df.copy()
+        # Apply transformation row by row
+        for i, row in df_transformed.iterrows():
+            # Apply sigmoid only to numeric values in the row
+            df_transformed.loc[i] = row.apply(lambda x: self._sigmoid(x) if isinstance(x, (float, int)) else x)
+        return df_transformed
 
     def to_dict(self, metric_type="F+"):
         """Converts the Eval Result to a dict for dataframe display"""
@@ -105,6 +117,7 @@ class EvalResult_MIB_SUBGRAPH:
                 data_dict[f"{task.value.benchmark}_{model}"] = '-'
 
         all_scores = []
+        transformed_scores = []
         for task, task_results in self.results.items():
             for model, metrics in task_results.items():
                 col_name = f"{task}_{model}"
@@ -124,8 +137,10 @@ class EvalResult_MIB_SUBGRAPH:
                 score = area_under if metric_type == "F+" else area_from_100
                 data_dict[col_name] = round(score, 2)
                 all_scores.append(score)
+                transformed_scores.append(self._sigmoid(score))
 
         data_dict["Average"] = round(np.mean(all_scores), 2) if '-' not in data_dict.values() else '-'
+        data_dict["Score"] = round(np.mean(transformed_scores), 2) if '-' not in data_dict.values() else '-'
         return data_dict
 
 
@@ -294,7 +309,7 @@ class EvalResult_MIB_CAUSALGRAPH:
 
         # Initialize results dictionary
         results = {}
-        for task in ["MCQA"]:
+        for task in ["IOI", "MCQA", "arithmetic", "ARC-easy"]:
             results[task] = {}
 
         # Process each model's results
@@ -309,7 +324,7 @@ class EvalResult_MIB_CAUSALGRAPH:
             for intervention_data in layer_data['layer_scores']:
                 # Calculate average score for counterfactuals
                 avg_cf_score = np.mean([
-                    cf['score']
+                    cf['score'] if 'score' in cf else 0
                     for cf in intervention_data['counterfactual_scores']
                 ])
 
@@ -416,7 +431,7 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
         df_copy[col] = df_copy[col].apply(lambda x: float(x) if isinstance(x, str) and not pd.isna(x) else x)
 
     # Group by base method name and take the max, handling NaN values
-    aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(
+    aggregated_df = df_copy.groupby('base_method')[score_columns].agg(lambda x: np.nanmax(x)).round(2)
 
     # Convert back to string format and reset index
     aggregated_df = aggregated_df.reset_index()
@@ -460,8 +475,8 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     for _, row in df_copy.iterrows():
         averaged_row = {'Method': row['Method']}
         for model_task, cols in model_task_groups.items():
-            averaged_row[model_task] = np.mean([row[col] for col in cols]).round(
-        averaged_row['Average'] = np.mean([averaged_row[mt] for mt in model_task_groups.keys()]).round(
+            averaged_row[model_task] = np.mean([row[col] for col in cols]).round(2)
+        averaged_row['Average'] = np.mean([averaged_row[mt] for mt in model_task_groups.keys()]).round(2)
         averaged_data.append(averaged_row)
 
     averaged_df = pd.DataFrame(averaged_data)
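For reference, the `_sigmoid` helper added here (and in app.py) is a logistic squash centered at a raw score of 1; its output feeds the new "Score" column alongside the plain "Average". A small standalone check of what it does to a few values, not part of the commit itself:

```python
import math

def _sigmoid(x):
    # Same transform as in the commit: logistic curve centered at x = 1.
    return 1 / (1 + math.exp(-2 * (x - 1)))

for x in [0.0, 0.5, 1.0, 2.0, 5.0]:
    print(x, round(_sigmoid(x), 3))
# -> 0.0 0.119, 0.5 0.269, 1.0 0.5, 2.0 0.881, 5.0 1.0 (approximately)
```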
src/populate.py
CHANGED
@@ -77,7 +77,7 @@ def aggregate_methods(df: pd.DataFrame) -> pd.DataFrame:
     numeric_df = df_copy.select_dtypes(include=['float64', 'int64'])
 
     # Group by base method name and take the max
-    aggregated_df = numeric_df.groupby(level=0).max().round(
+    aggregated_df = numeric_df.groupby(level=0).max().round(2)
 
     # Reset index to get Method as a column
     aggregated_df.reset_index(inplace=True)
@@ -116,7 +116,7 @@ def create_intervention_averaged_df(df: pd.DataFrame) -> pd.DataFrame:
     averaged_df['Method'] = method_col
 
     for col_name, cols in result_cols.items():
-        averaged_df[col_name] = df_copy[cols].mean(axis=1).round(
+        averaged_df[col_name] = df_copy[cols].mean(axis=1).round(2)
 
     return averaged_df
 