loodvanniekerkginkgo committed
Commit 61fa714 · 1 Parent(s): 22f82e7

Text changes, only one leaderboard

Files changed (3):
  1. about.py +8 -8
  2. app.py +35 -24
  3. constants.py +14 -1
about.py CHANGED
@@ -1,4 +1,4 @@
-from constants import ABOUT_TAB_NAME, ASSAY_LIST, TERMS_URL
+from constants import ABOUT_TAB_NAME, ASSAY_LIST, SUBMIT_TAB_NAME, TERMS_URL, FAQ_TAB_NAME
 
 ABOUT_INTRO = f"""
 ## About this challenge
@@ -17,7 +17,7 @@ For each of the 5 properties in the competition, there is a prize for the model
 There is also an 'open-source' prize for the best model trained on the GDPa1 dataset (reporting cross-validation results) and assessed on the private test set where authors provide all training code and data.
 For each of these 6 prizes, participants have the choice between **$10k in data generation credits** with [Ginkgo Datapoints](https://datapoints.ginkgo.bio/) or a **cash prize** with a value of $2000.
 
-See the FAQs below or the [competition terms]({TERMS_URL}) for more details.
+See the "{FAQ_TAB_NAME}" tab above (you are currently on the "{ABOUT_TAB_NAME}" tab) or the [competition terms]({TERMS_URL}) for more details.
 """
 
 ABOUT_TEXT = f"""
@@ -27,10 +27,10 @@ ABOUT_TEXT = f"""
 1. **Create a Hugging Face account** [here](https://huggingface.co/join) if you don't have one yet (this is used to track unique submissions and to access the GDPa1 dataset).
 2. **Register your team** on the [Competition Registration](https://datapoints.ginkgo.bio/ai-competitions/2025-abdev-competition) page.
 3. **Build a model** or validate it on the [GDPa1](https://huggingface.co/datasets/ginkgo-datapoints/GDPa1) dataset.
-4. **Choose a validation track**. You must first upload predictions on the validation set before submitting predictions on the private test set:
-    - **Track 1**: If you already have a developability model, you can submit your predictions for the GDPa1 public dataset.
-    - **Track 2**: If you don't have a model, train one using cross-validation on the GDPa1 dataset and submit your predictions under the "Cross-validation" option.
-5. **Submit your predictions** as a CSV on the "✉️ Submit" tab on this page (you are currently on the "{ABOUT_TAB_NAME}" tab).
+4. **Complete the "Qualifying Exam"**. Before you can submit to the final test set, you must first get a score on the public leaderboard. Choose one of the two tracks:
+    - Track 1 (Benchmark an existing model): Submit predictions for the `GDPa1` dataset.
+    - Track 2 (Train from scratch): Train a model using cross-validation on the `GDPa1` dataset and submit cross-validation predictions by selecting `GDPa1_cross_validation`.
+5. **Submit to the "Final Exam"**. Once you have submitted predictions on the validation set, download the private test set sequences from the {SUBMIT_TAB_NAME} tab and submit your final predictions. Your performance on this private set will determine the winners.
 
 #### How to contribute?
 
@@ -126,11 +126,11 @@ You do **not** need to predict all 5 properties — each property has its own le
 1. **Submit your predictions** as a CSV with `antibody_name` + one column per property you are predicting (e.g. `"antibody_name,Titer,PR_CHO"` if your model predicts Titer and Polyreactivity).
 2. **Final test submission**: Download test sequences from the example files below and upload predictions.
 
-The validation set results should appear on the leaderboard after a few seconds. The **private test set results will not appear on the leaderboards**, and will be used to determine the winners at the close of the competition.
+The validation set results should appear on the leaderboard within a minute. The **private test set results will not appear on the leaderboards**, and will be used to determine the winners at the close of the competition.
 We may release private test set results at intermediate points during the competition.
 
 ## Cross-validation
 
 For the cross-validation metrics (if training only on the GDPa1 dataset), use the `"hierarchical_cluster_IgG_isotype_stratified_fold"` column to split the dataset into folds and make predictions for each of the folds.
 Submit a CSV file in the same format but also containing the `"hierarchical_cluster_IgG_isotype_stratified_fold"` column.
-"""
+"""
app.py CHANGED
@@ -3,19 +3,20 @@ import pandas as pd
 import gradio as gr
 from gradio_leaderboard import Leaderboard
 
-from utils import fetch_hf_results, show_output_box
+from about import ABOUT_INTRO, ABOUT_TEXT, FAQS, SUBMIT_INTRUCTIONS
 from constants import (
-    ASSAY_LIST,
-    ASSAY_RENAME,
-    ASSAY_EMOJIS,
-    ASSAY_DESCRIPTION,
+    ASSAY_RENAME,  # keep this: used in df query
     EXAMPLE_FILE_DICT,
     LEADERBOARD_DISPLAY_COLUMNS,
    ABOUT_TAB_NAME,
+    FAQ_TAB_NAME,
     TERMS_URL,
+    LEADERBOARD_COLUMNS_RENAME,
+    LEADERBOARD_COLUMNS_RENAME_LIST,
+    SUBMIT_TAB_NAME,
 )
-from about import ABOUT_INTRO, ABOUT_TEXT, FAQS, SUBMIT_INTRUCTIONS
 from submit import make_submission
+from utils import fetch_hf_results, show_output_box
 
 def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None):
     df = df_results.query("assay.isin(@ASSAY_RENAME.keys())").copy()
@@ -28,6 +29,9 @@ def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None)
     # Convert spearman column to string to avoid dtype incompatibility when assigning text
     df["spearman"] = df["spearman"].astype(str)
     df.loc[(df["dataset"] == "Heldout Test Set") & (df["spearman"] == "nan"), "spearman"] = "N/A, evaluated at competition close"
+
+    # Finally, rename columns for readability
+    df = df.rename(columns=LEADERBOARD_COLUMNS_RENAME)
     return df
 
 
@@ -41,9 +45,9 @@
     lb = Leaderboard(
         value=format_leaderboard_table(df_results=current_dataframe, assay=assay),
         datatype=["str", "str", "str", "number"],
-        select_columns=["model", "property", "spearman", "dataset"],
-        search_columns=["model"],
-        filter_columns=filter_columns,
+        select_columns=LEADERBOARD_COLUMNS_RENAME_LIST(["model", "property", "spearman", "dataset"]),
+        search_columns=["Model Name"],
+        filter_columns=LEADERBOARD_COLUMNS_RENAME_LIST(filter_columns),
         every=15,
         render=True,
     )
@@ -100,23 +104,24 @@ with gr.Blocks() as demo:
         gr.Markdown(ABOUT_TEXT)
 
     # Procedurally make these 5 tabs
-    for i, assay in enumerate(ASSAY_LIST):
-        with gr.TabItem(
-            f"{ASSAY_EMOJIS[assay]} {ASSAY_RENAME[assay]}",
-            elem_id="abdev-benchmark-tab-table",
-        ) as tab_item:
-            gr.Markdown(f"# {ASSAY_DESCRIPTION[assay]}")
-            lb = get_leaderboard_object(assay=assay)
+    # for i, assay in enumerate(ASSAY_LIST):
+    #     with gr.TabItem(
+    #         f"{ASSAY_EMOJIS[assay]} {ASSAY_RENAME[assay]}",
+    #         elem_id="abdev-benchmark-tab-table",
+    #     ) as tab_item:
+    #         gr.Markdown(f"# {ASSAY_DESCRIPTION[assay]}")
+    #         lb = get_leaderboard_object(assay=assay)
 
-            def refresh_leaderboard(assay=assay):
-                return format_leaderboard_table(df_results=current_dataframe, assay=assay)
+    #         def refresh_leaderboard(assay=assay):
+    #             return format_leaderboard_table(df_results=current_dataframe, assay=assay)
 
-            # Refresh when data version changes
-            data_version.change(fn=refresh_leaderboard, outputs=lb)
+    #         # Refresh when data version changes
+    #         data_version.change(fn=refresh_leaderboard, outputs=lb)
 
-    with gr.TabItem("🚀 Overall", elem_id="abdev-benchmark-tab-table") as overall_tab:
+    # Note(Lood): Trying out just one leaderboard. We could also have a dropdown here that shows different leaderboards for each property, but that's just the same as the filters
+    with gr.TabItem("🏆 Leaderboard", elem_id="abdev-benchmark-tab-table") as leaderboard_tab:
         gr.Markdown(
-            "# Antibody Developability Benchmark Leaderboard over all properties"
+            "# Overall Leaderboard (filter below by property)"  # TODO add details about the 6 prizes here
         )
         lb = get_leaderboard_object()
 
@@ -125,8 +130,13 @@ with gr.Blocks() as demo:
 
         # Refresh when data version changes
        data_version.change(fn=refresh_overall_leaderboard, outputs=lb)
+
+        # At the bottom of the leaderboard, we can keep as NaN and explain missing test set results
+        # gr.Markdown(
+        #     "_ℹ️ Results for the private test set will not be shown here and will be used for final judging at the close of the competition._"
+        # )
 
-    with gr.TabItem("✉️ Submit", elem_id="boundary-benchmark-tab-table"):
+    with gr.TabItem(SUBMIT_TAB_NAME, elem_id="boundary-benchmark-tab-table"):
         gr.Markdown(SUBMIT_INTRUCTIONS)
         submission_type_state = gr.State(value="GDPa1")
         download_file_state = gr.State(value=EXAMPLE_FILE_DICT["GDPa1"])
@@ -164,6 +174,7 @@ with gr.Blocks() as demo:
             choices=["GDPa1", "GDPa1_cross_validation", "Heldout Test Set"],
             value="GDPa1",
             label="Submission Type",
+            info=f"Choose the dataset corresponding to the track you're participating in. See the '{ABOUT_TAB_NAME}' tab for details.",
         )
         download_button = gr.DownloadButton(
             label="📥 Download example submission CSV for GDPa1",
@@ -225,7 +236,7 @@ with gr.Blocks() as demo:
             inputs=[message],
            outputs=[message],
         )
-    with gr.Tab("❔ FAQs"):
+    with gr.Tab(FAQ_TAB_NAME):
         gr.Markdown("# Frequently Asked Questions")
         for i, (question, answer) in enumerate(FAQS.items()):
             # Would love to make questions bold but accordion doesn't support it
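
One subtlety in the app.py hunks above: `format_leaderboard_table` still queries and patches rows under the raw result-dataset column names and applies `LEADERBOARD_COLUMNS_RENAME` only as its final step, so every `Leaderboard` kwarg now has to use display names (hence `LEADERBOARD_COLUMNS_RENAME_LIST` and `search_columns=["Model Name"]`). A standalone toy illustration of that ordering constraint, using an abbreviated stand-in for the rename map:

```python
import pandas as pd

# Abbreviated stand-in for LEADERBOARD_COLUMNS_RENAME in constants.py
RENAME = {"model": "Model Name", "spearman": "Spearman Correlation"}

df = pd.DataFrame({"model": ["my-model"], "spearman": ["nan"]})

# Raw names are still valid at this point, e.g. for the "nan" fix-up
df.loc[df["spearman"] == "nan", "spearman"] = "N/A, evaluated at competition close"

df = df.rename(columns=RENAME)  # rename last, as format_leaderboard_table now does

# After the rename only display names exist, so widget kwargs must match them
assert list(df.columns) == ["Model Name", "Spearman Correlation"]
```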
constants.py CHANGED
@@ -28,7 +28,10 @@ ASSAY_EMOJIS = {
     "Tm2": "🌡️",
     "Titer": "🧪",
 }
-ABOUT_TAB_NAME = "ℹ️ About"
+# Tabs with emojis
+ABOUT_TAB_NAME = "📖 About / Rules"
+FAQ_TAB_NAME = "❓ FAQs"
+SUBMIT_TAB_NAME = "✉️ Submit"
 
 REGISTRATION_CODE = "GINKGO-ABDEV-2025"
 TERMS_URL = "https://euphsfcyogalqiqsawbo.supabase.co/storage/v1/object/public/gdpweb/pdfs/2025%20Ginkgo%20Antibody%20Developability%20Prediction%20Competition%202025-08-28-v2.pdf"
@@ -68,3 +71,13 @@ RESULTS_REPO = f"{ORGANIZATION}/abdev-bench-results"
 # Leaderboard dataframes
 LEADERBOARD_RESULTS_COLUMNS = ["model", "assay", "spearman", "dataset", "user", "submission_time"]  # The columns expected from the results dataset
 LEADERBOARD_DISPLAY_COLUMNS = ["model", "property", "spearman", "dataset", "user", "submission_time"]  # After changing assay to property (pretty formatting)
+LEADERBOARD_COLUMNS_RENAME = {
+    "spearman": "Spearman Correlation",
+    "dataset": "Dataset",
+    "user": "User",
+    "submission_time": "Submission Time",
+    "model": "Model Name",
+    "property": "Property",
+}
+def LEADERBOARD_COLUMNS_RENAME_LIST(columns: list[str]) -> list[str]:
+    return list(map(lambda x: LEADERBOARD_COLUMNS_RENAME.get(x, x), columns))
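
For reference, the helper added at the bottom of constants.py maps raw column names to display names and passes unrecognised names through unchanged via the `.get(x, x)` fallback. A quick usage sketch, assuming constants.py is on the import path:

```python
from constants import LEADERBOARD_COLUMNS_RENAME_LIST

# Known keys are prettified; unknown keys fall through untouched
assert LEADERBOARD_COLUMNS_RENAME_LIST(["model", "spearman"]) == ["Model Name", "Spearman Correlation"]
assert LEADERBOARD_COLUMNS_RENAME_LIST(["some_other_col"]) == ["some_other_col"]
```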