Spaces:
Sleeping
Sleeping
first construction
Browse files- app.py +35 -142
- results.csv +3 -0
- src/about.py +36 -7
- src/display/utils.py +7 -14
app.py
CHANGED
@@ -10,6 +10,7 @@ from src.about import (
|
|
10 |
EVALUATION_QUEUE_TEXT,
|
11 |
INTRODUCTION_TEXT,
|
12 |
LLM_BENCHMARKS_TEXT,
|
|
|
13 |
TITLE,
|
14 |
)
|
15 |
from src.display.css_html_js import custom_css
|
@@ -32,62 +33,8 @@ from src.submission.submit import add_new_eval
|
|
32 |
def restart_space():
|
33 |
API.restart_space(repo_id=REPO_ID)
|
34 |
|
35 |
-
### Space initialisation
|
36 |
-
try:
|
37 |
-
print(EVAL_REQUESTS_PATH)
|
38 |
-
snapshot_download(
|
39 |
-
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
40 |
-
)
|
41 |
-
except Exception:
|
42 |
-
restart_space()
|
43 |
-
try:
|
44 |
-
print(EVAL_RESULTS_PATH)
|
45 |
-
snapshot_download(
|
46 |
-
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
47 |
-
)
|
48 |
-
except Exception:
|
49 |
-
restart_space()
|
50 |
-
|
51 |
-
|
52 |
-
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
53 |
-
|
54 |
-
(
|
55 |
-
finished_eval_queue_df,
|
56 |
-
running_eval_queue_df,
|
57 |
-
pending_eval_queue_df,
|
58 |
-
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
59 |
-
|
60 |
-
def init_leaderboard(dataframe):
|
61 |
-
if dataframe is None or dataframe.empty:
|
62 |
-
raise ValueError("Leaderboard DataFrame is empty or None.")
|
63 |
-
return Leaderboard(
|
64 |
-
value=dataframe,
|
65 |
-
datatype=[c.type for c in fields(AutoEvalColumn)],
|
66 |
-
select_columns=SelectColumns(
|
67 |
-
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
68 |
-
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
|
69 |
-
label="Select Columns to Display:",
|
70 |
-
),
|
71 |
-
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
|
72 |
-
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
73 |
-
filter_columns=[
|
74 |
-
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
75 |
-
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
76 |
-
ColumnFilter(
|
77 |
-
AutoEvalColumn.params.name,
|
78 |
-
type="slider",
|
79 |
-
min=0.01,
|
80 |
-
max=150,
|
81 |
-
label="Select the number of parameters (B)",
|
82 |
-
),
|
83 |
-
ColumnFilter(
|
84 |
-
AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
|
85 |
-
),
|
86 |
-
],
|
87 |
-
bool_checkboxgroup_label="Hide models",
|
88 |
-
interactive=False,
|
89 |
-
)
|
90 |
|
|
|
91 |
|
92 |
demo = gr.Blocks(css=custom_css)
|
93 |
with demo:
|
@@ -95,98 +42,44 @@ with demo:
|
|
95 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
96 |
|
97 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
98 |
-
with gr.TabItem("🏅
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
102 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
103 |
|
104 |
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
105 |
-
|
106 |
-
with gr.Row():
|
107 |
-
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
108 |
-
|
109 |
-
with gr.Column():
|
110 |
-
with gr.Accordion(
|
111 |
-
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
112 |
-
open=False,
|
113 |
-
):
|
114 |
-
with gr.Row():
|
115 |
-
finished_eval_table = gr.components.Dataframe(
|
116 |
-
value=finished_eval_queue_df,
|
117 |
-
headers=EVAL_COLS,
|
118 |
-
datatype=EVAL_TYPES,
|
119 |
-
row_count=5,
|
120 |
-
)
|
121 |
-
with gr.Accordion(
|
122 |
-
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
123 |
-
open=False,
|
124 |
-
):
|
125 |
-
with gr.Row():
|
126 |
-
running_eval_table = gr.components.Dataframe(
|
127 |
-
value=running_eval_queue_df,
|
128 |
-
headers=EVAL_COLS,
|
129 |
-
datatype=EVAL_TYPES,
|
130 |
-
row_count=5,
|
131 |
-
)
|
132 |
-
|
133 |
-
with gr.Accordion(
|
134 |
-
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
135 |
-
open=False,
|
136 |
-
):
|
137 |
-
with gr.Row():
|
138 |
-
pending_eval_table = gr.components.Dataframe(
|
139 |
-
value=pending_eval_queue_df,
|
140 |
-
headers=EVAL_COLS,
|
141 |
-
datatype=EVAL_TYPES,
|
142 |
-
row_count=5,
|
143 |
-
)
|
144 |
-
with gr.Row():
|
145 |
-
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
|
146 |
-
|
147 |
-
with gr.Row():
|
148 |
-
with gr.Column():
|
149 |
-
model_name_textbox = gr.Textbox(label="Model name")
|
150 |
-
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
151 |
-
model_type = gr.Dropdown(
|
152 |
-
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
153 |
-
label="Model type",
|
154 |
-
multiselect=False,
|
155 |
-
value=None,
|
156 |
-
interactive=True,
|
157 |
-
)
|
158 |
-
|
159 |
-
with gr.Column():
|
160 |
-
precision = gr.Dropdown(
|
161 |
-
choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
162 |
-
label="Precision",
|
163 |
-
multiselect=False,
|
164 |
-
value="float16",
|
165 |
-
interactive=True,
|
166 |
-
)
|
167 |
-
weight_type = gr.Dropdown(
|
168 |
-
choices=[i.value.name for i in WeightType],
|
169 |
-
label="Weights type",
|
170 |
-
multiselect=False,
|
171 |
-
value="Original",
|
172 |
-
interactive=True,
|
173 |
-
)
|
174 |
-
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
175 |
-
|
176 |
-
submit_button = gr.Button("Submit Eval")
|
177 |
-
submission_result = gr.Markdown()
|
178 |
-
submit_button.click(
|
179 |
-
add_new_eval,
|
180 |
-
[
|
181 |
-
model_name_textbox,
|
182 |
-
base_model_name_textbox,
|
183 |
-
revision_name_textbox,
|
184 |
-
precision,
|
185 |
-
weight_type,
|
186 |
-
model_type,
|
187 |
-
],
|
188 |
-
submission_result,
|
189 |
-
)
|
190 |
|
191 |
with gr.Row():
|
192 |
with gr.Accordion("📙 Citation", open=False):
|
|
|
10 |
EVALUATION_QUEUE_TEXT,
|
11 |
INTRODUCTION_TEXT,
|
12 |
LLM_BENCHMARKS_TEXT,
|
13 |
+
SUBMIT_FORM,
|
14 |
TITLE,
|
15 |
)
|
16 |
from src.display.css_html_js import custom_css
|
|
|
33 |
def restart_space():
|
34 |
API.restart_space(repo_id=REPO_ID)
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
+
LEADERBOARD_DF = pd.read_csv("results.csv")
|
38 |
|
39 |
demo = gr.Blocks(css=custom_css)
|
40 |
with demo:
|
|
|
42 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
43 |
|
44 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
45 |
+
with gr.TabItem("🏅 MVRB", elem_id="llm-benchmark-tab-table", id=0):
|
46 |
+
Leaderboard(
|
47 |
+
value=LEADERBOARD_DF,
|
48 |
+
datatype=[c.type for c in fields(AutoEvalColumn)],
|
49 |
+
select_columns=SelectColumns(
|
50 |
+
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
51 |
+
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
|
52 |
+
label="Select Columns to Display:",
|
53 |
+
),
|
54 |
+
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
|
55 |
+
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
56 |
+
filter_columns=[
|
57 |
+
# ColumnFilter(
|
58 |
+
# "#params",
|
59 |
+
# type="slider",
|
60 |
+
# min=0.1,
|
61 |
+
# max=70,
|
62 |
+
# greater_than=True,
|
63 |
+
# label="Minimum number of parameters (B)",
|
64 |
+
# ),
|
65 |
+
# ColumnFilter(
|
66 |
+
# "#params",
|
67 |
+
# type="slider",
|
68 |
+
# min=0.1,
|
69 |
+
# max=70,
|
70 |
+
# less_than=True,
|
71 |
+
# label="Maximum number of parameters (B)",
|
72 |
+
# ),
|
73 |
+
ColumnFilter("#Params (B)", default=[0.1, 32], min=0.1, max=70, label="Number of parameters (B)")
|
74 |
+
],
|
75 |
+
interactive=True,
|
76 |
+
)
|
77 |
|
78 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
79 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
80 |
|
81 |
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
82 |
+
gr.Markdown(SUBMIT_FORM, elem_classes="markdown-text")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
|
84 |
with gr.Row():
|
85 |
with gr.Accordion("📙 Citation", open=False):
|
results.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
,#params,Overall,SR,CSR,SQA,OVC
|
2 |
+
UniSE-MLLM,2.21,55.72,69.63,54.49,43.2,48.26
|
3 |
+
UniSE-CLIP,0.428,36.41,35.95,43.38,28.13,40.62
|
src/about.py
CHANGED
@@ -21,20 +21,27 @@ NUM_FEWSHOT = 0 # Change with your few shot
|
|
21 |
|
22 |
|
23 |
# Your leaderboard name
|
24 |
-
TITLE = """<h1 align="center" id="space-title">
|
25 |
|
26 |
# What does your leaderboard evaluate?
|
27 |
INTRODUCTION_TEXT = """
|
28 |
-
|
|
|
|
|
|
|
|
|
29 |
"""
|
30 |
|
31 |
# Which evaluations are you running? how can people reproduce what you have?
|
32 |
LLM_BENCHMARKS_TEXT = f"""
|
33 |
-
##
|
|
|
|
|
|
|
34 |
|
35 |
-
|
36 |
-
To reproduce our results, here is the commands you can run:
|
37 |
|
|
|
38 |
"""
|
39 |
|
40 |
EVALUATION_QUEUE_TEXT = """
|
@@ -67,6 +74,28 @@ Make sure you have followed the above steps first.
|
|
67 |
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
68 |
"""
|
69 |
|
70 |
-
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
"""
|
|
|
21 |
|
22 |
|
23 |
# Your leaderboard name
|
24 |
+
TITLE = """<h1 align="center" id="space-title">MVRB Leaderboard</h1>"""
|
25 |
|
26 |
# What does your leaderboard evaluate?
|
27 |
INTRODUCTION_TEXT = """
|
28 |
+
**MVRB (Massive Visualized IR Benchmark)** evaluates multimodal retrievers’ performance on general **Vis-IR** tasks. The benchmark includes various task types, such as screenshot-based multimodal retrieval (screenshot to anything, anything to screenshot) and screenshot-conditioned retrieval (e.g., searching for documents using queries conditioned on screenshots). It also covers a variety of important domains, including news, products, papers, and charts.
|
29 |
+
|
30 |
+
More details can be found:
|
31 |
+
Paper: https://arxiv.org/pdf/2502.11431
|
32 |
+
Code: https://github.com/VectorSpaceLab/Vis-IR
|
33 |
"""
|
34 |
|
35 |
# Which evaluations are you running? how can people reproduce what you have?
|
36 |
LLM_BENCHMARKS_TEXT = f"""
|
37 |
+
## Tasks
|
38 |
+
- **Screenshot Retrieval (SR)** consists of evaluation samples, each comprising a textual query $$q$$ and its relevant screenshot $$s$$: $$(q, s)$$. The retrieval model needs to precisely retrieve the relevant screenshot for a testing query from a given corpus $$S$$. Each evaluation sample is created in two steps: 1) sample a screenshot $$s$$, 2) prompt the LLM to generate a search query based on the caption of the screenshot. We consider seven tasks under this category, including product retrieval, paper retrieval, repo retrieval, news retrieval, chart retrieval, document retrieval, and slide retrieval.
|
39 |
+
|
40 |
+
- **Composed Screenshot Retrieval (CSR)** is made up of sq2s triplets. Given a screenshot $$s_1$$ and a query $$q$$ conditioned on $$s_1$$, the retrieval model needs to retrieve the relevant screenshot $$s_2$$ from the corpus $$S$$. We define four tasks for this category, including product discovery, news-to-Wiki, knowledge relation, and Wiki-to-product. All tasks in this category are created by human annotators. For each task, annotators are instructed to identify relevant screenshot pairs and write queries to retrieve $$s_2$$ based on $$s_1$$.
|
41 |
|
42 |
+
- **Screenshot Question Answering (SQA)** comprises sq2a triplets. Given a screenshot s and a question q conditioned on s, the retrieval model needs to retrieve the correct answer a from a candidate corpus A. Each evaluation sample is created in three steps: 1) sample a screenshot $$s$$, 2) prompt the MLLM to generate a question $$q$$, 3) prompt the MLLM to generate the answer $$a$$ for $$q$$ based on $$s$$. The following tasks are included in this category: product-QA, news-QA, Wiki-QA, paper-QA, repo-QA.
|
|
|
43 |
|
44 |
+
- **Open-Vocab Classification (OVC)** is performed using evaluation samples of screenshots and their textual class labels. Given a screenshot s and the label class $$C$$, the retrieval model needs to discriminate the correct label c from $$C$$ based on the embedding similarity. We include the following tasks in this category: product classification, news-topic classification, academic-field classification, knowledge classification. For each task, we employ human labelers to create the label class and assign each screenshot with its correct label.
|
45 |
"""
|
46 |
|
47 |
EVALUATION_QUEUE_TEXT = """
|
|
|
74 |
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
75 |
"""
|
76 |
|
77 |
+
SUBMIT_FORM = """
|
78 |
+
## Make sure you submit your evaluation results in a JSON file with the following format:
|
79 |
+
```json
|
80 |
+
{
|
81 |
+
"Model": "<Model Name>",
|
82 |
+
"#params": "7.11B",
|
83 |
+
"Overall": 30.00,
|
84 |
+
"SR": 30.00,
|
85 |
+
"CSR": 30.00,
|
86 |
+
"SQA": 30.00,
|
87 |
+
"OVC": 30.00
|
88 |
+
}
|
89 |
+
```
|
90 |
+
|
91 |
+
"""
|
92 |
+
|
93 |
+
CITATION_BUTTON_LABEL = "Copy the following snippet to cite MVRB:"
|
94 |
+
CITATION_BUTTON_TEXT = """
|
95 |
+
@article{liu2025any,
|
96 |
+
title={Any Information Is Just Worth One Single Screenshot: Unifying Search With Visualized Information Retrieval},
|
97 |
+
author={Liu, Ze and Liang, Zhengyang and Zhou, Junjie and Liu, Zheng and Lian, Defu},
|
98 |
+
journal={arXiv preprint arXiv:2502.11431},
|
99 |
+
year={2025}
|
100 |
+
}
|
101 |
"""
|
src/display/utils.py
CHANGED
@@ -23,22 +23,15 @@ class ColumnContent:
|
|
23 |
## Leaderboard columns
|
24 |
auto_eval_column_dict = []
|
25 |
# Init
|
26 |
-
auto_eval_column_dict.append(["
|
27 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
|
|
28 |
#Scores
|
29 |
-
auto_eval_column_dict.append(["
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
auto_eval_column_dict.append(["
|
34 |
-
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
35 |
-
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
36 |
-
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
37 |
-
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
38 |
-
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
39 |
-
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
40 |
-
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
41 |
-
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
42 |
|
43 |
# We use make dataclass to dynamically fill the scores from Tasks
|
44 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
|
|
23 |
## Leaderboard columns
|
24 |
auto_eval_column_dict = []
|
25 |
# Init
|
26 |
+
auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent("Rank", "str", True, never_hidden=True)])
|
27 |
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
28 |
+
auto_eval_column_dict.append(["#params", ColumnContent, ColumnContent("#params(B)", "number", True)])
|
29 |
#Scores
|
30 |
+
auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall", "number", True, never_hidden=True)])
|
31 |
+
auto_eval_column_dict.append(["SR", ColumnContent, ColumnContent("SR", "number", True)])
|
32 |
+
auto_eval_column_dict.append(["CSR", ColumnContent, ColumnContent("CSR", "number", True)])
|
33 |
+
auto_eval_column_dict.append(["SQA", ColumnContent, ColumnContent("SQA", "number", True)])
|
34 |
+
auto_eval_column_dict.append(["OVC", ColumnContent, ColumnContent("OVC", "number", True)])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
# We use make dataclass to dynamically fill the scores from Tasks
|
37 |
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|