Commit
·
927e909
1
Parent(s):
0abc04f
Add DeepResearch Bench application with LFS support
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- .gitignore +11 -0
- Dockerfile +11 -0
- README.md +25 -12
- __pycache__/create_leaderboard.cpython-38.pyc +0 -0
- __pycache__/create_leaderboard.cpython-39.pyc +0 -0
- __pycache__/gradio.cpython-310.pyc +0 -0
- __pycache__/gradio.cpython-39.pyc +0 -0
- app.py +16 -0
- create_leaderboard.py +91 -0
- data/data_viewer.jsonl +3 -0
- data/leaderboard.csv +17 -0
- data/raw_data/claude-3-5-sonnet-with-search.jsonl +3 -0
- data/raw_data/claude-3-7-sonnet-with-search.jsonl +3 -0
- data/raw_data/gemini-2.5-flash-with-grounding.jsonl +3 -0
- data/raw_data/gemini-2.5-pro-deepresearch.jsonl +3 -0
- data/raw_data/gemini-2.5-pro-with-grounding.jsonl +3 -0
- data/raw_data/gpt-4.1-mini-with-search.jsonl +3 -0
- data/raw_data/gpt-4.1-with-search.jsonl +3 -0
- data/raw_data/gpt-4o-mini-search-preview.jsonl +3 -0
- data/raw_data/gpt-4o-search-preview.jsonl +3 -0
- data/raw_data/grok-deeper-search.jsonl +3 -0
- data/raw_data/openai-deepresearch.jsonl +3 -0
- data/raw_data/perplexity-Research.jsonl +3 -0
- data/raw_data/perplexity-sonar-pro.jsonl +3 -0
- data/raw_data/perplexity-sonar-reasoning-pro.jsonl +3 -0
- data/raw_data/perplexity-sonar-reasoning.jsonl +3 -0
- data/raw_data/perplexity-sonar.jsonl +3 -0
- data/raw_results/claude-3-5-sonnet-with-search.jsonl +3 -0
- data/raw_results/claude-3-7-sonnet-with-search.jsonl +3 -0
- data/raw_results/gemini-2.5-flash-with-grounding.jsonl +3 -0
- data/raw_results/gemini-2.5-pro-deepresearch.jsonl +3 -0
- data/raw_results/gemini-2.5-pro-with-grounding.jsonl +3 -0
- data/raw_results/gpt-4.1-mini-with-search.jsonl +3 -0
- data/raw_results/gpt-4.1-with-search.jsonl +3 -0
- data/raw_results/gpt-4o-mini-search-preview.jsonl +3 -0
- data/raw_results/gpt-4o-search-preview.jsonl +3 -0
- data/raw_results/grok-deeper-search.jsonl +3 -0
- data/raw_results/openai-deepresearch.jsonl +3 -0
- data/raw_results/perplexity-Research.jsonl +3 -0
- data/raw_results/perplexity-sonar-pro.jsonl +3 -0
- data/raw_results/perplexity-sonar-reasoning-pro.jsonl +3 -0
- data/raw_results/perplexity-sonar-reasoning.jsonl +3 -0
- data/raw_results/perplexity-sonar.jsonl +3 -0
- requirements.txt +6 -0
- tabs/__pycache__/data_viewer_side_by_side_tab.cpython-38.pyc +0 -0
- tabs/__pycache__/data_viewer_side_by_side_tab.cpython-39.pyc +0 -0
- tabs/__pycache__/data_viewer_tab.cpython-38.pyc +0 -0
- tabs/__pycache__/data_viewer_tab.cpython-39.pyc +0 -0
- tabs/__pycache__/leaderboard_tab.cpython-38.pyc +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.jsonl filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
*.py[cod]
|
3 |
+
*$py.class
|
4 |
+
.env
|
5 |
+
.venv
|
6 |
+
env/
|
7 |
+
venv/
|
8 |
+
ENV/
|
9 |
+
.DS_Store
|
10 |
+
*.log
|
11 |
+
data/data_viewer.jsonl
|
Dockerfile
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.10-slim
|
2 |
+
|
3 |
+
WORKDIR /code
|
4 |
+
|
5 |
+
COPY . /code/
|
6 |
+
|
7 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
8 |
+
pip install --no-cache-dir -r requirements.txt
|
9 |
+
|
10 |
+
# Default run command
|
11 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
@@ -1,12 +1,25 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# DeepResearch Bench
|
2 |
+
|
3 |
+
**DeepResearch Bench: A Comprehensive Benchmark for Deep Research Agents**
|
4 |
+
|
5 |
+
This application showcases comprehensive evaluation results for Deep Research Agents. The app includes:
|
6 |
+
|
7 |
+
- 🏆 **Leaderboard** - View overall performance metrics across all evaluated models
|
8 |
+
- 🔍 **Data Viewer** - Explore detailed results for individual research tasks
|
9 |
+
- 📊 **Side-by-Side Comparison** - Compare different models' responses to the same research questions
|
10 |
+
|
11 |
+
Visit our [project website](https://deepresearch-bench.github.io) for more information.
|
12 |
+
|
13 |
+
## Citation
|
14 |
+
```bibtex
|
15 |
+
@article{du2025deepresearch,
|
16 |
+
author = {Mingxuan Du and Benfeng Xu and Chiwei Zhu and Xiaorui Wang and Zhendong Mao},
|
17 |
+
title = {DeepResearch Bench: A Comprehensive Benchmark for Deep Research Agents},
|
18 |
+
journal = {arXiv preprint},
|
19 |
+
year = {2025},
|
20 |
+
}
|
21 |
+
```
|
22 |
+
|
23 |
+
## Hugging Face Space Details
|
24 |
+
- SDK: Gradio
|
25 |
+
- SDK Version: 3.50.0
|
__pycache__/create_leaderboard.cpython-38.pyc
ADDED
Binary file (2.35 kB). View file
|
|
__pycache__/create_leaderboard.cpython-39.pyc
ADDED
Binary file (2.52 kB). View file
|
|
__pycache__/gradio.cpython-310.pyc
ADDED
Binary file (422 Bytes). View file
|
|
__pycache__/gradio.cpython-39.pyc
ADDED
Binary file (420 Bytes). View file
|
|
app.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
DeepResearch Bench HF Space entry point
|
5 |
+
"""
|
6 |
+
|
7 |
+
from __future__ import annotations
|
8 |
+
from create_leaderboard import demo
|
9 |
+
|
10 |
+
# Run inside a Hugging Face Space
|
11 |
+
if __name__ == "__main__":
|
12 |
+
demo.launch(
|
13 |
+
server_name="0.0.0.0", # 必须这样设置以允许外部访问
|
14 |
+
share=False, # HF Space 自己有分享功能,无需额外分享
|
15 |
+
show_api=False, # 隐藏API文档页面
|
16 |
+
)
|
create_leaderboard.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
Gradio UI – v2.1 (Leaderboard · Data Viewer · Prompt-to-Leaderboard)
|
5 |
+
"""
|
6 |
+
|
7 |
+
from __future__ import annotations
|
8 |
+
from pathlib import Path
|
9 |
+
import gradio as gr
|
10 |
+
|
11 |
+
# ---- Tab components ----
|
12 |
+
from tabs.leaderboard_tab import create_leaderboard_tab
|
13 |
+
from tabs.data_viewer_tab import create_data_viewer_tab
|
14 |
+
from tabs.data_viewer_side_by_side_tab import create_data_viewer_side_by_side_tab
|
15 |
+
|
16 |
+
# ---------------------------------------------------------------------------
|
17 |
+
# UI
|
18 |
+
# ---------------------------------------------------------------------------
|
19 |
+
|
20 |
+
with gr.Blocks(title="DeepResearch Bench") as demo:
|
21 |
+
|
22 |
+
# ========= Global CSS (applies only to the custom title & intro) =========
|
23 |
+
gr.HTML("""
|
24 |
+
<style>
|
25 |
+
.title-block{
|
26 |
+
/* 渐变文字效果 - 改进版 */
|
27 |
+
background: linear-gradient(to right, #009CFF, #823AFF);
|
28 |
+
background: -webkit-linear-gradient(to right, #009CFF, #823AFF);
|
29 |
+
background: -moz-linear-gradient(to right, #009CFF, #823AFF);
|
30 |
+
-webkit-background-clip: text;
|
31 |
+
-webkit-text-fill-color: transparent;
|
32 |
+
background-clip: text;
|
33 |
+
color: transparent;
|
34 |
+
|
35 |
+
text-align: center;
|
36 |
+
font-size: 2.1rem;
|
37 |
+
font-weight: 700;
|
38 |
+
margin: 0 0 1rem 0;
|
39 |
+
padding-bottom: 0.2rem;
|
40 |
+
display: inline-block; /* 重要:确保渐变效果正常 */
|
41 |
+
width: 100%; /* 确保居中对齐 */
|
42 |
+
}
|
43 |
+
.intro-block{
|
44 |
+
text-align:center;
|
45 |
+
margin-bottom:1.25rem;
|
46 |
+
line-height:2;
|
47 |
+
}
|
48 |
+
.intro-block a{
|
49 |
+
color:#0a58ca;
|
50 |
+
text-decoration:none;
|
51 |
+
margin:0 .3rem;
|
52 |
+
}
|
53 |
+
.intro-block a:hover{ text-decoration:underline; }
|
54 |
+
</style>
|
55 |
+
""")
|
56 |
+
|
57 |
+
# ========= Top title & intro (without Markdown heading syntax) =========
|
58 |
+
gr.HTML("""
|
59 |
+
<div class="title-block">
|
60 |
+
DeepResearch Bench: A Comprehensive Benchmark for Deep Research Agents
|
61 |
+
</div>
|
62 |
+
|
63 |
+
<div class="intro-block">
|
64 |
+
The research aims to comprehensively evaluate the capabilities of Deep Research Agents.<br>
|
65 |
+
<a href="#">Code</a> |
|
66 |
+
<a href="#">Website</a> |
|
67 |
+
<a href="#">Paper</a> |
|
68 |
+
<a href="#">Eval Dataset</a> |
|
69 |
+
Total models: 16 | Last Update: 28 May 2025
|
70 |
+
</div>
|
71 |
+
""")
|
72 |
+
|
73 |
+
# ========= Main tabs =========
|
74 |
+
with gr.Tabs():
|
75 |
+
create_leaderboard_tab() # 🏆 Leaderboard
|
76 |
+
create_data_viewer_tab() # 🔍 Data Viewer
|
77 |
+
create_data_viewer_side_by_side_tab()
|
78 |
+
|
79 |
+
with gr.Tab("💬Prompt-to-Leaderboard"):
|
80 |
+
gr.Markdown(
|
81 |
+
"""
|
82 |
+
🚧 **Prompt-to-Leaderboard** module not implemented yet.
|
83 |
+
Planned: inspect how individual prompts affect overall model ranking.
|
84 |
+
"""
|
85 |
+
)
|
86 |
+
|
87 |
+
# ---------------------------------------------------------------------------
|
88 |
+
# Entrypoint
|
89 |
+
# ---------------------------------------------------------------------------
|
90 |
+
if __name__ == "__main__":
|
91 |
+
demo.launch()
|
data/data_viewer.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a7ab11f250f4ffd6bf9c74ff8dc1e68f86d7abbf4f6319164bb476177ad7bf6e
|
3 |
+
size 28044256
|
data/leaderboard.csv
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model,overall_score,comprehensiveness,insight,instruction_following,readability,citation_accuracy,effective_citations
|
2 |
+
gemini-2.5-pro-deepresearch,48.88,48.53,48.50,49.18,49.44,81.44,111.21
|
3 |
+
openai-deepresearch,46.98,46.87,45.25,49.27,47.14,77.96,40.79
|
4 |
+
perplexity-Research,42.25,40.69,39.39,46.40,44.28,90.24,31.26
|
5 |
+
claude-3-7-sonnet-with-search,40.67,38.99,37.66,45.77,41.46,93.68,32.48
|
6 |
+
grok-deeper-search,40.24,37.97,35.37,46.30,44.05,83.59,8.15
|
7 |
+
perplexity-sonar-reasoning-pro,40.22,37.38,36.11,45.66,44.74,39.36,8.35
|
8 |
+
perplexity-sonar-reasoning,40.18,37.14,36.73,45.15,44.35,48.67,11.34
|
9 |
+
perplexity-sonar-pro,38.93,36.38,34.26,44.70,43.35,78.66,14.74
|
10 |
+
gemini-2.5-pro-with-grounding,35.12,34.06,29.79,41.67,37.16,81.81,32.88
|
11 |
+
gpt-4o-search-preview,35.10,31.99,27.57,43.17,41.23,88.41,4.79
|
12 |
+
perplexity-sonar,34.54,30.95,27.51,42.33,41.60,74.42,8.67
|
13 |
+
gpt-4.1-with-search,33.46,29.42,25.38,42.33,40.77,87.83,4.42
|
14 |
+
gemini-2.5-flash-preview-04-17,32.39,31.63,26.73,38.82,34.48,81.92,31.08
|
15 |
+
gpt-4o-mini-search-preview,31.55,27.38,22.64,40.67,39.91,84.98,4.95
|
16 |
+
gpt-4.1-mini-with-search,30.26,26.05,20.75,39.65,39.33,84.58,4.35
|
17 |
+
claude-3-5-sonnet-with-search,28.48,24.82,22.82,35.12,35.08,94.04,9.78
|
data/raw_data/claude-3-5-sonnet-with-search.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8769fc2e0cf4f059da6e34839f9df09a6fdab9e2872faa467eafa1aa42316a69
|
3 |
+
size 505860
|
data/raw_data/claude-3-7-sonnet-with-search.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dc16f997d3ecd09bccf6d9e756d9ad36d2834d2ed0827b8f39579b6321b98837
|
3 |
+
size 2281964
|
data/raw_data/gemini-2.5-flash-with-grounding.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:908295fc145ac2f8833396b56eac8913726d93288ad6b93a9c01a69cbdbbf78a
|
3 |
+
size 1016172
|
data/raw_data/gemini-2.5-pro-deepresearch.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:33c5d28e76595f22fae1b0fbbe2700958bfe707dafe53f7c5842d3067ccfddef
|
3 |
+
size 8523353
|
data/raw_data/gemini-2.5-pro-with-grounding.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7b409b1031cff2876fd20cd8e9fc95501f2a95ad0154d3634b6538c165373447
|
3 |
+
size 1050267
|
data/raw_data/gpt-4.1-mini-with-search.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bf809806c294364bb45cb337355d360b0e5e023c8e4ffdbf9557880a02137bab
|
3 |
+
size 463012
|
data/raw_data/gpt-4.1-with-search.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e0228624b09e9d6c25c72156f4dd7f5702e3adcdd71a1f309094c2913eb50639
|
3 |
+
size 492406
|
data/raw_data/gpt-4o-mini-search-preview.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dd49e75b1e7eb6ff40cd4c030032459d727987dd298863b488b9657ae18815a1
|
3 |
+
size 541532
|
data/raw_data/gpt-4o-search-preview.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bfb9de873345d6789197013f0cd60fb2d888957bc123447f2a8486e81c296f04
|
3 |
+
size 565183
|
data/raw_data/grok-deeper-search.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ea6428dcf2e729d84f019c302fb3862a85cefbea08282b5ffcc5c400306ab077
|
3 |
+
size 1149933
|
data/raw_data/openai-deepresearch.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:77d31b8ea1abd9aa8e924303451dc6a0f334f2e9d4d61ec71847c4db004ac62a
|
3 |
+
size 6903938
|
data/raw_data/perplexity-Research.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7f27cb31cbab84f60efc3286592e84690fd117355dd84f9e4a9299108245c2a5
|
3 |
+
size 1747979
|
data/raw_data/perplexity-sonar-pro.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5d577c0a208b35eb2c0454c00c70b12759cd8a1687f730f2133d8f392c1831ee
|
3 |
+
size 750234
|
data/raw_data/perplexity-sonar-reasoning-pro.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e08c6c4094bbf0aa1749e7b1a45e856a6635b2df6afdf0de8eeafea99e7477fc
|
3 |
+
size 495156
|
data/raw_data/perplexity-sonar-reasoning.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5aecbee30882b3ccd2d65470526fe48c7c016869f00593933a35e7096fe4fb74
|
3 |
+
size 659883
|
data/raw_data/perplexity-sonar.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dc0ef26282e404b700d56e158644f44228c49a3d5126fa12c8068e053444131e
|
3 |
+
size 574856
|
data/raw_results/claude-3-5-sonnet-with-search.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c0c47d1bab126886420bd53bb41a8905cdfb97f105711bcc2f5a27e3d53652ea
|
3 |
+
size 1992421
|
data/raw_results/claude-3-7-sonnet-with-search.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2b3a6bf74400c89d24fa47853ab034ed3696ee0694c2d190ba83c3f5dcd8a0ef
|
3 |
+
size 2002379
|
data/raw_results/gemini-2.5-flash-with-grounding.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:43b9f71819babb5c00f65f0dd71d707323fb803c585bd74976f49cdc34ab80aa
|
3 |
+
size 1951481
|
data/raw_results/gemini-2.5-pro-deepresearch.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ac2fc53c99697e3276c98d735ed630df6fa49d2972c70a5409adc1958ecaa7b7
|
3 |
+
size 1937730
|
data/raw_results/gemini-2.5-pro-with-grounding.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5e911a18cf8b8a8207eb45584ac650e4640f79db7352055ca5e92356de37f911
|
3 |
+
size 1944815
|
data/raw_results/gpt-4.1-mini-with-search.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:948a403d12bcf6b0e3ce6664f83afeb95413684ab0b7912003ed756a4df15c5e
|
3 |
+
size 1992345
|
data/raw_results/gpt-4.1-with-search.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:908a5989af337e381bf2bce6795438edd21966f313b5194f532feb1f47e5b812
|
3 |
+
size 2090582
|
data/raw_results/gpt-4o-mini-search-preview.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4277a9a91fcdaaeff1afe948c1088095d5f01092404fcd1a62407b7a58b7906e
|
3 |
+
size 2074673
|
data/raw_results/gpt-4o-search-preview.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7adcd70d49d3b5dd6050201aa4fcd31f51288945f4a23de14432a301cbf295a7
|
3 |
+
size 2063854
|
data/raw_results/grok-deeper-search.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b19fb7ec93872317eae94abeb02ed9c19912057acfa82600167ca853b750f476
|
3 |
+
size 1968989
|
data/raw_results/openai-deepresearch.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ae45c25f5b5c56a772331543e4eefe7c80e63f33b441dfe83cb4a5c830c88a35
|
3 |
+
size 2007501
|
data/raw_results/perplexity-Research.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b7715271d17cc344873653464ae3fef884e0f3c6bec89deee347ed7a0651beb9
|
3 |
+
size 2030483
|
data/raw_results/perplexity-sonar-pro.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a453f5b29492f684f53364121e7c79eeb81aee2737a383e2748830a4e4453afb
|
3 |
+
size 1975770
|
data/raw_results/perplexity-sonar-reasoning-pro.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:669a4a14232c63c716de766af7be050f8712f74a6d5437cc8fa637ded39f3c40
|
3 |
+
size 1957092
|
data/raw_results/perplexity-sonar-reasoning.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bceb5637a9d0092af5ddcca49557a4f8f3604be9ebb430be32e820fa4d6723b3
|
3 |
+
size 1951258
|
data/raw_results/perplexity-sonar.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:36ecd1540447863f66bfe1a43905070f9c9b0d40de803348c3450a396df3d8fc
|
3 |
+
size 2016838
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=3.50.0
|
2 |
+
pandas
|
3 |
+
numpy
|
4 |
+
plotly
|
5 |
+
pathlib
|
6 |
+
requests
|
tabs/__pycache__/data_viewer_side_by_side_tab.cpython-38.pyc
ADDED
Binary file (8.31 kB). View file
|
|
tabs/__pycache__/data_viewer_side_by_side_tab.cpython-39.pyc
ADDED
Binary file (8.41 kB). View file
|
|
tabs/__pycache__/data_viewer_tab.cpython-38.pyc
ADDED
Binary file (6.95 kB). View file
|
|
tabs/__pycache__/data_viewer_tab.cpython-39.pyc
ADDED
Binary file (6.98 kB). View file
|
|
tabs/__pycache__/leaderboard_tab.cpython-38.pyc
ADDED
Binary file (3.2 kB). View file
|
|