Spaces:

osunlp
/

Online_Mind2Web_Leaderboard

Running

App Files Community

WeijianQi1999 committed on Mar 25

Commit

4dcba74

1 Parent(s): 3489875

add '-'

Browse files

Files changed (2) hide show

app.py +2 -2
content.py +6 -4

app.py CHANGED Viewed

@@ -12,10 +12,10 @@ from content import format_error, format_warning, format_log, TITLE, LINKS, INTR
 TOKEN = os.environ.get("TOKEN", None)
-OWNER="Online Mind2Web"
 # api = HfApi()
-YEAR_VERSION = "2024"
 LOCAL_DEBUG = True

 TOKEN = os.environ.get("TOKEN", None)
+OWNER="Online-Mind2Web"
 # api = HfApi()
+YEAR_VERSION = "2025"
 LOCAL_DEBUG = True

content.py CHANGED Viewed

@@ -1,4 +1,4 @@
-TITLE = """<h1 align="center" id="space-title">🏆 Online Mind2Web Leaderboard</h1>"""
 LINKS = """
 <div align="center">
     <a href="https://tiancixue.notion.site/An-Illusion-of-Progress-Assessing-the-Current-State-of-Web-Agents-1ac6cd2b9aac80719cd6f68374aaf4b4?pvs=4">Blog</a> |
@@ -9,7 +9,7 @@ LINKS = """
 """
 INTRODUCTION_TEXT = """
-Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains.
 Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1–5 steps), Medium (6–10 steps), and Hard (11+ steps).
 """
@@ -53,6 +53,7 @@ CITATION_BUTTON_TEXT = r"""
 SUBMIT_INTRODUCTION = """
 ## ⚠ Please submit the trajectory file with the following format:
 Each task is stored in a folder named after its `task_id`, containing:
 - `trajectory/`: Stores screenshots of each step.
@@ -79,10 +80,11 @@ main_directory/
 ```
 Please send your agent's name, model family, and organization via email to [email protected], along with the trajectory directory attached.
-We will run the auto-evaluation. If you have conducted your own human evaluation, please also attach your human eval results—we will spot-check these before adding them to the human-eval table.
 """
-DATA_DATASET = """## More Statistics for Online Mind2Web Benchmark
 """

+TITLE = """<h1 align="center" id="space-title">🏆 Online-Mind2Web Leaderboard</h1>"""
 LINKS = """
 <div align="center">
     <a href="https://tiancixue.notion.site/An-Illusion-of-Progress-Assessing-the-Current-State-of-Web-Agents-1ac6cd2b9aac80719cd6f68374aaf4b4?pvs=4">Blog</a> |
 """
 INTRODUCTION_TEXT = """
+Online-Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains.
 Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1–5 steps), Medium (6–10 steps), and Hard (11+ steps).
 """
 SUBMIT_INTRODUCTION = """
 ## ⚠ Please submit the trajectory file with the following format:
 Each task is stored in a folder named after its `task_id`, containing:
 - `trajectory/`: Stores screenshots of each step.
 ```
 Please send your agent's name, model family, and organization via email to [email protected], along with the trajectory directory attached.
+Here is an [example](https://github.com/OSU-NLP-Group/Online-Mind2Web/tree/main/data/example/fb7b4f784cfde003e2548fdf4e8d6b4f) of the format. We encourage you to use the script provided in our GitHub repository to obtain evaluation results and submit them. To ensure the authenticity and reliability of the reported results, we will also conduct a verification.
+If you have conducted your own human evaluation, please also attach your human eval results—we will spot-check these before adding them to the human-eval table.
 """
+DATA_DATASET = """## More Statistics for Online-Mind2Web Benchmark
 """