Spaces:

BAAI
/

EmbodiedVerse

Running

lixuejing

update

6500fc4 5 days ago

11.4 kB

	from dataclasses import dataclass
	from enum import Enum

	@dataclass
	class Task:
	    """Describe one leaderboard entry and where its score is read from.

	    Attributes:
	        benchmark: task key in the results JSON file.
	        metric: metric key in the results JSON file.
	        col_name: name to display in the leaderboard.
	    """

	    benchmark: str
	    metric: str
	    col_name: str


	# Select your tasks here
	# ---------------------------------------------------
	class Tasks(Enum):
	    """Benchmarks listed as individual leaderboard entries.

	    Every member's value is a ``Task`` built from three strings, in order:
	    the task key in the results JSON file, the metric key in the results
	    JSON file, and the name to display in the leaderboard.
	    """

	    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
	    Where2Place = Task("Where2Place", "overall", "Where2Place")
	    blink_val_ev = Task("blink_val_ev", "overall", "blink_val_ev")
	    cv_bench_test = Task("cv_bench_test", "overall", "cv_bench_test")
	    robo_spatial_home_all = Task("robo_spatial_home_all", "overall", "robo_spatial_home_all")
	    embspatial_bench = Task("embspatial_bench", "overall", "embspatial_bench")
	    all_angles_bench = Task("all_angles_bench", "overall", "all_angles_bench")
	    vsi_bench_tiny = Task("vsi_bench_tiny", "overall", "vsi_bench_tiny")
	    SAT = Task("SAT", "overall", "SAT")
	    egoplan_bench2 = Task("egoplan_bench2", "overall", "egoplan_bench2")
	    erqa = Task("erqa", "overall", "erqa")

	class Quotas(Enum):
	    """Capability-level entries: Perception, SpatialReasoning, Prediction, Planning.

	    Members use the same ``Task`` triple as ``Tasks`` (JSON task key, JSON
	    metric key, display name).
	    NOTE(review): presumably these are aggregate scores over groups of
	    benchmarks; how they are computed is not visible here - confirm.
	    """

	    Perception = Task("Perception", "overall", "Perception")
	    SpatialReasoning = Task("SpatialReasoning", "overall", "SpatialReasoning")
	    Prediction = Task("Prediction", "overall", "Prediction")
	    Planning = Task("Planning", "overall", "Planning")

	NUM_FEWSHOT = 0 # Few-shot count; change this to match your few-shot setting
	# ---------------------------------------------------



	# Your leaderboard name, written as an HTML <h1> heading.
	# NOTE(review): the heading reads "Open FlagEval-VLM Leaderboard" while
	# INTRODUCTION_TEXT describes "FlagEval-Embodied Verse" - confirm the intended title.
	TITLE = """<h1 align="center" id="space-title">Open FlagEval-VLM Leaderboard</h1>"""

	# What does your leaderboard evaluate?

	# Bilingual (Chinese, then English) welcome text introducing FlagEval-Embodied Verse.
	INTRODUCTION_TEXT = """
	欢迎使用FlagEval-Embodied Verse！
	FlagEval-Embodied Verse 旨在通过FlagEval具身工具链跟踪、排名和评估具身大模型（Embodied model），其中FlagEvalMM提供了多模态评估架构，Embodied Verse构建了一种基于具身智能高质量评测数据集的能力体系，Leaderboard则通过榜单实时跟踪并呈现不同具身大模型综合能力。

	Welcome to the FlagEval-Embodied Verse!
	FlagEval-Embodied Verse aims to track, rank, and evaluate embodied large models (Embodied models) through the FlagEval embodied toolchain.
	FlagEvalMM provides a multimodal evaluation framework, while Embodied Verse builds a capability system based on high-quality evaluation datasets for embodied intelligence. The Leaderboard tracks and presents the comprehensive capabilities of different embodied large models in real time through a leaderboard.
	"""
	# Which evaluations are you running? how can people reproduce what you have?
	# Bilingual Markdown describing the goal, context and tooling of the leaderboard.
	# Kept as a plain string (not an f-string): the text has no placeholders, and a
	# plain literal stays safe if braces are ever added to the Markdown.
	LLM_BENCHMARKS_TEXT = """

	# The Goal of FlagEval - Embodied Verse

	感谢您积极的参与评测，在未来，我们会持续推动 FlagEval - Embodied Verse 更加完善，维护生态开放，欢迎开发者参与评测方法、工具和数据集的探讨，让我们一起建设更加科学、开放的具身评测工具链。

	Thanks for your active participation in the evaluation. In the future, we will continue to promote FlagEval - Embodied Verse to be more perfect and maintain the openness of the ecosystem, and we welcome developers to participate in the discussion of evaluation methodology, tools and datasets, so that we can build a more scientific and open embodied evaluation toolchain together.

	# Context

	FlagEval-Embodied Verse是科学、全面的具身评测工具链，具体包括FlagEvalMM多模态评估框架、Embodied Verse具身智能高质量评测数据集以及Leaderboard具身模型能力可视化榜单。我们希望能够推动更加开放的生态，让具身智能大模型开发者参与进来，为推动具身智能大模型进步做出相应的贡献。为了实现公平性的目标，所有模型都在 FlagEvalMM框架下使用标准化 GPU 和统一环境进行评估，以确保公平性。

	FlagEval-Embodied Verse is a scientific and comprehensive embodied evaluation toolchain, which specifically includes the FlagEvalMM multimodal evaluation framework, the Embodied Verse high-quality embodied intelligence evaluation dataset, and the Leaderboard for visualizing the capabilities of embodied models.

	We hope to promote a more open ecosystem for embodied model developers to participate and contribute accordingly to the advancement of embodied models. To achieve the goal of fairness, all models are evaluated under the FlagEvalMM framework using standardized GPUs and a unified environment to ensure fairness.

	# How it works

	## Embodied verse tool - FlagEvalMM
	FlagEvalMM是一个开源评估框架，旨在全面评估多模态模型，其提供了一种标准化的方法来评估跨各种任务和指标使用多种模式（文本、图像、视频）的模型。

	- 灵活的架构：支持多个多模态模型和评估任务，包括VQA、图像检索、文本到图像等。
	- 全面的基准与度量：支持最新的和常用的基准和度量。
	- 广泛的模型支持：model_zoo为广泛流行的多模态模型（包括QWenVL和LLaVA）提供了推理支持。此外，它还提供了与基于API的模型（如GPT、Claude和HuanYuan）的无缝集成。
	- 可扩展的设计：易于扩展，可合并新的模型、基准和评估指标。

	FlagEvalMM is an open-source evaluation framework designed to comprehensively assess multimodal models. It provides a standardized way to evaluate models that work with multiple modalities (text, images, video) across various tasks and metrics.

	- Flexible Architecture: Support for multiple multimodal models and evaluation tasks, including: VQA, image retrieval, text-to-image, etc.
	- Comprehensive Benchmarks and Metrics: Support new and commonly used benchmarks and metrics.
	- Extensive Model Support: The model_zoo provides inference support for a wide range of popular multimodal models including QWenVL and LLaVA. Additionally, it offers seamless integration with API-based models such as GPT, Claude, and HuanYuan.
	- Extensible Design: Easily extendable to incorporate new models, benchmarks, and evaluation metrics.

	# Embodied verse

	## Details and logs
	You can find:
	- detailed numerical results in the results Hugging Face dataset: https://huggingface.co/datasets/open-cn-llm-leaderboard/EmbodiedVerse_results
	- community queries and running status in the requests Hugging Face dataset: https://huggingface.co/datasets/open-cn-llm-leaderboard/EmbodiedVerse_requests

	## Useful links
	- [FlagEvalMM](https://github.com/flageval-baai/FlagEvalMM)
	- [FlagEval](https://flageval.baai.ac.cn/#/home)
	- [VLM Leaderboard](https://huggingface.co/spaces/BAAI/open_flageval_vlm_leaderboard)

	"""

	# Markdown describing the evaluation queue and how to integrate a model
	# (API-based or custom adapter + run.sh).
	# The backslashes in the run.sh example are doubled on purpose: in a normal
	# (non-raw) string a backslash followed by a newline is removed by Python, which
	# would drop the shell line continuations from the displayed script.
	EVALUATION_QUEUE_TEXT = """
	## Evaluation Queue for the FlagEval VLM Leaderboard
	Models added here will be automatically evaluated on the FlagEval cluster.

	Currently, we offer two methods for model evaluation, including API calls and private deployments:
	1. If you choose to evaluate via API call, you need to provide the Model interface, name and corresponding API key.
	2. If you choose to do open source model evaluation directly through huggingface, you don't need to fill in the Model online api url and Model online api key.

	## Open API model Integration Documentation

	For models accessed via API calls (such as OpenAI API, Anthropic API, etc.), the integration process is straightforward and only requires providing necessary configuration information.
	1. model_name: Name of the model to use
	2. api_key: API access key
	3. api_base: Base URL for the API service

	## Adding a Custom Model to the Platform

	This guide explains how to integrate your custom model into the platform by implementing a model adapter and run.sh script. We'll use the Qwen-VL implementation as a reference example.

	### Overview

	To add your custom model, you need to:
	1. Create a custom dataset class
	2. Implement a model adapter class
	3. Set up the initialization and inference pipeline

	### Step-by-Step Implementation

	Here is an example:[model_adapter.py](https://github.com/flageval-baai/FlagEvalMM/blob/main/model_zoo/vlm/qwen_vl/model_adapter.py)

	#### 1. Create Preprocess Custom Dataset Class

	Inherit from `ServerDataset` to handle data loading:
	```python
	# model_adapter.py
	class CustomDataset(ServerDataset):
	    def __getitem__(self, index):
	        data = self.get_data(index)
	        question_id = data["question_id"]
	        img_path = data["img_path"]
	        qs = data["question"]
	        qs, idx = process_images_symbol(qs)
	        idx = set(idx)
	        img_path_idx = []
	        for i in idx:
	            if i < len(img_path):
	                img_path_idx.append(img_path[i])
	            else:
	                print("[warning] image index out of range")
	        return question_id, img_path_idx, qs
	```

	The function `get_data` returns a structure like this:
	```python
	{
	    "img_path": "A list where each element is an absolute path to an image that can be read directly using PIL, cv2, etc.",
	    "question": "A string containing the question, where image positions are marked with <image1> <image2>",
	    "question_id": "question_id",
	    "type": "A string indicating the type of question"
	}
	```

	#### 2. Implement Model Adapter

	Inherit from `BaseModelAdapter` and implement the required methods:
	1. model_init: is responsible for model initialization and serves as the entry point for model loading and setup.
	2. run_one_task: implements the inference pipeline, handling data processing and result generation for a single evaluation task.
	```python
	# model_adapter.py
	class ModelAdapter(BaseModelAdapter):
	    def model_init(self, task_info: Dict):
	        ckpt_path = task_info["model_path"]
	        '''
	        Initialize the model and processor here.
	        Load your pre-trained model and any required processing tools using the provided checkpoint path.
	        '''

	    def run_one_task(self, task_name: str, meta_info: Dict[str, Any]):
	        results = []
	        cnt = 0

	        data_loader = self.create_data_loader(
	            CustomDataset, task_name, batch_size=1, num_workers=0
	        )

	        for question_id, img_path, qs in data_loader:

	            '''
	            Perform model inference here.
	            Use the model to generate the 'answer' variable for the given inputs (e.g., question_id, image path, question).
	            '''

	            results.append(
	                {"question_id": question_id, "answer": answer}
	            )

	        self.save_result(results, meta_info, rank=rank)
	        '''
	        Save the inference results.
	        Use the provided meta_info and rank parameters to manage result storage as needed.
	        '''
	```
	Note:
	`results` is a list of dictionaries
	Each dictionary must contain two keys:
	```python
	question_id: identifies the specific question
	answer: contains the model's prediction/output
	```
	After collecting all results, they are saved using `save_result()`

	#### 3. Launch Script (run.sh)
	run.sh is the entry script for launching model evaluation, used to set environment variables and start the evaluation program.

	```bash
	#!/bin/bash
	current_file="$0"
	current_dir="$(dirname "$current_file")"
	SERVER_IP=$1
	SERVER_PORT=$2
	PYTHONPATH=$current_dir:$PYTHONPATH python $current_dir/model_adapter.py \\
	    --server_ip $SERVER_IP \\
	    --server_port $SERVER_PORT \\
	    "${@:3}"
	```

	"""

	# Label text for the citation snippet.
	CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
	# Citation snippet itself, kept as a raw string.
	# NOTE(review): the text is currently blank (whitespace only) - add the
	# citation entry or confirm that leaving it empty is intended.
	CITATION_BUTTON_TEXT = r"""
	"""