import logging
from pathlib import Path
from typing import Dict, List, Tuple
logging.basicConfig(level=logging.INFO)
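
# Expected assets layout, inferred from the path checks below (illustrative only;
# the concrete benchmark, model, category, and CFG names are placeholders):
#
#   assets/<benchmark_name>/dataset/<category>/<video>.mp4
#   assets/<benchmark_name>/CFG/<cfg_prompt>.json
#   assets/<benchmark_name>/models/<model_name>/CFG/<cfg_prompt>/metric/overall_metrics.json
#   assets/<benchmark_name>/models/<model_name>/vector/video/**/<video>.npy
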
class BenchChecker:
def __init__(self, base_path: str):
"""Initialize BenchChecker with base assets path.
Args:
base_path (str): Base path to assets directory containing benchmark folders
"""
self.base_path = Path(base_path)
self.logger = logging.getLogger(__name__)
def check_benchmark_exists(self, benchmark_name: str) -> bool:
"""Check if benchmark folder exists."""
benchmark_path = self.base_path / benchmark_name
exists = benchmark_path.exists() and benchmark_path.is_dir()
if exists:
self.logger.info(f"Found benchmark directory: {benchmark_name}")
else:
self.logger.error(f"Benchmark directory not found: {benchmark_name}")
return exists
def get_video_list(self, benchmark_name: str) -> List[str]:
"""Get list of videos from benchmark's dataset directory. Return empty list if no videos found."""
dataset_path = self.base_path / benchmark_name / "dataset"
videos = []
        if not dataset_path.exists():
            self.logger.warning(f"Dataset directory not found for {benchmark_name}")
            return videos  # return an empty list
        # Collect .mp4 files from each category directory (dataset/<category>/*.mp4)
for category in dataset_path.glob("*"):
if category.is_dir():
for video_file in category.glob("*.mp4"):
videos.append(video_file.stem)
self.logger.info(f"Found {len(videos)} videos in {benchmark_name} dataset")
return videos
def check_model_exists(self, benchmark_name: str, model_name: str) -> bool:
"""Check if model directory exists in benchmark's models directory."""
model_path = self.base_path / benchmark_name / "models" / model_name
exists = model_path.exists() and model_path.is_dir()
if exists:
self.logger.info(f"Found model directory: {model_name}")
else:
self.logger.error(f"Model directory not found: {model_name}")
return exists
def check_cfg_files(self, benchmark_name: str, model_name: str, cfg_prompt: str) -> Tuple[bool, bool]:
"""Check if CFG files/directories exist in both benchmark and model directories."""
# Check benchmark CFG json
benchmark_cfg = self.base_path / benchmark_name / "CFG" / f"{cfg_prompt}.json"
benchmark_cfg_exists = benchmark_cfg.exists() and benchmark_cfg.is_file()
# Check model CFG directory
model_cfg = self.base_path / benchmark_name / "models" / model_name / "CFG" / cfg_prompt
model_cfg_exists = model_cfg.exists() and model_cfg.is_dir()
if benchmark_cfg_exists:
self.logger.info(f"Found benchmark CFG file: {cfg_prompt}.json")
else:
self.logger.error(f"Benchmark CFG file not found: {cfg_prompt}.json")
if model_cfg_exists:
self.logger.info(f"Found model CFG directory: {cfg_prompt}")
else:
self.logger.error(f"Model CFG directory not found: {cfg_prompt}")
return benchmark_cfg_exists, model_cfg_exists
def check_vector_files(self, benchmark_name: str, model_name: str, video_list: List[str]) -> bool:
"""Check if video vectors match with dataset."""
vector_path = self.base_path / benchmark_name / "models" / model_name / "vector" / "video"
        # A dataset with no videos can never pass this check
        if not video_list:
            self.logger.error("No videos found in dataset - cannot proceed")
            return False
        # Make sure the vector directory exists
        if not vector_path.exists():
            self.logger.error("Vector directory doesn't exist")
            return False
        # Collect vector file stems, searching recursively since vectors may sit in subdirectories
        vector_files = [f.stem for f in vector_path.rglob("*.npy")]
missing_vectors = set(video_list) - set(vector_files)
extra_vectors = set(vector_files) - set(video_list)
if missing_vectors:
self.logger.error(f"Missing vectors for videos: {missing_vectors}")
return False
if extra_vectors:
self.logger.error(f"Extra vectors found: {extra_vectors}")
return False
self.logger.info(f"Vector status: videos={len(video_list)}, vectors={len(vector_files)}")
return len(video_list) == len(vector_files)
def check_metrics_file(self, benchmark_name: str, model_name: str, cfg_prompt: str) -> bool:
"""Check if overall_metrics.json exists in the model's CFG/metrics directory."""
metrics_path = self.base_path / benchmark_name / "models" / model_name / "CFG" / cfg_prompt / "metric" / "overall_metrics.json"
exists = metrics_path.exists() and metrics_path.is_file()
if exists:
self.logger.info(f"Found overall metrics file for {model_name}")
else:
self.logger.error(f"Overall metrics file not found for {model_name}")
return exists
def check_benchmark(self, benchmark_name: str, model_name: str, cfg_prompt: str) -> Dict[str, bool]:
"""
Perform all benchmark checks and return status.
"""
status = {
'benchmark_exists': False,
'model_exists': False,
'cfg_files_exist': False,
'vectors_match': False,
'metrics_exist': False
}
# Check benchmark directory
status['benchmark_exists'] = self.check_benchmark_exists(benchmark_name)
if not status['benchmark_exists']:
return status
# Get video list
video_list = self.get_video_list(benchmark_name)
# Check model directory
status['model_exists'] = self.check_model_exists(benchmark_name, model_name)
if not status['model_exists']:
return status
# Check CFG files
benchmark_cfg, model_cfg = self.check_cfg_files(benchmark_name, model_name, cfg_prompt)
status['cfg_files_exist'] = benchmark_cfg and model_cfg
if not status['cfg_files_exist']:
return status
# Check vectors
status['vectors_match'] = self.check_vector_files(benchmark_name, model_name, video_list)
# Check metrics file (only if vectors match)
if status['vectors_match']:
status['metrics_exist'] = self.check_metrics_file(benchmark_name, model_name, cfg_prompt)
return status
def get_benchmark_status(self, check_status: Dict[str, bool]) -> str:
"""Determine which execution path to take based on check results."""
basic_checks = ['benchmark_exists', 'model_exists', 'cfg_files_exist']
if not all(check_status[check] for check in basic_checks):
return "cannot_execute"
if check_status['vectors_match'] and check_status['metrics_exist']:
return "all_passed"
elif not check_status['vectors_match']:
return "no_vectors"
else: # vectors exist but no metrics
return "no_metrics"
# Example usage
if __name__ == "__main__":
bench_checker = BenchChecker("assets")
status = bench_checker.check_benchmark(
benchmark_name="huggingface_benchmarks_dataset",
model_name="MSRVTT",
cfg_prompt="topk"
)
execution_path = bench_checker.get_benchmark_status(status)
print(f"Checks completed. Execution path: {execution_path}")
print(f"Status: {status}")