import logging
from pathlib import Path
from typing import Dict, List, Tuple

logging.basicConfig(level=logging.INFO)
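
# The checks below assume this assets layout (inferred from the path joins
# in the methods; an assumption, not a documented spec):
#
#   <base_path>/
#       <benchmark_name>/
#           dataset/<category>/*.mp4
#           CFG/<cfg_prompt>.json
#           models/
#               <model_name>/
#                   CFG/<cfg_prompt>/metric/overall_metrics.json
#                   vector/video/**/*.npy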

class BenchChecker:
    def __init__(self, base_path: str):
        """Initialize BenchChecker with the base assets path.

        Args:
            base_path (str): Base path to the assets directory containing benchmark folders.
        """
        self.base_path = Path(base_path)
        self.logger = logging.getLogger(__name__)

    def check_benchmark_exists(self, benchmark_name: str) -> bool:
        """Check whether the benchmark folder exists."""
        benchmark_path = self.base_path / benchmark_name
        exists = benchmark_path.exists() and benchmark_path.is_dir()
        if exists:
            self.logger.info(f"Found benchmark directory: {benchmark_name}")
        else:
            self.logger.error(f"Benchmark directory not found: {benchmark_name}")
        return exists

    def get_video_list(self, benchmark_name: str) -> List[str]:
        """Get the list of videos from the benchmark's dataset directory.

        Returns an empty list if the dataset directory is missing or no videos are found.
        """
        dataset_path = self.base_path / benchmark_name / "dataset"
        videos = []
        if not dataset_path.exists():
            self.logger.error(f"Dataset directory not found for {benchmark_name}")
            return videos  # Return an empty list
        # Collect .mp4 files one level down: dataset/<category>/*.mp4
        for category in dataset_path.glob("*"):
            if category.is_dir():
                for video_file in category.glob("*.mp4"):
                    videos.append(video_file.stem)
        self.logger.info(f"Found {len(videos)} videos in {benchmark_name} dataset")
        return videos

    def check_model_exists(self, benchmark_name: str, model_name: str) -> bool:
        """Check whether the model directory exists under the benchmark's models directory."""
        model_path = self.base_path / benchmark_name / "models" / model_name
        exists = model_path.exists() and model_path.is_dir()
        if exists:
            self.logger.info(f"Found model directory: {model_name}")
        else:
            self.logger.error(f"Model directory not found: {model_name}")
        return exists

    def check_cfg_files(self, benchmark_name: str, model_name: str, cfg_prompt: str) -> Tuple[bool, bool]:
        """Check that the CFG file/directory exists in both the benchmark and model directories."""
        # Check the benchmark-level CFG JSON file
        benchmark_cfg = self.base_path / benchmark_name / "CFG" / f"{cfg_prompt}.json"
        benchmark_cfg_exists = benchmark_cfg.exists() and benchmark_cfg.is_file()
        # Check the model-level CFG directory
        model_cfg = self.base_path / benchmark_name / "models" / model_name / "CFG" / cfg_prompt
        model_cfg_exists = model_cfg.exists() and model_cfg.is_dir()
        if benchmark_cfg_exists:
            self.logger.info(f"Found benchmark CFG file: {cfg_prompt}.json")
        else:
            self.logger.error(f"Benchmark CFG file not found: {cfg_prompt}.json")
        if model_cfg_exists:
            self.logger.info(f"Found model CFG directory: {cfg_prompt}")
        else:
            self.logger.error(f"Model CFG directory not found: {cfg_prompt}")
        return benchmark_cfg_exists, model_cfg_exists

    def check_vector_files(self, benchmark_name: str, model_name: str, video_list: List[str]) -> bool:
        """Check that the stored video vectors match the dataset's video list."""
        vector_path = self.base_path / benchmark_name / "models" / model_name / "vector" / "video"
        # An empty video list always fails the check
        if not video_list:
            self.logger.error("No videos found in dataset - cannot proceed")
            return False
        # Make sure the vector directory exists
        if not vector_path.exists():
            self.logger.error("Vector directory doesn't exist")
            return False
        # Collect vector file stems recursively, covering nested subdirectories
        vector_files = [f.stem for f in vector_path.rglob("*.npy")]
        missing_vectors = set(video_list) - set(vector_files)
        extra_vectors = set(vector_files) - set(video_list)
        if missing_vectors:
            self.logger.error(f"Missing vectors for videos: {missing_vectors}")
            return False
        if extra_vectors:
            self.logger.error(f"Extra vectors found: {extra_vectors}")
            return False
        self.logger.info(f"Vector status: videos={len(video_list)}, vectors={len(vector_files)}")
        # Equal sets can still hide duplicate stems, so compare counts as a final guard
        return len(video_list) == len(vector_files)
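
    # Worked example for check_vector_files (hypothetical names):
    #   video_list = ["v1", "v2"] and .npy stems on disk = ["v1", "v3"]
    #   gives missing_vectors == {"v2"} and extra_vectors == {"v3"};
    #   the missing set is reported first and the check returns False.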

    def check_metrics_file(self, benchmark_name: str, model_name: str, cfg_prompt: str) -> bool:
        """Check that overall_metrics.json exists in the model's CFG/<cfg_prompt>/metric directory."""
        metrics_path = (
            self.base_path / benchmark_name / "models" / model_name
            / "CFG" / cfg_prompt / "metric" / "overall_metrics.json"
        )
        exists = metrics_path.exists() and metrics_path.is_file()
        if exists:
            self.logger.info(f"Found overall metrics file for {model_name}")
        else:
            self.logger.error(f"Overall metrics file not found for {model_name}")
        return exists

    def check_benchmark(self, benchmark_name: str, model_name: str, cfg_prompt: str) -> Dict[str, bool]:
        """Perform all benchmark checks and return their status."""
        status = {
            'benchmark_exists': False,
            'model_exists': False,
            'cfg_files_exist': False,
            'vectors_match': False,
            'metrics_exist': False,
        }
        # Check the benchmark directory
        status['benchmark_exists'] = self.check_benchmark_exists(benchmark_name)
        if not status['benchmark_exists']:
            return status
        # Get the video list
        video_list = self.get_video_list(benchmark_name)
        # Check the model directory
        status['model_exists'] = self.check_model_exists(benchmark_name, model_name)
        if not status['model_exists']:
            return status
        # Check the CFG files
        benchmark_cfg, model_cfg = self.check_cfg_files(benchmark_name, model_name, cfg_prompt)
        status['cfg_files_exist'] = benchmark_cfg and model_cfg
        if not status['cfg_files_exist']:
            return status
        # Check the vectors
        status['vectors_match'] = self.check_vector_files(benchmark_name, model_name, video_list)
        # Check the metrics file (only if the vectors match)
        if status['vectors_match']:
            status['metrics_exist'] = self.check_metrics_file(benchmark_name, model_name, cfg_prompt)
        return status

    def get_benchmark_status(self, check_status: Dict[str, bool]) -> str:
        """Determine which execution path to take based on the check results."""
        basic_checks = ['benchmark_exists', 'model_exists', 'cfg_files_exist']
        if not all(check_status[check] for check in basic_checks):
            return "cannot_execute"
        if check_status['vectors_match'] and check_status['metrics_exist']:
            return "all_passed"
        elif not check_status['vectors_match']:
            return "no_vectors"
        else:  # vectors exist but no metrics
            return "no_metrics"


# Example usage
if __name__ == "__main__":
    bench_checker = BenchChecker("assets")
    status = bench_checker.check_benchmark(
        benchmark_name="huggingface_benchmarks_dataset",
        model_name="MSRVTT",
        cfg_prompt="topk",
    )
    execution_path = bench_checker.get_benchmark_status(status)
    print(f"Checks completed. Execution path: {execution_path}")
    print(f"Status: {status}")