napolab / extract_portuguese_leaderboard.py
ruanchaves's picture
Upload 14 files
2c482cc verified
#!/usr/bin/env python3
"""
Script to extract data from JSON files in a repository folder
and save it as a CSV file for import into the benchmark.
"""
import pandas as pd
import json
import os
import sys
import argparse
from pathlib import Path
def is_valid_json_file(file_path):
    """
    Check if a file is a valid JSON file containing a dict.

    The check never raises for an unreadable or malformed file; any such
    problem is reported as ``False`` so that callers can use this function
    as a plain predicate while scanning a directory tree.

    Args:
        file_path (str or Path): Path to the JSON file

    Returns:
        bool: True if the file can be opened, decoded as UTF-8 and parsed as
            JSON whose top-level value is a dict, False otherwise
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError, RecursionError):
        # OSError: missing file, a directory that happens to be named
        #   "*.json", permission problems, broken symlinks, ...
        # ValueError: base class of json.JSONDecodeError and
        #   UnicodeDecodeError.
        # RecursionError: pathologically deep nesting in the document.
        return False
    return isinstance(data, dict)
def find_json_files(repo_path):
    """
    Recursively find all JSON files in the repository folder.

    Only regular files whose name matches ``*.json`` and which pass
    ``is_valid_json_file`` are returned. Progress and error messages are
    printed to stdout.

    Args:
        repo_path (str): Path to the repository folder

    Returns:
        list: Sorted list of ``Path`` objects pointing to valid JSON files;
            an empty list if the path does not exist or is not a directory
    """
    repo_path = Path(repo_path)
    if not repo_path.exists():
        print(f"Error: Repository path '{repo_path}' does not exist.")
        return []
    if not repo_path.is_dir():
        print(f"Error: Repository path '{repo_path}' is not a directory.")
        return []

    print(f"Scanning repository: {repo_path}")
    json_files = []
    # rglob yields entries in filesystem order; sorting makes the processing
    # order (and therefore the row order of the resulting CSV) reproducible.
    for file_path in sorted(repo_path.rglob("*.json")):
        # The glob pattern also matches directories named "*.json"; skip
        # anything that is not a regular file before trying to open it.
        if file_path.is_file() and is_valid_json_file(file_path):
            json_files.append(file_path)
            print(f"Found valid JSON file: {file_path}")
    print(f"Total valid JSON files found: {len(json_files)}")
    return json_files
def extract_data_from_json(json_file_path):
    """
    Build one leaderboard row from a single evaluation JSON file.

    The file must contain the top-level keys 'config_general' and 'results'.
    Model fields are read from 'config_general' and the four task scores
    from results['all_grouped']; absent fields fall back to '', False, 0
    and 0.0 respectively.

    Args:
        json_file_path (Path): Path to the JSON file

    Returns:
        dict or None: The row as a dict, or None when a required top-level
            key is missing or any error occurs while reading the file (the
            error is printed)
    """
    score_keys = ('assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive')
    try:
        with open(json_file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        # Both sections are mandatory; bail out quietly if either is absent.
        if not ('config_general' in payload and 'results' in payload):
            return None

        general = payload['config_general']
        outcome = payload['results']

        # Model information comes first so the column order stays stable.
        record = {
            'json_file': str(json_file_path),
            'model_name': general.get('model_name', ''),
            'model_private': general.get('model_private', False),
            'model_num_parameters': general.get('model_num_parameters', 0),
        }

        # Per-task scores, defaulting to 0.0 when a task is not reported.
        grouped = outcome.get('all_grouped', {})
        for key in score_keys:
            record[key] = grouped.get(key, 0.0)

        return record
    except Exception as err:
        print(f"Error processing {json_file_path}: {err}")
        return None
def extract_portuguese_leaderboard(repo_path, output_file='portuguese_leaderboard.csv'):
    """
    Extract data from JSON files in the repository folder and save as CSV.

    Every valid JSON file found by ``find_json_files`` is passed to
    ``extract_data_from_json``; the rows that could be extracted are written
    to a CSV file and a short summary (preview, private/public counts,
    average scores, DataFrame info) is printed to stdout. If no JSON files
    are found, or no rows can be extracted, a message is printed and nothing
    is written.

    Args:
        repo_path (str): Path to the repository folder
        output_file (str): Path of the CSV file to write. Defaults to
            'portuguese_leaderboard.csv' in the current working directory.

    Returns:
        None
    """
    print("Scanning repository for JSON files...")

    # Find all JSON files
    json_files = find_json_files(repo_path)
    if not json_files:
        print("No valid JSON files found in the repository.")
        return

    # Process each JSON file, keeping only the rows that could be extracted
    data = []
    total = len(json_files)
    for position, json_file in enumerate(json_files, start=1):
        print(f"Processing file {position}/{total}: {json_file.name}")
        row_data = extract_data_from_json(json_file)
        if row_data:
            data.append(row_data)
        # Print progress every 10 files
        if position % 10 == 0:
            print(f" Processed {position} files...")

    if not data:
        print("No valid data extracted from JSON files.")
        return

    # Create DataFrame and write it to CSV
    df = pd.DataFrame(data)
    df.to_csv(output_file, index=False)
    print(f"\nSuccessfully extracted {len(df)} models to {output_file}")

    # Show first few entries as preview
    print("\nFirst 5 entries:")
    print(df.head().to_string(index=False))

    # Show some statistics. `data` is non-empty here, so `df` has rows.
    # 'model_private' may hold JSON nulls, which makes the column object
    # dtype; `~` on such a column raises (or yields -1/-2 for Python bools),
    # so build an explicit boolean mask first.
    private_mask = df['model_private'].eq(True)
    print("\nStatistics:")
    print(f"Total models: {len(df)}")
    print(f"Private models: {private_mask.sum()}")
    print(f"Public models: {(~private_mask).sum()}")

    # Average scores
    print("\nAverage scores:")
    print(df[['assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']].mean().round(2))

    # Show data types and info. DataFrame.info() prints its report itself and
    # returns None, so it must not be wrapped in print().
    print("\nDataFrame info:")
    df.info()
def main():
    """Parse the command line, run the extraction and report the outcome.

    Expects one positional argument, ``repo_path``. Any exception raised by
    the extraction is printed and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(
        description='Extract Portuguese LLM Leaderboard data from JSON files'
    )
    cli.add_argument(
        'repo_path',
        help='Path to the repository folder containing JSON files',
    )
    options = cli.parse_args()

    # Banner: title followed by a 50-character rule.
    for banner_line in ("Portuguese LLM Leaderboard Data Extractor", "=" * 50):
        print(banner_line)

    try:
        extract_portuguese_leaderboard(options.repo_path)
        print("\nExtraction completed successfully!")
    except Exception as failure:
        print(f"Error during extraction: {failure}")
        raise SystemExit(1)
# Run the extractor only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()