import json
import os

import meeteval.io
import pandas as pd

# Local project modules: WER computation helpers and text normalization.
from utils import calc_wer, aggregate_wer_metrics
from txt_norm import get_text_norm

# Constants
REFERENCE_BASE_PATH = os.path.abspath("references")  # e.g. ./references/single_channel/dataset1.json
TASKS_METADATA_PATH = os.path.abspath("tasks_metadata.json")

def list_files(startpath):
    # Debug helper: print the directory tree under startpath.
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * level
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))


# Dump the submissions directory tree at startup (debug output).
list_files("/data/submissions")

class LeaderboardServer:
    def __init__(self):
        self.local_leaderboard = os.path.abspath("/data/submissions")
        self.submission_id_to_file = {}  # Maps model_id to filepath
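        # tasks_metadata.json is read only for its "tasks" key below; an assumed
        # minimal shape (hypothetical example, keyed by task name):
        #   {"tasks": {"single_channel": {...}, ...}}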
        with open(TASKS_METADATA_PATH) as f:
            self.tasks_metadata = json.load(f)["tasks"]
        self.submission_ids = set()
        self.results_file = os.path.join(self.local_leaderboard, "results.json")
        os.makedirs(self.local_leaderboard, exist_ok=True)
        self.fetch_existing_models()
        self.text_normalizer = get_text_norm("whisper_nsf")

    def fetch_existing_models(self):
        # Rebuild the in-memory submission index from results.json on disk.
        self.submission_id_to_file.clear()
        self.submission_ids.clear()
        results_path = os.path.join(self.local_leaderboard, "results.json")
        if not os.path.exists(results_path):
            return
        with open(results_path) as f:
            results = json.load(f)
        for model_id in results:
            self.submission_ids.add(model_id)
            # Hypothesis files live next to results.json as {model_id}_hyp.json.
            hyp_path = os.path.join(self.local_leaderboard, f"{model_id}_hyp.json")
            self.submission_id_to_file[model_id] = hyp_path

    def prepare_model_for_submission(self, file, metadata, task, datasets):
        submitted_by = metadata["submitted_by"]
        model_id = metadata["model_id"]

        # Run WER eval
        results = {}
        hyp_seglst = meeteval.io.load(file)
        for dataset in datasets:
            ref_path = os.path.join(REFERENCE_BASE_PATH, task, f"{dataset}.json")
            ref_seglst = meeteval.io.load(ref_path)
            # Keep only the hypothesis segments for this dataset's sessions.
            sessions = ref_seglst.unique('session_id')
            local_hyps = hyp_seglst.filter(lambda seg: seg['session_id'] in sessions)
            # Normalize reference and hypothesis text identically before scoring.
            ref_seglst = ref_seglst.map(lambda seg: {**seg, "words": self.text_normalizer(seg["words"])})
            local_hyps = local_hyps.map(lambda seg: {**seg, "words": self.text_normalizer(seg["words"])})
            per_session_wers = calc_wer(tcp_hyp_seglst=local_hyps, ref_seglst=ref_seglst, collar=5, metrics_list=["tcp_wer"])
            metrics = aggregate_wer_metrics(per_session_wers, ["tcp_wer"])
            results[dataset] = metrics

        # Update results file
        results_path = os.path.join(self.local_leaderboard, "results.json")
        if os.path.exists(results_path):
            with open(results_path) as f:
                all_results = json.load(f)
        else:
            all_results = {}
        all_results[model_id] = {
            "submitted_by": submitted_by,
            "results": results
        }
        with open(results_path, "w") as f:
            json.dump(all_results, f, indent=2)
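        # Resulting results.json layout (as written above; each per-dataset
        # metrics dict is assumed to carry a "tcp_wer" key, which is what
        # get_leaderboard reads):
        #   {"<model_id>": {"submitted_by": "...",
        #                   "results": {"<dataset>": {"tcp_wer": ...}}}}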

    def update_leaderboard(self):
        self.fetch_existing_models()

    def get_leaderboard(self):
        results_path = os.path.join(self.local_leaderboard, "results.json")
        if not os.path.exists(results_path):
            return pd.DataFrame(columns=["No submissions yet"])
        with open(results_path) as f:
            results = json.load(f)
        rows = []
        for model_id, content in results.items():
            row = {"Model ID": model_id, "Submitted by": content["submitted_by"]}
            for k, v in content["results"].items():
                row[k] = v.get("tcp_wer", None)
            rows.append(row)
        return pd.DataFrame(rows)
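

# Minimal usage sketch (assumptions: "my_model_hyp.json", "my_model",
# "single_channel", and "dataset1" are hypothetical values; the submission
# file is a meeteval-compatible SegLST JSON with session_id and words fields):
if __name__ == "__main__":
    server = LeaderboardServer()
    server.prepare_model_for_submission(
        "my_model_hyp.json",                             # hypothetical submission file
        {"submitted_by": "me", "model_id": "my_model"},  # metadata dict, keys used above
        "single_channel",                                # task, mirrors references/<task>/
        ["dataset1"],                                    # datasets to score against
    )
    server.update_leaderboard()
    print(server.get_leaderboard())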