Spaces:

BUT-FIT
/

EMMA_leaderboard

Running

File size: 3,982 Bytes

034cbdf
 
 
 
 
 
 
 
 
9f2e90d
 
518e529
034cbdf
518e529
 
 
 
 
 
 
 
 
034cbdf
 
 
b19aff6
034cbdf
9f2e90d
034cbdf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f2e90d
034cbdf

import json
import os

import meeteval.io
import pandas as pd
from utils import calc_wer, aggregate_wer_metrics
from txt_norm import get_text_norm

# Constants
REFERENCE_BASE_PATH = os.path.abspath("references")  # e.g. ./references/single_channel/dataset1.json
TASKS_METADATA_PATH = os.path.abspath("tasks_metadata.json")
import os

def list_files(startpath):
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))
list_files("/data/submissions")

class LeaderboardServer:
    def __init__(self):
        self.local_leaderboard = os.path.abspath("/data/submissions")
        self.submisssion_id_to_file = {}  # Maps model_id to filepath
        self.tasks_metadata = json.load(open(TASKS_METADATA_PATH))["tasks"]
        self.submission_ids = set()
        self.results_file = os.path.join(self.local_leaderboard, "results.json")
        os.makedirs(self.local_leaderboard, exist_ok=True)
        self.fetch_existing_models()
        self.text_normalizer = get_text_norm("whisper_nsf")

    def fetch_existing_models(self):
        self.submisssion_id_to_file.clear()
        self.submission_ids.clear()

        results_path = os.path.join(self.local_leaderboard, "results.json")
        if not os.path.exists(results_path):
            return
        with open(results_path) as f:
            results = json.load(f)
        for model_id in results:
            self.submission_ids.add(model_id)
            hyp_path = os.path.join(self.local_leaderboard, f"{model_id}_hyp.json")
            self.submisssion_id_to_file[model_id] = hyp_path

    def prepare_model_for_submission(self, file, metadata, task, datasets):
        submitted_by = metadata["submitted_by"]
        model_id = metadata["model_id"]

        # Run WER eval
        results = {}
        hyp_seglst = meeteval.io.load(file)

        for dataset in datasets:
            ref_path = os.path.join(REFERENCE_BASE_PATH, task, f"{dataset}.json")
            ref_seglst = meeteval.io.load(ref_path)
            sessions = ref_seglst.unique('session_id')
            local_hyps = hyp_seglst.filter(lambda seg: seg['session_id'] in sessions)
            ref_seglst = ref_seglst.map(lambda seg: {**seg, "words":self.text_normalizer(seg["words"])})
            local_hyps = local_hyps.map(lambda seg: {**seg, "words":self.text_normalizer(seg["words"])})
            per_session_wers = calc_wer(tcp_hyp_seglst=local_hyps, ref_seglst=ref_seglst, collar=5, metrics_list=["tcp_wer"])
            metrics = aggregate_wer_metrics(per_session_wers, ["tcp_wer"])
            results[dataset] = metrics

        # Update results file
        results_path = os.path.join(self.local_leaderboard, "results.json")
        if os.path.exists(results_path):
            with open(results_path) as f:
                all_results = json.load(f)
        else:
            all_results = {}

        all_results[model_id] = {
            "submitted_by": submitted_by,
            "results": results
        }

        with open(results_path, "w") as f:
            json.dump(all_results, f, indent=2)

    def update_leaderboard(self):
        self.fetch_existing_models()

    def get_leaderboard(self):
        results_path = os.path.join(self.local_leaderboard, "results.json")
        if not os.path.exists(results_path):
            return pd.DataFrame(columns=["No submissions yet"])

        with open(results_path) as f:
            results = json.load(f)

        rows = []
        for model_id, content in results.items():
            row = {"Model ID": model_id, "Submitted by": content["submitted_by"]}
            for k, v in content["results"].items():
                row[k] = v.get("tcp_wer", None)
            rows.append(row)

        return pd.DataFrame(rows)