| 
							 | 
						import glob | 
					
					
						
						| 
							 | 
						import json | 
					
					
						
						| 
							 | 
						import math | 
					
					
						
						| 
							 | 
						import os | 
					
					
						
						| 
							 | 
						import traceback | 
					
					
						
						| 
							 | 
						from dataclasses import dataclass | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						import dateutil | 
					
					
						
						| 
							 | 
						import numpy as np | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						from huggingface_hub import ModelCard | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						from src.display.formatting import make_clickable_model | 
					
					
						
						| 
							 | 
						from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, QuantType, WeightDtype, ComputeDtype  | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						@dataclass | 
					
					
						
						| 
							 | 
						class EvalResult: | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    eval_name: str  | 
					
					
						
						| 
							 | 
						    full_model: str  | 
					
					
						
						| 
							 | 
						    org: str  | 
					
					
						
						| 
							 | 
						    model: str | 
					
					
						
						| 
							 | 
						    revision: str  | 
					
					
						
						| 
							 | 
						    results: dict | 
					
					
						
						| 
							 | 
						    quant_type: QuantType = QuantType.Unknown | 
					
					
						
						| 
							 | 
						    precision: Precision = Precision.Unknown | 
					
					
						
						| 
							 | 
						    weight_dtype: WeightDtype = WeightDtype.Unknown | 
					
					
						
						| 
							 | 
						    compute_dtype: ComputeDtype = ComputeDtype.Unknown | 
					
					
						
						| 
							 | 
						    double_quant: bool = False  | 
					
					
						
						| 
							 | 
						    model_type: ModelType = ModelType.Unknown  | 
					
					
						
						| 
							 | 
						    weight_type: WeightType = WeightType.Original  | 
					
					
						
						| 
							 | 
						    architecture: str = "Unknown"  | 
					
					
						
						| 
							 | 
						    license: str = "?" | 
					
					
						
						| 
							 | 
						    likes: int = 0 | 
					
					
						
						| 
							 | 
						    num_params: int = 0 | 
					
					
						
						| 
							 | 
						    model_size: int = 0 | 
					
					
						
						| 
							 | 
						    group_size: int = -1 | 
					
					
						
						| 
							 | 
						    date: str = ""  | 
					
					
						
						| 
							 | 
						    still_on_hub: bool = True | 
					
					
						
						| 
							 | 
						    is_merge: bool = False | 
					
					
						
						| 
							 | 
						    flagged: bool = False | 
					
					
						
						| 
							 | 
						    status: str = "Finished" | 
					
					
						
						| 
							 | 
						    tags: list = None | 
					
					
						
						| 
							 | 
						    result_file: str = "" | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    @classmethod | 
					
					
						
						| 
							 | 
						    def init_from_json_file(self, json_filepath): | 
					
					
						
						| 
							 | 
						        """Inits the result from the specific model result file""" | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        result_file = "/".join(json_filepath.split("/")[2:]) | 
					
					
						
						| 
							 | 
						        with open(json_filepath) as fp: | 
					
					
						
						| 
							 | 
						            data = json.load(fp) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        config = data.get("config_general") | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        precision = Precision.from_str(config.get("precision", "4bit")) | 
					
					
						
						| 
							 | 
						        quant_type = QuantType.from_str(str(config.get("quant_type", "GPTQ"))) | 
					
					
						
						| 
							 | 
						        weight_dtype = WeightDtype.from_str(data["task_info"].get("weight_dtype", "int4")) | 
					
					
						
						| 
							 | 
						        compute_dtype = ComputeDtype.from_str(data["task_info"].get("compute_dtype", "bfloat16")) | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        model_params = round(float(config["model_params"]), 2) | 
					
					
						
						| 
							 | 
						        model_size = round(float(config["model_size"]), 2) | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if data.get("quantization_config", None): | 
					
					
						
						| 
							 | 
						            double_quant = data["quantization_config"].get("bnb_4bit_use_double_quant", False) | 
					
					
						
						| 
							 | 
						            group_size = data["quantization_config"].get("group_size", -1) | 
					
					
						
						| 
							 | 
						        else: | 
					
					
						
						| 
							 | 
						            double_quant = False | 
					
					
						
						| 
							 | 
						            group_size = -1 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        local = config.get("local", False) | 
					
					
						
						| 
							 | 
						        if not local: | 
					
					
						
						| 
							 | 
						            local = data["task_info"].get("local", False) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        org_and_model = config.get("model_name") | 
					
					
						
						| 
							 | 
						        org_and_model = org_and_model.split("/", 1) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if local and org_and_model[0] != "Intel": | 
					
					
						
						| 
							 | 
						            org_and_model = config.get("model_name").split("/") | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            org_and_model = ["local", org_and_model[-1]] | 
					
					
						
						| 
							 | 
						            quant_type = QuantType.autoround | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        if len(org_and_model) == 1: | 
					
					
						
						| 
							 | 
						            org = None | 
					
					
						
						| 
							 | 
						            model = org_and_model[0] | 
					
					
						
						| 
							 | 
						            result_key = f"{model}_{precision.value.name}" | 
					
					
						
						| 
							 | 
						        else: | 
					
					
						
						| 
							 | 
						            org = org_and_model[0] | 
					
					
						
						| 
							 | 
						            model = org_and_model[1] | 
					
					
						
						| 
							 | 
						            result_key = f"{org}_{model}_{precision.value.name}" | 
					
					
						
						| 
							 | 
						        full_model = "/".join(org_and_model) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        results = {} | 
					
					
						
						| 
							 | 
						        for task in Tasks: | 
					
					
						
						| 
							 | 
						            task = task.value | 
					
					
						
						| 
							 | 
						            if task.benchmark == "mmlu": | 
					
					
						
						| 
							 | 
						                accs = np.array([data["results"]["harness|mmlu|0"][task.metric]]) | 
					
					
						
						| 
							 | 
						            else: | 
					
					
						
						| 
							 | 
						                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]) | 
					
					
						
						| 
							 | 
						            if accs.size == 0 or any([acc is None for acc in accs]): | 
					
					
						
						| 
							 | 
						                continue | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						            mean_acc = np.mean(accs) * 100.0 | 
					
					
						
						| 
							 | 
						            mean_acc = round(mean_acc, 2) | 
					
					
						
						| 
							 | 
						            results[task.benchmark] = mean_acc | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        return self( | 
					
					
						
						| 
							 | 
						            eval_name=result_key, | 
					
					
						
						| 
							 | 
						            full_model=full_model, | 
					
					
						
						| 
							 | 
						            org=org, | 
					
					
						
						| 
							 | 
						            model=model, | 
					
					
						
						| 
							 | 
						            results=results, | 
					
					
						
						| 
							 | 
						            precision=precision, | 
					
					
						
						| 
							 | 
						            quant_type=quant_type, | 
					
					
						
						| 
							 | 
						            weight_dtype=weight_dtype, | 
					
					
						
						| 
							 | 
						            compute_dtype=compute_dtype, | 
					
					
						
						| 
							 | 
						            double_quant=double_quant, | 
					
					
						
						| 
							 | 
						            revision=config.get("model_sha", "main"), | 
					
					
						
						| 
							 | 
						            num_params=model_params, | 
					
					
						
						| 
							 | 
						            model_size=model_size, | 
					
					
						
						| 
							 | 
						            group_size=group_size, | 
					
					
						
						| 
							 | 
						            result_file=result_file | 
					
					
						
						| 
							 | 
						        ) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    def update_with_request_file(self, requests_path): | 
					
					
						
						| 
							 | 
						        """Finds the relevant request file for the current model and updates info with it""" | 
					
					
						
						| 
							 | 
						        request_file = get_request_file_for_model(requests_path, self.full_model, | 
					
					
						
						| 
							 | 
						                self.quant_type.value.name, self.precision.value.name, | 
					
					
						
						| 
							 | 
						                self.weight_dtype.value.name, self.compute_dtype.value.name) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        try: | 
					
					
						
						| 
							 | 
						            with open(request_file, "r") as f: | 
					
					
						
						| 
							 | 
						                request = json.load(f) | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            self.date = request.get("submitted_time", "") | 
					
					
						
						| 
							 | 
						            self.architecture = request.get("architectures", "Unknown") | 
					
					
						
						| 
							 | 
						            self.status = request.get("status", "Failed") | 
					
					
						
						| 
							 | 
						        except Exception as e: | 
					
					
						
						| 
							 | 
						            print(requests_path, self.full_model, | 
					
					
						
						| 
							 | 
						                self.quant_type.value.name, self.precision.value.name, | 
					
					
						
						| 
							 | 
						                self.weight_dtype.value.name, self.compute_dtype.value.name) | 
					
					
						
						| 
							 | 
						            self.status = "Failed" | 
					
					
						
						| 
							 | 
						            print(f"Could not find request file for {self.org}/{self.model}") | 
					
					
						
						| 
							 | 
						            print(traceback.format_exc()) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    def update_with_dynamic_file_dict(self, file_dict): | 
					
					
						
						| 
							 | 
						        self.license = file_dict.get("license", "?") | 
					
					
						
						| 
							 | 
						        self.likes = file_dict.get("likes", 0) | 
					
					
						
						| 
							 | 
						        self.still_on_hub = file_dict["still_on_hub"] | 
					
					
						
						| 
							 | 
						        self.tags = file_dict.get("tags", []) | 
					
					
						
						| 
							 | 
						        self.flagged = any("flagged" in tag for tag in self.tags) | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    def to_dict(self): | 
					
					
						
						| 
							 | 
						        """Converts the Eval Result to a dict compatible with our dataframe display""" | 
					
					
						
						| 
							 | 
						        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        data_dict = { | 
					
					
						
						| 
							 | 
						            "eval_name": self.eval_name,   | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.precision.name: self.precision.value.name, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.quant_type.name: self.quant_type.value.name, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.model_type_symbol.name: self.quant_type.value.symbol, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.weight_dtype.name: self.weight_dtype.value.name, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.compute_dtype.name: self.compute_dtype.value.name, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.double_quant.name: self.double_quant, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.model_type.name: self.model_type.value.name, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.weight_type.name: self.weight_type.value.name, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.architecture.name: self.architecture, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.result_file), | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.dummy.name: self.full_model, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.revision.name: self.revision, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.average.name: average, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.license.name: self.license, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.likes.name: self.likes, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.params.name: self.num_params, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.model_size.name: self.model_size, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.group_size.name: self.group_size, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.still_on_hub.name: self.still_on_hub, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False, | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(), | 
					
					
						
						| 
							 | 
						            AutoEvalColumn.flagged.name: self.flagged | 
					
					
						
						| 
							 | 
						        } | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        for task in Tasks: | 
					
					
						
						| 
							 | 
						            data_dict[task.value.col_name] = self.results[task.value.benchmark] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        return data_dict | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def get_request_file_for_model(requests_path, model_name, | 
					
					
						
						| 
							 | 
						        quant_type, precision, weight_dtype, compute_dtype): | 
					
					
						
						| 
							 | 
						    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED""" | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    request_files = os.path.join( | 
					
					
						
						| 
							 | 
						        requests_path, | 
					
					
						
						| 
							 | 
						        f"{model_name}_eval_request_*.json", | 
					
					
						
						| 
							 | 
						    ) | 
					
					
						
						| 
							 | 
						    request_files = glob.glob(request_files) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						     | 
					
					
						
						| 
							 | 
						    request_file = "" | 
					
					
						
						| 
							 | 
						    request_files = sorted(request_files, reverse=True) | 
					
					
						
						| 
							 | 
						    for tmp_request_file in request_files: | 
					
					
						
						| 
							 | 
						        with open(tmp_request_file, "r") as f: | 
					
					
						
						| 
							 | 
						            req_content = json.load(f) | 
					
					
						
						| 
							 | 
						            print(model_name, req_content["precision"], precision.split(".")[-1], str(req_content["quant_type"]), quant_type, req_content["weight_dtype"], weight_dtype.split(".")[-1],req_content["compute_dtype"], compute_dtype.split(".")[-1] ) | 
					
					
						
						| 
							 | 
						            if ( | 
					
					
						
						| 
							 | 
						                req_content["status"] in ["Finished"] | 
					
					
						
						| 
							 | 
						                and req_content["precision"] == precision.split(".")[-1] | 
					
					
						
						| 
							 | 
						                and str(req_content["quant_type"]) == quant_type | 
					
					
						
						| 
							 | 
						                and req_content["weight_dtype"] == weight_dtype.split(".")[-1] | 
					
					
						
						| 
							 | 
						                and req_content["compute_dtype"] == compute_dtype.split(".")[-1] | 
					
					
						
						| 
							 | 
						            ): | 
					
					
						
						| 
							 | 
						                request_file = tmp_request_file | 
					
					
						
						| 
							 | 
						            elif ( | 
					
					
						
						| 
							 | 
						                req_content["status"] in ["Finished"] | 
					
					
						
						| 
							 | 
						                and req_content["precision"] == precision.split(".")[-1] | 
					
					
						
						| 
							 | 
						                and quant_type == "AutoRound" | 
					
					
						
						| 
							 | 
						                and req_content["weight_dtype"] == weight_dtype.split(".")[-1] | 
					
					
						
						| 
							 | 
						                and req_content["compute_dtype"] == compute_dtype.split(".")[-1] | 
					
					
						
						| 
							 | 
						            ): | 
					
					
						
						| 
							 | 
						                request_file = tmp_request_file | 
					
					
						
						| 
							 | 
						    return request_file | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]: | 
					
					
						
						| 
							 | 
						    """From the path of the results folder root, extract all needed info for results""" | 
					
					
						
						| 
							 | 
						    model_result_filepaths = [] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    for root, _, files in os.walk(results_path): | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        if len(files) == 0 or any([not f.endswith(".json") for f in files]): | 
					
					
						
						| 
							 | 
						            continue | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        try: | 
					
					
						
						| 
							 | 
						            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7]) | 
					
					
						
						| 
							 | 
						        except dateutil.parser._parser.ParserError: | 
					
					
						
						| 
							 | 
						            files = [files[-1]] | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						        for file in files: | 
					
					
						
						| 
							 | 
						            model_result_filepaths.append(os.path.join(root, file)) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    with open(dynamic_path) as f: | 
					
					
						
						| 
							 | 
						        dynamic_data = json.load(f) | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    eval_results = {} | 
					
					
						
						| 
							 | 
						    for model_result_filepath in model_result_filepaths: | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        eval_result = EvalResult.init_from_json_file(model_result_filepath) | 
					
					
						
						| 
							 | 
						        eval_result.update_with_request_file(requests_path) | 
					
					
						
						| 
							 | 
						        if eval_result.full_model in dynamic_data: | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						             | 
					
					
						
						| 
							 | 
						            if "meta-llama" in eval_result.full_model:  | 
					
					
						
						| 
							 | 
						                eval_result.still_on_hub = True | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						         | 
					
					
						
						| 
							 | 
						        eval_name = eval_result.eval_name | 
					
					
						
						| 
							 | 
						        if eval_name in eval_results.keys(): | 
					
					
						
						| 
							 | 
						            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None}) | 
					
					
						
						| 
							 | 
						        else: | 
					
					
						
						| 
							 | 
						            eval_results[eval_name] = eval_result | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    results = [] | 
					
					
						
						| 
							 | 
						    for v in eval_results.values(): | 
					
					
						
						| 
							 | 
						        try: | 
					
					
						
						| 
							 | 
						            if v.status == "Finished": | 
					
					
						
						| 
							 | 
						                v.to_dict()  | 
					
					
						
						| 
							 | 
						                results.append(v) | 
					
					
						
						| 
							 | 
						        except KeyError:   | 
					
					
						
						| 
							 | 
						            continue | 
					
					
						
						| 
							 | 
						
 | 
					
					
						
						| 
							 | 
						    return results | 
					
					
						
						| 
							 | 
						
 |