Spaces:
Running
Running
File size: 17,994 Bytes
9ae8d89 0a14325 930ed8c 20dad4a 3df6003 9ae8d89 0a14325 9ae8d89 0da5ee3 9ae8d89 09b313f 0da5ee3 553b217 20dad4a fb84311 d83f3a1 09b313f 9ae8d89 d8147b8 33739cc 9ae8d89 d8147b8 d83f3a1 c92b14d 09b313f 0da5ee3 0a14325 ba515db 553b217 fb84311 d83f3a1 fb84311 d8147b8 9ae8d89 d8147b8 9ae8d89 d8147b8 9ae8d89 d8147b8 9ae8d89 09b313f 9ae8d89 0a14325 9ae8d89 b3eff40 d86ca68 0a14325 553b217 fb84311 9ae8d89 09b313f 9ae8d89 b3eff40 9ae8d89 b3eff40 d8147b8 b3eff40 9ae8d89 b3eff40 09b313f b3eff40 e1cdc4b b3eff40 d8147b8 b3eff40 9ae8d89 09b313f 9ae8d89 d8147b8 09b313f 9ae8d89 671e1a6 9ae8d89 09b313f 9ae8d89 671e1a6 9ae8d89 09b313f 9ae8d89 09b313f 9ae8d89 0a14325 553b217 fb84311 0da5ee3 d83f3a1 4b6eb81 d83f3a1 ffab9c0 572763d e4bb82d 09b313f 9ae8d89 0a14325 09b313f 0da5ee3 0a14325 553b217 fb84311 09b313f d83f3a1 09b313f 6616540 09b313f 3df6003 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 |
from dataclasses import dataclass, make_dataclass
from enum import Enum
import pandas as pd
# Task and column definitions live in src.about; extend them there when adding benchmarks.
from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns, ClosedEndedMultilingualColumns, OpenEndedArabicColumn, OpenEndedFrenchColumn, OpenEndedSpanishColumn, OpenEndedPortugueseColumn, OpenEndedRomanianColumn, OpenEndedGreekColumn
from src.envs import PRIVATE_REPO
import json
import gradio as gr
def fields(raw_class):
    """Collect the non-dunder class attributes of *raw_class*, in definition order.

    Unlike ``dataclasses.fields``, this reads ``__dict__`` directly, so it also
    works on plain classes whose attributes are ColumnContent instances
    (e.g. EvalQueueColumn) as well as on the make_dataclass-generated classes.
    """
    return [
        value
        for attr_name, value in raw_class.__dict__.items()
        if not attr_name.startswith("__") and not attr_name.endswith("__")
    ]
# These classes hold the user-facing column names, so that a rename
# only needs to happen here rather than throughout the code base.
@dataclass
class ColumnContent:
    """Metadata for one user-facing leaderboard column.

    Holds the display name, the dataframe dtype, and boolean flags that
    control default visibility plus which leaderboard tab(s) the column
    belongs to (the ``*_col`` flags are matched by the *_COLS selections
    at the bottom of this module).
    """
    name: str  # user-facing column header
    type: str  # dataframe dtype: "str", "number", "markdown", or "bool"
    displayed_by_default: bool  # shown without the user explicitly selecting it
    hidden: bool = False  # excluded from every column selection below
    invariant: bool = True  # included in every tab's column selection
    never_hidden: bool = False  # cannot be deselected (e.g. the Model column)
    dataset_task_col: bool = False  # selected into DATASET_COLS
    open_ended_col: bool = False  # selected into OPEN_ENDED_COLS
    med_safety_col: bool = False  # selected into MED_SAFETY_COLS
    medical_summarization_col: bool = False  # selected into MEDICAL_SUMMARIZATION_COLS
    aci_col: bool = False  # selected into ACI_COLS
    soap_col: bool = False  # selected into SOAP_COLS
    closed_ended_arabic_col: bool = False  # only used by the commented-out CLOSED_ENDED_ARABIC_COLS
    healthbench_col: bool = False  # selected into HEALTHBENCH_COLS
    healthbench_hard_col: bool = False  # selected into HEALTHBENCH_HARD_COLS
    open_ended_arabic_col: bool = False  # selected into OpenEndedArabic_COLS
    open_ended_french_col: bool = False  # selected into OpenEndedFrench_COLS
    open_ended_spanish_col: bool = False  # selected into OpenEndedSpanish_COLS
    open_ended_portuguese_col: bool = False  # selected into OpenEndedPortuguese_COLS
    open_ended_romanian_col: bool = False  # selected into OpenEndedRomanian_COLS
    open_ended_greek_col: bool = False  # selected into OpenEndedGreek_COLS
    closed_ended_multilingual_col: bool = False  # selected into ClosedEndedMultilingual_COLS
## Leaderboard columns
# Each entry is a [field_name, field_type, default] triple consumed by
# make_dataclass at the bottom; the default ColumnContent instance carries
# the column's display name, dtype, visibility and tab flags.
auto_eval_column_dict = []
# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
# Aggregate score columns: "Average" feeds the closed-ended and multilingual
# tabs, "Overall Score" the summarization / note-generation tabs.
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, closed_ended_multilingual_col=True, invariant=False)])
auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
# One "number" column per benchmark metric, tagged with its tab's flag.
for task in HarnessTasks:
    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
for column in OpenEndedColumns:
    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_col=True, invariant=False)])
# Med-safety: only "95% CI" and "Harmfulness Score" are displayed by default.
for column in MedSafetyColumns:
    if column.value.col_name == "95% CI" or column.value.col_name == "Harmfulness Score":
        auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, med_safety_col=True, invariant=False)])
    else:
        auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, med_safety_col=True, invariant=False)])
for column in MedicalSummarizationColumns:
    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, medical_summarization_col=True, invariant=False)])
for column in ACIColumns:
    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, aci_col=True, invariant=False)])
for column in SOAPColumns:
    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, soap_col=True, invariant=False)])
# Healthbench: per-axis breakdown columns ("Axis ...") are not shown by default.
for column in HealthbenchColumns:
    if column.value.col_name.startswith("Axis"):
        auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, healthbench_col=True, invariant=False)])
    else:
        auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, healthbench_col=True, invariant=False)])
for column in HealthbenchHardColumns:
    if column.value.col_name.startswith("Axis"):
        auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, healthbench_hard_col=True, invariant=False)])
    else:
        auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, healthbench_hard_col=True, invariant=False)])
# Per-language open-ended and multilingual closed-ended score columns.
for column in OpenEndedArabicColumn:
    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_arabic_col=True, invariant=False)])
for column in OpenEndedFrenchColumn:
    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_french_col=True, invariant=False)])
for column in OpenEndedSpanishColumn:
    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_spanish_col=True, invariant=False)])
for column in OpenEndedPortugueseColumn:
    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_portuguese_col=True, invariant=False)])
for column in OpenEndedRomanianColumn:
    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_romanian_col=True, invariant=False)])
for column in OpenEndedGreekColumn:
    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_greek_col=True, invariant=False)])
for column in ClosedEndedMultilingualColumns:
    auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, closed_ended_multilingual_col=True, invariant=False)])
# Model/run metadata columns: invariant (present in every tab) but not
# displayed by default; still_on_hub is additionally hidden.
auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False)])
# auto_eval_column_dict.append(["backbone", ColumnContent, ColumnContent("Base Model", "str", False)])
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub β€οΈ", "number", False)])
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, True)])
# auto_eval_column_dict.append(["display_result", ColumnContent, ColumnContent("Display Result", "bool", False, True)])
auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("Submission Date", "str", False)])
# We use make_dataclass to dynamically build AutoEvalColumn from the entries
# above; the frozen class is only ever read through fields(), never instantiated.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
## For the queue columns in the submission tab
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """Columns of the pending/running/finished tables in the submission tab.

    Attributes are plain ColumnContent class attributes (no annotations), so
    they are read via the module-level fields() helper (see EVAL_COLS /
    EVAL_TYPES below), not as dataclass fields.
    """
    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    model_type = ColumnContent("model_type", "str", True)
    precision = ColumnContent("precision", "str", True)
    # FIX: displayed_by_default expects a bool; previously the string
    # "Original" was passed here (truthy, so visible behavior is unchanged).
    weight_type = ColumnContent("weight_type", "str", True)
    closed_ended_status = ColumnContent("closed_ended_status", "str", True)
    open_ended_status = ColumnContent("open_ended_status", "str", True)
    med_safety_status = ColumnContent("med_safety_status", "str", True)
    medical_summarization_status = ColumnContent("medical_summarization_status", "str", True)
    note_generation_status = ColumnContent("note_generation_status", "str", True)
## All the model information that we might need
@dataclass
class ModelDetails:
    # Lightweight payload object for the Enum members below
    # (ModelType, ModelArch, WeightType, Precision).
    name: str  # machine-readable name, e.g. "pretrained"
    display_name: str = ""  # optional prettier name (not read in this file)
    symbol: str = ""  # emoji
class ModelType(Enum):
    """Training-stage category of a submitted model, with a display symbol."""

    PT = ModelDetails(name="pretrained", symbol="π’")
    IFT = ModelDetails(name="instruction-tuned", symbol="β")
    RL = ModelDetails(name="preference-tuned", symbol="π¦")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        """Return "<symbol><separator><name>" for display."""
        details = self.value
        return f"{details.symbol}{separator}{details.name}"

    @staticmethod
    def from_str(type):
        """Parse a display string (matched by name or symbol) back to a member.

        Falls back to ModelType.Unknown when nothing matches.
        """
        # Checked in the same order as the original if-chain.
        needles_to_member = (
            (("pretrained", "π’"), ModelType.PT),
            (("preference-tuned", "π¦"), ModelType.RL),
            (("instruction-tuned", "β"), ModelType.IFT),
        )
        for needles, member in needles_to_member:
            if any(needle in type for needle in needles):
                return member
        return ModelType.Unknown
class ModelArch(Enum):
    """Architecture family of a submitted model."""

    Encoder = ModelDetails("Encoder")
    Decoder = ModelDetails("Decoder")
    GLiNEREncoder = ModelDetails("GLiNER Encoder")
    Unknown = ModelDetails(name="Other", symbol="?")

    def to_str(self, separator=" "):
        """Return the architecture's display name (separator is accepted but unused)."""
        return f"{self.value.name}"

    @staticmethod
    def from_str(type):
        """Map an exact display-name string back to a member; Unknown otherwise."""
        for label, member in (
            ("Encoder", ModelArch.Encoder),
            ("Decoder", ModelArch.Decoder),
            ("GLiNER Encoder", ModelArch.GLiNEREncoder),
        ):
            if label == type:
                return member
        return ModelArch.Unknown
class WeightType(Enum):
    """How a submission's weights relate to a base model (adapter/delta/original)."""

    Adapter = ModelDetails("Adapter")
    Original = ModelDetails("Original")
    Delta = ModelDetails("Delta")
    Unknown = ModelDetails("?")

    # FIX: marked @staticmethod for consistency with ModelType.from_str;
    # call sites of the form WeightType.from_str(s) are unaffected.
    @staticmethod
    def from_str(wt):
        """Case-insensitive substring parse of a weight-type string.

        Returns WeightType.Unknown when no known keyword is present.
        """
        normalized = wt.lower()  # lowercase once instead of per comparison
        if "original" in normalized:
            return WeightType.Original
        if "adapter" in normalized:
            return WeightType.Adapter
        if "delta" in normalized:
            return WeightType.Delta
        return WeightType.Unknown
class Precision(Enum):
    """Numeric precision a model is evaluated in.

    Previously commented-out quantized options (8bit/4bit/GPTQ) have been removed.
    """

    auto = ModelDetails("auto")
    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    float32 = ModelDetails("float32")
    Unknown = ModelDetails("?")

    # FIX: marked @staticmethod for consistency with ModelType.from_str;
    # call sites of the form Precision.from_str(s) are unaffected.
    @staticmethod
    def from_str(precision):
        """Parse a precision string, tolerating a "torch." prefix.

        Returns Precision.Unknown for unrecognized values.
        """
        if precision in ["auto"]:
            return Precision.auto
        if precision in ["torch.float16", "float16"]:
            return Precision.float16
        if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
        # "torch.float32" added for symmetry with the float16/bfloat16 cases.
        if precision in ["torch.float32", "float32"]:
            return Precision.float32
        return Precision.Unknown
class PromptTemplateName(Enum):
    """Identifiers of the available prompt templates; values are template base names."""
    UniversalNERTemplate = "universal_ner"
    LLMHTMLHighlightedSpansTemplate = "llm_html_highlighted_spans"
    LLMHTMLHighlightedSpansTemplateV1 = "llm_html_highlighted_spans_v1"
    LLamaNERTemplate = "llama_70B_ner"
    # MixtralNERTemplate = "mixtral_ner_v0.3"
class EvaluationMetrics(Enum):
    """Granularity at which extraction results are scored."""
    SpanBased = "Span Based"
    TokenBased = "Token Based"
# Column selection: per-tab lists of visible column names/dtypes, derived
# from the flags on each AutoEvalColumn field.
def _tab_cols(flag_name):
    # Names of every non-hidden column whose given tab flag is set,
    # plus the invariant (always-present) columns.
    return [
        c.name
        for c in fields(AutoEvalColumn)
        if not c.hidden and (getattr(c, flag_name) or c.invariant)
    ]

DATASET_COLS = _tab_cols("dataset_task_col")
OPEN_ENDED_COLS = _tab_cols("open_ended_col")
MED_SAFETY_COLS = _tab_cols("med_safety_col")
MEDICAL_SUMMARIZATION_COLS = _tab_cols("medical_summarization_col")
ACI_COLS = _tab_cols("aci_col")
SOAP_COLS = _tab_cols("soap_col")
HEALTHBENCH_COLS = _tab_cols("healthbench_col")
HEALTHBENCH_HARD_COLS = _tab_cols("healthbench_hard_col")
OpenEndedArabic_COLS = _tab_cols("open_ended_arabic_col")
OpenEndedFrench_COLS = _tab_cols("open_ended_french_col")
OpenEndedSpanish_COLS = _tab_cols("open_ended_spanish_col")
OpenEndedPortuguese_COLS = _tab_cols("open_ended_portuguese_col")
OpenEndedRomanian_COLS = _tab_cols("open_ended_romanian_col")
OpenEndedGreek_COLS = _tab_cols("open_ended_greek_col")
ClosedEndedMultilingual_COLS = _tab_cols("closed_ended_multilingual_col")

# Dtypes for all non-hidden columns, and the default ("lite") selection.
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
# Submission-queue table columns.
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
# Per-tab benchmark column headers, taken from the column enums in src.about.
def _benchmark_cols(column_enum):
    # Display names of every metric column in the given enum, in enum order.
    return [member.value.col_name for member in column_enum]

DATASET_BENCHMARK_COLS = _benchmark_cols(HarnessTasks)
OPEN_ENDED_BENCHMARK_COLS = _benchmark_cols(OpenEndedColumns)
MED_SAFETY_BENCHMARK_COLS = _benchmark_cols(MedSafetyColumns)
MEDICAL_SUMMARIZATION_BENCHMARK_COLS = _benchmark_cols(MedicalSummarizationColumns)
ACI_BENCHMARK_COLS = _benchmark_cols(ACIColumns)
SOAP_BENCHMARK_COLS = _benchmark_cols(SOAPColumns)
HEALTHBENCH_BENCHMARK_COLS = _benchmark_cols(HealthbenchColumns)
HEALTHBENCH_HARD_BENCHMARK_COLS = _benchmark_cols(HealthbenchHardColumns)
# Per-language open-ended and multilingual closed-ended benchmark headers.
OpenEndedArabic_BENCHMARK_COLS = _benchmark_cols(OpenEndedArabicColumn)
OpenEndedFrench_BENCHMARK_COLS = _benchmark_cols(OpenEndedFrenchColumn)
OpenEndedPortuguese_BENCHMARK_COLS = _benchmark_cols(OpenEndedPortugueseColumn)
OpenEndedSpanish_BENCHMARK_COLS = _benchmark_cols(OpenEndedSpanishColumn)
OpenEndedRomanian_BENCHMARK_COLS = _benchmark_cols(OpenEndedRomanianColumn)
OpenEndedGreek_BENCHMARK_COLS = _benchmark_cols(OpenEndedGreekColumn)
ClosedEndedMultilingual_BENCHMARK_COLS = _benchmark_cols(ClosedEndedMultilingualColumns)
# Model-size buckets for the size filter: display label -> right-closed
# interval (likely matched against the "#Params (B)" column; "?" catches
# models with an unknown/unreported parameter count).
_SIZE_BUCKETS = (
    ("?", -100, 0),
    ("~1.5", 0, 2),
    ("~3", 2, 4),
    ("~7", 4, 9),
    ("~13", 9, 20),
    ("~35", 20, 45),
    ("~60", 45, 70),
    ("70+", 70, 10000),
)
NUMERIC_INTERVALS = {
    label: pd.Interval(lower, upper, closed="right")
    for label, lower, upper in _SIZE_BUCKETS
}
def render_generation_templates(task: str, generation_type: str):
    """Build read-only gradio textboxes showing the system and user prompts
    for one (task, generation_type) pair.

    The system prompt is looked up under the key "<task>+_+<generation_type>"
    in src/display/templates/system_prompts.json; the user prompt is the
    matching .jinja template, read verbatim.

    Returns:
        (system_prompt_textbox, user_prompt_textbox) gr.Textbox pair.

    Raises:
        FileNotFoundError: if either template file is missing.
        KeyError: if the pair has no entry in system_prompts.json.
    """
    key = f"{task}+_+{generation_type}"
    # FIX: encoding pinned to UTF-8 so prompts decode identically on every
    # platform instead of depending on the locale's default encoding.
    with open("src/display/templates/system_prompts.json", "r", encoding="utf-8") as f:
        system_prompt = json.load(f)[key]
    with open(f"src/display/templates/{key}.jinja", "r", encoding="utf-8") as f:
        user_prompt = f.read()
    system_prompt_textbox = gr.Textbox(
        value=system_prompt,
        label="System Prompt",
        lines=2,
        elem_id=f"system-prompt-textbox-{task}-{generation_type}",
        show_copy_button=True,
    )
    user_prompt_textbox = gr.Textbox(
        value=user_prompt,
        label="User Prompt",
        lines=15,
        elem_id=f"user-prompt-textbox-{task}-{generation_type}",
        show_copy_button=True,
    )
    return system_prompt_textbox, user_prompt_textbox