David Pomerenke committed
Commit ce2acb0 · Parent: 9e3bc4f

Add Global MMLU benchmark
Files changed:
- README.md +1 -0
- evals/backend.py +1 -1
- evals/main.py +4 -3
- evals/models.py +1 -1
- evals/tasks.py +41 -1
- results.json +0 -0
README.md
@@ -11,6 +11,7 @@ datasets:
 - openlanguagedata/flores_plus
 - google/fleurs
 - mozilla-foundation/common_voice_1_0
+- CohereForAI/Global-MMLU
 models:
 - meta-llama/Llama-3.3-70B-Instruct
 - mistralai/Mistral-Small-24B-Instruct-2501
evals/backend.py
@@ -22,7 +22,7 @@ def mean(lst):
     return sum(lst) / len(lst) if lst else None
 
 
-task_metrics = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy"]
+task_metrics = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy", "mmlu_accuracy"]
 
 
 def make_model_table(df, models):
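The new "mmlu_accuracy" entry extends the list of per-task metric columns that the backend averages into its model table. A minimal sketch of how such a metric list could drive per-model aggregation; the helper name make_model_table_sketch and the pivoting logic are illustrative assumptions, not the repo's actual make_model_table:

import pandas as pd

task_metrics = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy", "mmlu_accuracy"]

def make_model_table_sketch(df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical: df holds one row per (model, task, metric, score) result record.
    df = df.assign(task_metric=df["task"] + "_" + df["metric"])
    table = (
        df.pivot_table(index="model", columns="task_metric", values="score", aggfunc="mean")
        .reindex(columns=task_metrics)
    )
    # Average across whichever of the task metrics a model has results for.
    table["average"] = table[task_metrics].mean(axis=1)
    return table.sort_values("average", ascending=False)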
evals/main.py
@@ -12,6 +12,8 @@ from tasks import tasks
 # ===== config =====
 
 n_sentences = 10
+n_languages = 3
+n_models = 3
 
 # ===== run evaluation and aggregate results =====
 
@@ -22,9 +24,8 @@ async def evaluate():
         task(model, lang.bcp_47, i)
         for task in tasks
         for i in range(n_sentences)
-        for lang in languages.iloc[:
-        for model in models["id"]
-        if lang.in_benchmark
+        for lang in languages.iloc[:n_languages].itertuples()
+        for model in models["id"].iloc[:n_models]
     ]
     return await tqdm_asyncio.gather(*results, miniters=1)
 
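The comprehension in evaluate() builds the full Cartesian product of tasks × sentences × languages × models, now capped by the new n_languages and n_models knobs. A self-contained sketch with dummy stand-ins for the real tasks, languages, and models objects, showing how many coroutines the capped product yields:

import asyncio
import pandas as pd

n_sentences = 10
n_languages = 3
n_models = 3

# Illustrative stand-ins; the real DataFrames come from languages.py and models.py.
languages = pd.DataFrame({"bcp_47": ["en", "de", "sw", "am", "yo"]})
models = pd.DataFrame({"id": ["model-a", "model-b", "model-c", "model-d"]})

async def dummy_task(model, bcp_47, i):
    return [{"model": model, "bcp_47": bcp_47, "sentence_nr": i}]

async def evaluate():
    coros = [
        task(model, lang.bcp_47, i)
        for task in [dummy_task]
        for i in range(n_sentences)
        for lang in languages.iloc[:n_languages].itertuples()
        for model in models["id"].iloc[:n_models]
    ]
    # 1 task * 10 sentences * 3 languages * 3 models = 90 coroutines.
    return await asyncio.gather(*coros)

results = asyncio.run(evaluate())
print(len(results))  # 90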
evals/models.py
@@ -116,7 +116,7 @@ async def transcribe(path, model="elevenlabs/scribe_v1"):
     raise ValueError(f"Model {model} not supported")
 
 
-models = pd.DataFrame(models, columns=["id"])
+models = pd.DataFrame(models, columns=["id"])
 
 
 @cache
evals/tasks.py
@@ -8,6 +8,7 @@ from datasets_.flores import flores_sentences
 from joblib.memory import Memory
 from languages import languages, script_name
 from models import complete, transcribe
+from datasets import load_dataset
 
 cache = Memory(location=".cache", verbose=0).cache
 bleu = evaluate.load("bleu")
@@ -185,6 +186,45 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 
+@cache
+def _load_dataset(dataset, subset):
+    return load_dataset(dataset, subset)
+
+@cache
+async def mmlu_and_evaluate(model, language_bcp_47, nr):
+    data = _load_dataset("CohereForAI/Global-MMLU", language_bcp_47)
+    item = data["test"][nr]
+    def format_item(item):
+        return f"""{item['question']}
+
+A: {item['option_a']}
+B: {item['option_b']}
+C: {item['option_c']}
+D: {item['option_d']}
+
+A|B|C|D?"""
+    messages = []
+    for example in data["dev"].select(range(5)):
+        messages += [{"role": "user", "content": format_item(example)}, {"role": "assistant", "content": example["answer"]}]
+    messages += [{"role": "user", "content": format_item(item)}]
+    reply = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1,
+    )
+    print(reply.choices[0].message.content.strip())
+    acc = int(reply.choices[0].message.content.strip() == item["answer"])
+    return [
+        {
+            "model": model,
+            "bcp_47": language_bcp_47,
+            "task": "mmlu",
+            "metric": "accuracy",
+            "score": acc,
+            "sentence_nr": nr,
+        }
+    ]
 
 @cache
 async def transcribe_and_evaluate(model, language_bcp_47, nr):
@@ -217,11 +257,11 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 
-
 tasks = [
     partial(translate_and_evaluate, mode="from"),
     partial(translate_and_evaluate, mode="to"),
     classify_and_evaluate,
     # mlm_and_evaluate,
+    mmlu_and_evaluate,
     # transcribe_and_evaluate,
 ]
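The new mmlu_and_evaluate builds a 5-shot prompt from Global-MMLU's dev split and scores one test item by exact match on the answer letter; with temperature=0 and max_tokens=1 the model is expected to reply with a single letter. A rough standalone sketch of the same prompt construction, runnable without the repo's complete() helper (the "en" subset is just an example):

from datasets import load_dataset

def format_item(item):
    # Mirrors the A/B/C/D prompt layout used in the diff above.
    return (
        f"{item['question']}\n\n"
        f"A: {item['option_a']}\n"
        f"B: {item['option_b']}\n"
        f"C: {item['option_c']}\n"
        f"D: {item['option_d']}\n\n"
        "A|B|C|D?"
    )

data = load_dataset("CohereForAI/Global-MMLU", "en")
messages = []
for example in data["dev"].select(range(5)):  # five in-context examples
    messages.append({"role": "user", "content": format_item(example)})
    messages.append({"role": "assistant", "content": example["answer"]})
messages.append({"role": "user", "content": format_item(data["test"][0])})

# `messages` can now go to any chat-completion client; the single-letter reply
# is compared against data["test"][0]["answer"] to produce a 0/1 accuracy score.
print(messages[-1]["content"])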
results.json
The diff for this file is too large to render; see the raw diff.