Upload from GitHub Actions: updated translation functions

Changed files:
- evals/datasets_/mmlu.py (+13 −4)
- evals/datasets_/truthfulqa.py (+52 −16)
- evals/models.py (+5 −1)
evals/datasets_/mmlu.py (CHANGED)

```diff
@@ -111,6 +111,7 @@ def print_datasets_analysis():
     # MMLUX is translated using DeepL
     # Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
 
+
 # print_datasets_analysis()
 
 
@@ -195,7 +196,13 @@ async def load_mmlu_translated(language_bcp_47, nr):
     filtered = ds["test"].filter(lambda x: x["subject"] == category)
     if len(filtered) == 0:
         return None, None, None
-    task = filtered[nr % len(filtered)]
+
+    # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
+    if nr < 20:
+        task = filtered[nr]  # Direct mapping to same sample
+    else:
+        # Fallback to sequential if nr exceeds our sample count
+        task = filtered[nr % len(filtered)]
 
     # Translate question and choices
     question_translated = await translate_google(task["question"], "en", language_bcp_47)
@@ -226,7 +233,7 @@ def translate_mmlu(languages):
         for lang in languages["bcp_47"].values[:150]
         if lang not in human_translated and lang in get_google_supported_languages()
     ]
-    n_samples =
+    n_samples = 20
 
     slug = "fair-forward/mmlu-autotranslated"
     for lang in tqdm(untranslated):
@@ -242,8 +249,10 @@ def translate_mmlu(languages):
             if split == "dev":
                 samples.extend(ds.filter(lambda x: x["subject"] == category))
             else:
-
-
+                # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
+                filtered = ds.filter(lambda x: x["subject"] == category)
+                for i in range(min(n_samples, len(filtered))):
+                    task = filtered[i]
                     samples.append(task)
             questions_tr = [
                 translate_google(s["question"], "en", lang) for s in samples
```
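Taken together, the mmlu.py hunks make sample selection deterministic: the translation job uploads the first 20 test samples per subject, and the loader maps evaluation indices straight onto them. A minimal sketch of that selection rule follows; the `pick_sample` helper and `N_SAMPLES` constant are illustrative assumptions, not names from the diff, which inlines this logic instead.

```python
N_SAMPLES = 20  # mirrors n_samples in translate_mmlu

def pick_sample(filtered, nr):
    """Return the task at index nr, aligned with the stored translations."""
    if nr < N_SAMPLES:
        # Indices 0-19 map directly onto the 20 uploaded samples, so the
        # evaluation pipeline and the translation job see identical questions.
        return filtered[nr]
    # Outside that range, wrap around instead of raising an IndexError.
    return filtered[nr % len(filtered)]

# Example: with 50 candidate tasks, index 7 maps directly, index 57 wraps to 7.
tasks = list(range(50))
assert pick_sample(tasks, 7) == 7
assert pick_sample(tasks, 57) == 7
```

The modulo branch trades an IndexError for silent reuse of earlier samples, which matches the "fallback to sequential" comment in the diff.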
evals/datasets_/truthfulqa.py (CHANGED)

```diff
@@ -14,11 +14,21 @@ from models import translate_google, get_google_supported_languages
 from datasets_.util import _get_dataset_config_names, _load_dataset
 
 slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
+slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated"
+
 tags_uhura_truthfulqa = {
     standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
     if a.endswith("multiple_choice")
 }
 
+# Get available auto-translated languages
+try:
+    tags_truthfulqa_autotranslated = {
+        standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_truthfulqa_autotranslated)
+    }
+except Exception:
+    tags_truthfulqa_autotranslated = {}
+
 
 def add_choices(row):
     row["choices"] = row["mc1_targets"]["choices"]
@@ -34,6 +44,15 @@ async def load_truthfulqa(language_bcp_47, nr):
         ds = ds.map(add_choices)
         task = ds["test"][nr]
         return "masakhane/uhura-truthfulqa", task, "human"
+    elif language_bcp_47 in tags_truthfulqa_autotranslated.keys():
+        # Load from auto-translated dataset (same samples as translation)
+        ds = _load_dataset(slug_truthfulqa_autotranslated, language_bcp_47)
+        test_split = ds["test"] if "test" in ds else ds
+        if nr < len(test_split):
+            task = test_split[nr]
+            return slug_truthfulqa_autotranslated, task, "machine"
+        # If requested index exceeds stored sample count, fall back to on-the-fly
+        return await load_truthfulqa_translated(language_bcp_47, nr)
     else:
         # Fallback to on-the-fly translation for missing languages/samples
         return await load_truthfulqa_translated(language_bcp_47, nr)
@@ -52,7 +71,13 @@ async def load_truthfulqa_translated(language_bcp_47, nr):
     # Load English TruthfulQA data
     ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"])
     ds = ds.map(add_choices)
-    task = ds["test"][nr % len(ds["test"])]
+
+    # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
+    if nr < 20:
+        task = ds["test"][nr]  # Direct mapping to same sample
+    else:
+        # Fallback to sequential if nr exceeds our sample count
+        task = ds["test"][nr % len(ds["test"])]
 
     # Translate question and choices
     question_translated = await translate_google(task["question"], "en", language_bcp_47)
@@ -84,6 +109,9 @@ def translate_truthfulqa(languages):
     ]
     n_samples = 20
 
+    # Set fixed seed for consistent sample selection across all languages
+    random.seed(42)
+
     slug = "fair-forward/truthfulqa-autotranslated"
     for lang in tqdm(untranslated):
         # check if already exists on hub
@@ -97,32 +125,40 @@ def translate_truthfulqa(languages):
             if split == "train":
                 samples.extend(ds)
             else:
-
+                # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
+                for i in range(min(n_samples, len(ds))):
                     task = ds[i]
                     samples.append(task)
+
+        # Translate questions
         questions_tr = [
             translate_google(s["question"], "en", lang) for s in samples
         ]
        questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
-
+
+        # Translate choices for each sample
+        all_choices_tr = []
+        all_labels = []
+
         for s in samples:
-
-
-
-
-
-
-
-
-            choices_tr
-
+            # Get choices from mc1_targets
+            choices = s["mc1_targets"]["choices"]
+            labels = s["mc1_targets"]["labels"]
+
+            # Translate choices
+            choices_tr = [
+                translate_google(choice, "en", lang) for choice in choices
+            ]
+            choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
+
+            all_choices_tr.append(choices_tr)
+            all_labels.append(labels)
 
         ds_lang = Dataset.from_dict(
             {
-                "subject": [s["subject"] for s in samples],
                 "question": questions_tr,
-                "choices":
-                "
+                "choices": all_choices_tr,
+                "labels": all_labels,
             }
         )
        ds_lang.push_to_hub(
```
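With the new `elif` branch, `load_truthfulqa` resolves a language against three sources in priority order: human-translated Uhura data, the auto-translated dataset on the hub, and on-the-fly translation as a last resort. A simplified synchronous sketch of that resolution is below; the `resolve_source` helper and `n_stored` parameter are illustrative names, and the real function is async and also returns the selected task.

```python
def resolve_source(language_bcp_47, nr, human_tags, machine_tags, n_stored=20):
    """Pick the dataset source for one (language, sample index) request."""
    if language_bcp_47 in human_tags:
        # Tier 1: human translations from masakhane/uhura-truthfulqa
        return "masakhane/uhura-truthfulqa", "human"
    if language_bcp_47 in machine_tags and nr < n_stored:
        # Tier 2: pre-built machine translations pushed to the hub
        return "fair-forward/truthfulqa-autotranslated", "machine"
    # Tier 3: translate the English sample on the fly
    return "on-the-fly", "machine"

# Example: a language with only auto-translated data, asking for sample 5
assert resolve_source("xx", 5, human_tags=set(), machine_tags={"xx"}) == (
    "fair-forward/truthfulqa-autotranslated",
    "machine",
)
```

Keeping the stored hub data ahead of on-the-fly translation means repeated evaluations hit the same fixed samples instead of re-translating, which is the same alignment idea as the mmlu.py change.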
evals/models.py (CHANGED)

```diff
@@ -8,7 +8,11 @@ from os import getenv
 import pandas as pd
 from aiolimiter import AsyncLimiter
 from dotenv import load_dotenv
-from elevenlabs import AsyncElevenLabs
+# Make ElevenLabs optional to avoid hard dependency when not using speech tasks
+try:
+    from elevenlabs import AsyncElevenLabs
+except Exception:  # ImportError or other env-specific issues
+    AsyncElevenLabs = None
 from google.cloud import translate_v2 as translate
 from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory
```
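Because `AsyncElevenLabs` may now be `None`, any speech-task code path needs a guard before constructing the client. A sketch of such a guard follows; this usage is an assumption, since the diff itself only makes the import optional.

```python
def make_speech_client(api_key):
    """Construct the ElevenLabs client, failing loudly if the package is absent."""
    if AsyncElevenLabs is None:
        # Illustrative guard, not part of the diff: the optional import above
        # sets AsyncElevenLabs to None when elevenlabs is not installed.
        raise RuntimeError(
            "elevenlabs is not installed; install it to run speech tasks"
        )
    return AsyncElevenLabs(api_key=api_key)
```

Catching broad `Exception` rather than just `ImportError` matches the comment in the diff about environment-specific failures, at the cost of also hiding genuine bugs inside the `elevenlabs` package at import time.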