davidpomerenke committed on
Commit
8f5ce26
·
verified ·
1 Parent(s): b8cbeff

Upload from GitHub Actions: updated translation functions

Browse files
evals/datasets_/mmlu.py CHANGED
@@ -111,6 +111,7 @@ def print_datasets_analysis():
111
  # MMLUX is translated using DeepL
112
  # Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
113
 
 
114
  # print_datasets_analysis()
115
 
116
 
@@ -195,7 +196,13 @@ async def load_mmlu_translated(language_bcp_47, nr):
195
  filtered = ds["test"].filter(lambda x: x["subject"] == category)
196
  if len(filtered) == 0:
197
  return None, None, None
198
- task = filtered[nr % len(filtered)]
 
 
 
 
 
 
199
 
200
  # Translate question and choices
201
  question_translated = await translate_google(task["question"], "en", language_bcp_47)
@@ -226,7 +233,7 @@ def translate_mmlu(languages):
226
  for lang in languages["bcp_47"].values[:150]
227
  if lang not in human_translated and lang in get_google_supported_languages()
228
  ]
229
- n_samples = 10
230
 
231
  slug = "fair-forward/mmlu-autotranslated"
232
  for lang in tqdm(untranslated):
@@ -242,8 +249,10 @@ def translate_mmlu(languages):
242
  if split == "dev":
243
  samples.extend(ds.filter(lambda x: x["subject"] == category))
244
  else:
245
- for i in range(n_samples):
246
- task = ds.filter(lambda x: x["subject"] == category)[i]
 
 
247
  samples.append(task)
248
  questions_tr = [
249
  translate_google(s["question"], "en", lang) for s in samples
 
111
  # MMLUX is translated using DeepL
112
  # Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
113
 
114
+
115
  # print_datasets_analysis()
116
 
117
 
 
196
  filtered = ds["test"].filter(lambda x: x["subject"] == category)
197
  if len(filtered) == 0:
198
  return None, None, None
199
+
200
+ # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
201
+ if nr < 20:
202
+ task = filtered[nr] # Direct mapping to same sample
203
+ else:
204
+ # Fallback to sequential if nr exceeds our sample count
205
+ task = filtered[nr % len(filtered)]
206
 
207
  # Translate question and choices
208
  question_translated = await translate_google(task["question"], "en", language_bcp_47)
 
233
  for lang in languages["bcp_47"].values[:150]
234
  if lang not in human_translated and lang in get_google_supported_languages()
235
  ]
236
+ n_samples = 20
237
 
238
  slug = "fair-forward/mmlu-autotranslated"
239
  for lang in tqdm(untranslated):
 
249
  if split == "dev":
250
  samples.extend(ds.filter(lambda x: x["subject"] == category))
251
  else:
252
+ # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
253
+ filtered = ds.filter(lambda x: x["subject"] == category)
254
+ for i in range(min(n_samples, len(filtered))):
255
+ task = filtered[i]
256
  samples.append(task)
257
  questions_tr = [
258
  translate_google(s["question"], "en", lang) for s in samples
evals/datasets_/truthfulqa.py CHANGED
@@ -14,11 +14,21 @@ from models import translate_google, get_google_supported_languages
14
  from datasets_.util import _get_dataset_config_names, _load_dataset
15
 
16
  slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
 
 
17
  tags_uhura_truthfulqa = {
18
  standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
19
  if a.endswith("multiple_choice")
20
  }
21
 
 
 
 
 
 
 
 
 
22
 
23
  def add_choices(row):
24
  row["choices"] = row["mc1_targets"]["choices"]
@@ -34,6 +44,15 @@ async def load_truthfulqa(language_bcp_47, nr):
34
  ds = ds.map(add_choices)
35
  task = ds["test"][nr]
36
  return "masakhane/uhura-truthfulqa", task, "human"
 
 
 
 
 
 
 
 
 
37
  else:
38
  # Fallback to on-the-fly translation for missing languages/samples
39
  return await load_truthfulqa_translated(language_bcp_47, nr)
@@ -52,7 +71,13 @@ async def load_truthfulqa_translated(language_bcp_47, nr):
52
  # Load English TruthfulQA data
53
  ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"])
54
  ds = ds.map(add_choices)
55
- task = ds["test"][nr]
 
 
 
 
 
 
56
 
57
  # Translate question and choices
58
  question_translated = await translate_google(task["question"], "en", language_bcp_47)
@@ -84,6 +109,9 @@ def translate_truthfulqa(languages):
84
  ]
85
  n_samples = 20
86
 
 
 
 
87
  slug = "fair-forward/truthfulqa-autotranslated"
88
  for lang in tqdm(untranslated):
89
  # check if already exists on hub
@@ -97,32 +125,40 @@ def translate_truthfulqa(languages):
97
  if split == "train":
98
  samples.extend(ds)
99
  else:
100
- for i in range(n_samples):
 
101
  task = ds[i]
102
  samples.append(task)
 
 
103
  questions_tr = [
104
  translate_google(s["question"], "en", lang) for s in samples
105
  ]
106
  questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
107
- choices_texts_concatenated = []
 
 
 
 
108
  for s in samples:
109
- for choice in eval(s["choices"]):
110
- choices_texts_concatenated.append(choice)
111
- choices_tr = [
112
- translate_google(c, "en", lang) for c in choices_texts_concatenated
113
- ]
114
- choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
115
- # group into chunks of 4
116
- choices_tr = [
117
- choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
118
- ]
 
 
119
 
120
  ds_lang = Dataset.from_dict(
121
  {
122
- "subject": [s["subject"] for s in samples],
123
  "question": questions_tr,
124
- "choices": choices_tr,
125
- "answer": [s["answer"] for s in samples],
126
  }
127
  )
128
  ds_lang.push_to_hub(
 
14
  from datasets_.util import _get_dataset_config_names, _load_dataset
15
 
16
  slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
17
+ slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated"
18
+
19
  tags_uhura_truthfulqa = {
20
  standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
21
  if a.endswith("multiple_choice")
22
  }
23
 
24
+ # Get available auto-translated languages
25
+ try:
26
+ tags_truthfulqa_autotranslated = {
27
+ standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_truthfulqa_autotranslated)
28
+ }
29
+ except Exception:
30
+ tags_truthfulqa_autotranslated = {}
31
+
32
 
33
  def add_choices(row):
34
  row["choices"] = row["mc1_targets"]["choices"]
 
44
  ds = ds.map(add_choices)
45
  task = ds["test"][nr]
46
  return "masakhane/uhura-truthfulqa", task, "human"
47
+ elif language_bcp_47 in tags_truthfulqa_autotranslated.keys():
48
+ # Load from auto-translated dataset (same samples as translation)
49
+ ds = _load_dataset(slug_truthfulqa_autotranslated, language_bcp_47)
50
+ test_split = ds["test"] if "test" in ds else ds
51
+ if nr < len(test_split):
52
+ task = test_split[nr]
53
+ return slug_truthfulqa_autotranslated, task, "machine"
54
+ # If requested index exceeds stored sample count, fall back to on-the-fly
55
+ return await load_truthfulqa_translated(language_bcp_47, nr)
56
  else:
57
  # Fallback to on-the-fly translation for missing languages/samples
58
  return await load_truthfulqa_translated(language_bcp_47, nr)
 
71
  # Load English TruthfulQA data
72
  ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"])
73
  ds = ds.map(add_choices)
74
+
75
+ # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
76
+ if nr < 20:
77
+ task = ds["test"][nr] # Direct mapping to same sample
78
+ else:
79
+ # Fallback to sequential if nr exceeds our sample count
80
+ task = ds["test"][nr % len(ds["test"])]
81
 
82
  # Translate question and choices
83
  question_translated = await translate_google(task["question"], "en", language_bcp_47)
 
109
  ]
110
  n_samples = 20
111
 
112
+ # Set fixed seed for consistent sample selection across all languages
113
+ random.seed(42)
114
+
115
  slug = "fair-forward/truthfulqa-autotranslated"
116
  for lang in tqdm(untranslated):
117
  # check if already exists on hub
 
125
  if split == "train":
126
  samples.extend(ds)
127
  else:
128
+ # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
129
+ for i in range(min(n_samples, len(ds))):
130
  task = ds[i]
131
  samples.append(task)
132
+
133
+ # Translate questions
134
  questions_tr = [
135
  translate_google(s["question"], "en", lang) for s in samples
136
  ]
137
  questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
138
+
139
+ # Translate choices for each sample
140
+ all_choices_tr = []
141
+ all_labels = []
142
+
143
  for s in samples:
144
+ # Get choices from mc1_targets
145
+ choices = s["mc1_targets"]["choices"]
146
+ labels = s["mc1_targets"]["labels"]
147
+
148
+ # Translate choices
149
+ choices_tr = [
150
+ translate_google(choice, "en", lang) for choice in choices
151
+ ]
152
+ choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
153
+
154
+ all_choices_tr.append(choices_tr)
155
+ all_labels.append(labels)
156
 
157
  ds_lang = Dataset.from_dict(
158
  {
 
159
  "question": questions_tr,
160
+ "choices": all_choices_tr,
161
+ "labels": all_labels,
162
  }
163
  )
164
  ds_lang.push_to_hub(
evals/models.py CHANGED
@@ -8,7 +8,11 @@ from os import getenv
8
  import pandas as pd
9
  from aiolimiter import AsyncLimiter
10
  from dotenv import load_dotenv
11
- from elevenlabs import AsyncElevenLabs
 
 
 
 
12
  from google.cloud import translate_v2 as translate
13
  from huggingface_hub import AsyncInferenceClient, HfApi
14
  from joblib.memory import Memory
 
8
  import pandas as pd
9
  from aiolimiter import AsyncLimiter
10
  from dotenv import load_dotenv
11
+ # Make ElevenLabs optional to avoid hard dependency when not using speech tasks
12
+ try:
13
+ from elevenlabs import AsyncElevenLabs
14
+ except Exception: # ImportError or other env-specific issues
15
+ AsyncElevenLabs = None
16
  from google.cloud import translate_v2 as translate
17
  from huggingface_hub import AsyncInferenceClient, HfApi
18
  from joblib.memory import Memory