davidpomerenke committed
Commit 56adaa2 · verified · 1 parent: 0fa7824

Upload from GitHub Actions: Add TODOs for using existing machine-translated datasets rather than our own

evals/datasets_/arc.py CHANGED
@@ -1,11 +1,10 @@
 import random
-from collections import Counter, defaultdict
 
-from langcodes import Language, standardize_tag
+from langcodes import standardize_tag
 from rich import print
 from models import translate_google, get_google_supported_languages
 from tqdm import tqdm
-from datasets import load_dataset
+from datasets import load_dataset, Dataset
 import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os
@@ -62,7 +61,6 @@ def load_uhura_arc_easy(language_bcp_47, nr):
         task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
         return "fair-forward/arc-easy-autotranslated", task, "machine"
     else:
-        # ARC does not support on-the-fly translation currently
         return None, None, None
 
 
evals/datasets_/mgsm.py CHANGED
@@ -49,13 +49,6 @@ def load_mgsm(language_bcp_47, nr):
             slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
         )
         return slug_afrimgsm, ds[nr], "human"
-    elif language_bcp_47 in tags_gsm_autotranslated.keys():
-        ds = _load_dataset(
-            slug_gsm_autotranslated,
-            subset=tags_gsm_autotranslated[language_bcp_47],
-            split="test",
-        )
-        return slug_gsm_autotranslated, ds[nr], "machine"
     elif language_bcp_47 in tags_gsm8kx.keys():
         row = _load_dataset(
             slug_gsm8kx,
@@ -64,7 +57,14 @@
             trust_remote_code=True,
         )[nr]
         row["answer_number"] = row["answer"].split("####")[1].strip()
-        return slug_gsm8kx, row, "human"  # Assuming Eurolingua is human-translated
+        return slug_gsm8kx, row, "machine"
+    elif language_bcp_47 in tags_gsm_autotranslated.keys():
+        ds = _load_dataset(
+            slug_gsm_autotranslated,
+            subset=tags_gsm_autotranslated[language_bcp_47],
+            split="test",
+        )
+        return slug_gsm_autotranslated, ds[nr], "machine"
     else:
         return None, None, None
 
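Net effect of the two hunks above: GSM8K-X is now labelled "machine" and is tried before the project's own auto-translated GSM subsets, which matches the commit's intent of preferring existing machine-translated datasets. The retained answer extraction follows the GSM8K convention of putting the final number after "####"; a minimal self-contained check (the example row is made up):

row = {"answer": "She has 6 + 6 = 12 apples.\n#### 12"}
row["answer_number"] = row["answer"].split("####")[1].strip()
assert row["answer_number"] == "12"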
evals/datasets_/mmlu.py CHANGED
@@ -164,65 +164,13 @@ async def load_mmlu(language_bcp_47, nr):
         ds = ds.map(add_choices)
         task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
         return "CohereForAI/Global-MMLU", task, "human"
+    # TODO: add in Okapi, MMLUX @Jonas
     elif language_bcp_47 in tags_mmlu_autotranslated:
         ds = _load_dataset("fair-forward/mmlu-autotranslated", language_bcp_47)
         filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        if nr < len(filtered):
-            task = filtered[nr]
-            return "fair-forward/mmlu-autotranslated", task, "machine"
-        # Requested index exceeds stored sample count → fallback to on-the-fly
-        return await load_mmlu_translated(language_bcp_47, nr)
+        task = filtered[nr]
+        return "fair-forward/mmlu-autotranslated", task, "machine"
     else:
-        # Fallback to on-the-fly translation for missing languages
-        return await load_mmlu_translated(language_bcp_47, nr)
-
-
-async def load_mmlu_translated(language_bcp_47, nr):
-    """
-    Load MMLU data with on-the-fly Google translation for languages
-    without native or stored auto-translated MMLU, or when more samples are requested.
-    """
-    supported_languages = get_google_supported_languages()
-    if language_bcp_47 not in supported_languages:
-        return None, None, None
-
-    print(f"🔄 Translating MMLU data to {language_bcp_47} on-the-fly...")
-
-    try:
-        # Load English MMLU base (AfriMMLU English split for category alignment)
-        category = categories[nr % len(categories)]
-        ds = _load_dataset("masakhane/afrimmlu", "eng")
-        ds = ds.map(parse_choices)
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        if len(filtered) == 0:
-            return None, None, None
-
-        # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
-        if nr < 20:
-            task = filtered[nr]  # Direct mapping to same sample
-        else:
-            # Fallback to sequential if nr exceeds our sample count
-            task = filtered[nr % len(filtered)]
-
-        # Translate question and choices
-        question_translated = await translate_google(task["question"], "en", language_bcp_47)
-        choices_translated = []
-        for choice in task["choices"]:
-            choice_translated = await translate_google(choice, "en", language_bcp_47)
-            choices_translated.append(choice_translated)
-
-        # Create translated task
-        translated_task = {
-            "question": question_translated,
-            "choices": choices_translated,
-            "answer": task["answer"],  # Keep original answer index
-            "subject": task["subject"],
-        }
-
-        return f"mmlu-translated-{language_bcp_47}", translated_task, "machine"
-
-    except Exception as e:
-        print(f"❌ Translation failed for {language_bcp_47}: {e}")
         return None, None, None
 
 
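With the on-the-fly translation path removed, the stored auto-translated split is indexed directly, so an `nr` beyond the number of stored rows for a category now fails instead of triggering a fallback. A self-contained sketch of the filter-then-index access pattern on a toy `datasets.Dataset` (illustrative data, not the real MMLU split):

from datasets import Dataset

ds = Dataset.from_dict({
    "subject": ["biology", "law", "biology"],
    "question": ["Q1", "Q2", "Q3"],
})
filtered = ds.filter(lambda x: x["subject"] == "biology")  # 2 rows remain
task = filtered[0]  # a plain dict: {'subject': 'biology', 'question': 'Q1'}
print(len(filtered), task)
# filtered[5] would now fail with an out-of-range error; the deleted bounds check used to guard against this.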
evals/datasets_/truthfulqa.py CHANGED
@@ -48,58 +48,12 @@ async def load_truthfulqa(language_bcp_47, nr):
         # Load from auto-translated dataset (same samples as translation)
         ds = _load_dataset(slug_truthfulqa_autotranslated, language_bcp_47)
         test_split = ds["test"] if "test" in ds else ds
-        if nr < len(test_split):
-            task = test_split[nr]
-            return slug_truthfulqa_autotranslated, task, "machine"
-        # If requested index exceeds stored sample count, fall back to on-the-fly
-        return await load_truthfulqa_translated(language_bcp_47, nr)
+        task = test_split[nr]
+        return slug_truthfulqa_autotranslated, task, "machine"
+    # TODO: add Okapi, TruthfulQA-X @Jonas
     else:
-        # Fallback to on-the-fly translation for missing languages/samples
-        return await load_truthfulqa_translated(language_bcp_47, nr)
-
-async def load_truthfulqa_translated(language_bcp_47, nr):
-    """
-    Load TruthfulQA data with on-the-fly Google translation.
-    """
-    supported_languages = get_google_supported_languages()
-    if language_bcp_47 not in supported_languages:
         return None, None, None
 
-    print(f"🔄 Translating TruthfulQA data to {language_bcp_47} on-the-fly...")
-
-    try:
-        # Load English TruthfulQA data
-        ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"])
-        ds = ds.map(add_choices)
-
-        # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
-        if nr < 20:
-            task = ds["test"][nr]  # Direct mapping to same sample
-        else:
-            # Fallback to sequential if nr exceeds our sample count
-            task = ds["test"][nr % len(ds["test"])]
-
-        # Translate question and choices
-        question_translated = await translate_google(task["question"], "en", language_bcp_47)
-        choices_translated = []
-        for choice in task["choices"]:
-            choice_translated = await translate_google(choice, "en", language_bcp_47)
-            choices_translated.append(choice_translated)
-
-        translated_task = {
-            "question": question_translated,
-            "choices": choices_translated,
-            "labels": task["labels"],  # Keep original labels
-        }
-
-        return f"truthfulqa-translated-{language_bcp_47}", translated_task, "machine"
-
-    except Exception as e:
-        print(f"❌ Translation failed for {language_bcp_47}: {e}")
-        return None, None, None
-
-
-
 def translate_truthfulqa(languages):
     human_translated = [*tags_uhura_truthfulqa.keys()]
     untranslated = [
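The simplified loader keeps the defensive split selection (`ds["test"] if "test" in ds else ds`) and then indexes the stored rows directly, mirroring the MMLU change. A toy sketch of that idiom (hypothetical rows, assuming the hub repository may expose either a DatasetDict with a "test" split or a bare Dataset):

from datasets import Dataset, DatasetDict

rows = Dataset.from_dict({"question": ["Q1", "Q2"], "choices": [["a", "b"], ["c", "d"]]})
ds = DatasetDict({"test": rows})

test_split = ds["test"] if "test" in ds else ds  # works for both layouts
task = test_split[0]
print(task["question"], task["choices"])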
evals/tasks.py CHANGED
@@ -120,32 +120,22 @@ Reply with only the topic name.
 Text:
 {test_paragraph.text}
 """
-
-    # some models have poor tokenization for some languages, and the prompt for this task is relatively long, so it sometimes exceeds the context window
-    # this is not just to blame on the context window but mostly on the model's tokenization, so we assign 0 accuracy in this case
-    try:
-        pred = await complete(
-            model=model,
-            messages=[{"role": "user", "content": prompt}],
-            temperature=0,
-            max_tokens=30,
-        )
-        true = test_paragraph.topic
-        others = [t for t in top_topics if t != true]
-        acc = (
-            int(
-                pred.startswith(true)
-                or (true in pred and not any(o in pred for o in others))
-            )
-            if pred
-            else 0
+    pred = await complete(
+        model=model,
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0,
+        max_tokens=30,
+    ).lower().strip()
+    true = test_paragraph.topic.lower().strip()
+    others = [t for t in top_topics if t != true]
+    acc = (
+        int(
+            pred.startswith(true)
+            or (true in pred and not any(o in pred for o in others))
         )
-    except Exception as e:
-        if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
-            print(f"Max tokens exceeded for {model} in {bcp_47}")
-            acc = 0
-        else:
-            raise e
+        if pred
+        else 0
+    )
     return [
         {
             "model": model,
@@ -331,7 +321,6 @@ def format_multiple_choice_truthfulqa(item):
     text = item["question"] + "\n\n"
     for i, choice in enumerate(item["choices"]):
         text += f"{letters[i]}: {choice}\n"
-    text += "|".join(letters[: len(item["choices"])]) + "?"
     return text
 
 
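One thing to note about the new inlined call: `await` binds more loosely than attribute access, so `await complete(...).lower().strip()` is parsed as `await (complete(...).lower().strip())`; if `complete` is a coroutine function (as the removed `pred = await complete(...)` suggests), the awaited result has to be parenthesised before chaining string methods. A minimal self-contained illustration with a hypothetical async stub:

import asyncio

async def complete_stub(**kwargs) -> str:
    # hypothetical stand-in for the repo's async complete() helper
    return "  Politics  "

async def main():
    # parenthesise the await before chaining .lower().strip()
    pred = (await complete_stub(max_tokens=30)).lower().strip()
    assert pred == "politics"
    # `await complete_stub().lower()` would instead call .lower() on the
    # coroutine object and raise AttributeError before anything is awaited.

asyncio.run(main())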