Upload from GitHub Actions: updated translation functions

Changed files:
- evals/datasets_/mmlu.py (+13 −4)
- evals/datasets_/truthfulqa.py (+52 −16)
- evals/models.py (+5 −1)
evals/datasets_/mmlu.py (CHANGED)

```diff
@@ -111,6 +111,7 @@ def print_datasets_analysis():
     # MMLUX is translated using DeepL
     # Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
 
+
 # print_datasets_analysis()
 
 
@@ -195,7 +196,13 @@ async def load_mmlu_translated(language_bcp_47, nr):
     filtered = ds["test"].filter(lambda x: x["subject"] == category)
     if len(filtered) == 0:
         return None, None, None
-    task = filtered[nr % len(filtered)]
+
+    # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
+    if nr < 20:
+        task = filtered[nr]  # Direct mapping to same sample
+    else:
+        # Fallback to sequential if nr exceeds our sample count
+        task = filtered[nr % len(filtered)]
 
     # Translate question and choices
     question_translated = await translate_google(task["question"], "en", language_bcp_47)
@@ -226,7 +233,7 @@ def translate_mmlu(languages):
         for lang in languages["bcp_47"].values[:150]
         if lang not in human_translated and lang in get_google_supported_languages()
     ]
-    n_samples =
+    n_samples = 20
 
     slug = "fair-forward/mmlu-autotranslated"
     for lang in tqdm(untranslated):
@@ -242,8 +249,10 @@ def translate_mmlu(languages):
             if split == "dev":
                 samples.extend(ds.filter(lambda x: x["subject"] == category))
             else:
-
-
+                # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
+                filtered = ds.filter(lambda x: x["subject"] == category)
+                for i in range(min(n_samples, len(filtered))):
+                    task = filtered[i]
                     samples.append(task)
             questions_tr = [
                 translate_google(s["question"], "en", lang) for s in samples
```
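Taken together, the mmlu.py hunks make sample selection deterministic: the translation job uploads the first 20 test samples per subject, and the loader maps evaluation indices straight onto them. A minimal sketch of that selection rule follows; the `pick_sample` helper and `N_SAMPLES` constant are illustrative assumptions, not names from the diff, which inlines this logic instead.

```python
N_SAMPLES = 20  # mirrors n_samples in translate_mmlu

def pick_sample(filtered, nr):
    """Return the task at index nr, aligned with the stored translations."""
    if nr < N_SAMPLES:
        # Indices 0-19 map directly onto the 20 uploaded samples, so the
        # evaluation pipeline and the translation job see identical questions.
        return filtered[nr]
    # Outside that range, wrap around instead of raising an IndexError.
    return filtered[nr % len(filtered)]

# Example: with 50 candidate tasks, index 7 maps directly, index 57 wraps to 7.
tasks = list(range(50))
assert pick_sample(tasks, 7) == 7
assert pick_sample(tasks, 57) == 7
```

The modulo branch trades an IndexError for silent reuse of earlier samples, which matches the "fallback to sequential" comment in the diff.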
evals/datasets_/truthfulqa.py (CHANGED)

```diff
@@ -14,11 +14,21 @@ from models import translate_google, get_google_supported_languages
 from datasets_.util import _get_dataset_config_names, _load_dataset
 
 slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
+slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated"
+
 tags_uhura_truthfulqa = {
     standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
     if a.endswith("multiple_choice")
 }
 
+# Get available auto-translated languages
+try:
+    tags_truthfulqa_autotranslated = {
+        standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_truthfulqa_autotranslated)
+    }
+except Exception:
+    tags_truthfulqa_autotranslated = {}
+
 
 def add_choices(row):
     row["choices"] = row["mc1_targets"]["choices"]
@@ -34,6 +44,15 @@ async def load_truthfulqa(language_bcp_47, nr):
         ds = ds.map(add_choices)
         task = ds["test"][nr]
         return "masakhane/uhura-truthfulqa", task, "human"
+    elif language_bcp_47 in tags_truthfulqa_autotranslated.keys():
+        # Load from auto-translated dataset (same samples as translation)
+        ds = _load_dataset(slug_truthfulqa_autotranslated, language_bcp_47)
+        test_split = ds["test"] if "test" in ds else ds
+        if nr < len(test_split):
+            task = test_split[nr]
+            return slug_truthfulqa_autotranslated, task, "machine"
+        # If requested index exceeds stored sample count, fall back to on-the-fly
+        return await load_truthfulqa_translated(language_bcp_47, nr)
     else:
         # Fallback to on-the-fly translation for missing languages/samples
         return await load_truthfulqa_translated(language_bcp_47, nr)
@@ -52,7 +71,13 @@ async def load_truthfulqa_translated(language_bcp_47, nr):
     # Load English TruthfulQA data
     ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"])
     ds = ds.map(add_choices)
-    task = ds["test"][nr % len(ds["test"])]
+
+    # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
+    if nr < 20:
+        task = ds["test"][nr]  # Direct mapping to same sample
+    else:
+        # Fallback to sequential if nr exceeds our sample count
+        task = ds["test"][nr % len(ds["test"])]
 
     # Translate question and choices
     question_translated = await translate_google(task["question"], "en", language_bcp_47)
@@ -84,6 +109,9 @@ def translate_truthfulqa(languages):
     ]
     n_samples = 20
 
+    # Set fixed seed for consistent sample selection across all languages
+    random.seed(42)
+
     slug = "fair-forward/truthfulqa-autotranslated"
     for lang in tqdm(untranslated):
         # check if already exists on hub
@@ -97,32 +125,40 @@ def translate_truthfulqa(languages):
             if split == "train":
                 samples.extend(ds)
             else:
-
+                # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
+                for i in range(min(n_samples, len(ds))):
                     task = ds[i]
                     samples.append(task)
+
+        # Translate questions
         questions_tr = [
             translate_google(s["question"], "en", lang) for s in samples
         ]
        questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
-
+
+        # Translate choices for each sample
+        all_choices_tr = []
+        all_labels = []
+
         for s in samples:
-
-
-
-
-
-
-
-
-            choices_tr
-
+            # Get choices from mc1_targets
+            choices = s["mc1_targets"]["choices"]
+            labels = s["mc1_targets"]["labels"]
+
+            # Translate choices
+            choices_tr = [
+                translate_google(choice, "en", lang) for choice in choices
+            ]
+            choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
+
+            all_choices_tr.append(choices_tr)
+            all_labels.append(labels)
 
         ds_lang = Dataset.from_dict(
             {
-                "subject": [s["subject"] for s in samples],
                 "question": questions_tr,
-                "choices":
-                "
+                "choices": all_choices_tr,
+                "labels": all_labels,
             }
         )
        ds_lang.push_to_hub(
```
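With the new `elif` branch, `load_truthfulqa` resolves a language against three sources in priority order: human-translated Uhura data, the auto-translated dataset on the hub, and on-the-fly translation as a last resort. A simplified synchronous sketch of that resolution is below; the `resolve_source` helper and `n_stored` parameter are illustrative names, and the real function is async and also returns the selected task.

```python
def resolve_source(language_bcp_47, nr, human_tags, machine_tags, n_stored=20):
    """Pick the dataset source for one (language, sample index) request."""
    if language_bcp_47 in human_tags:
        # Tier 1: human translations from masakhane/uhura-truthfulqa
        return "masakhane/uhura-truthfulqa", "human"
    if language_bcp_47 in machine_tags and nr < n_stored:
        # Tier 2: pre-built machine translations pushed to the hub
        return "fair-forward/truthfulqa-autotranslated", "machine"
    # Tier 3: translate the English sample on the fly
    return "on-the-fly", "machine"

# Example: a language with only auto-translated data, asking for sample 5
assert resolve_source("xx", 5, human_tags=set(), machine_tags={"xx"}) == (
    "fair-forward/truthfulqa-autotranslated",
    "machine",
)
```

Keeping the stored hub data ahead of on-the-fly translation means repeated evaluations hit the same fixed samples instead of re-translating, which is the same alignment idea as the mmlu.py change.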
evals/models.py (CHANGED)

```diff
@@ -8,7 +8,11 @@ from os import getenv
 import pandas as pd
 from aiolimiter import AsyncLimiter
 from dotenv import load_dotenv
-from elevenlabs import AsyncElevenLabs
+# Make ElevenLabs optional to avoid hard dependency when not using speech tasks
+try:
+    from elevenlabs import AsyncElevenLabs
+except Exception:  # ImportError or other env-specific issues
+    AsyncElevenLabs = None
 from google.cloud import translate_v2 as translate
 from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory
```
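Because `AsyncElevenLabs` may now be `None`, any speech-task code path needs a guard before constructing the client. A sketch of such a guard follows; this usage is an assumption, since the diff itself only makes the import optional.

```python
def make_speech_client(api_key):
    """Construct the ElevenLabs client, failing loudly if the package is absent."""
    if AsyncElevenLabs is None:
        # Illustrative guard, not part of the diff: the optional import above
        # sets AsyncElevenLabs to None when elevenlabs is not installed.
        raise RuntimeError(
            "elevenlabs is not installed; install it to run speech tasks"
        )
    return AsyncElevenLabs(api_key=api_key)
```

Catching broad `Exception` rather than just `ImportError` matches the comment in the diff about environment-specific failures, at the cost of also hiding genuine bugs inside the `elevenlabs` package at import time.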