"""
Loaders and translation helpers for multilingual TruthfulQA: prefer the
human-translated Uhura data, fall back to pre-translated data on the Hub,
and translate on the fly with Google Translate as a last resort.
"""

import asyncio
import os

from datasets import Dataset, load_dataset
from langcodes import standardize_tag
from rich import print
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio

from datasets_.util import _get_dataset_config_names, _load_dataset
from models import get_google_supported_languages, translate_google

slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated"

# Map standardized BCP-47 tags to the human-translated multiple-choice configs
tags_uhura_truthfulqa = {
    standardize_tag(a.split("_")[0], macro=True): a
    for a in _get_dataset_config_names(slug_uhura_truthfulqa)
    if a.endswith("multiple_choice")
}

# Get available auto-translated languages
try:
    tags_truthfulqa_autotranslated = {
        standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_truthfulqa_autotranslated)
    }
except Exception:
    tags_truthfulqa_autotranslated = {}


def add_choices(row):
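    """Flatten the nested mc1_targets dict into top-level choices/labels columns."""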
    row["choices"] = row["mc1_targets"]["choices"]
    row["labels"] = row["mc1_targets"]["labels"]
    return row


async def load_truthfulqa(language_bcp_47, nr):
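    """
    Return (dataset_slug, task, origin) for sample `nr` in the given language,
    where origin is "human" or "machine". Prefers human-translated Uhura data,
    then the pre-translated Hub dataset, then on-the-fly translation.
    Returns (None, None, None) if the language is unsupported.
    """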
    if language_bcp_47 in tags_uhura_truthfulqa:
        ds = _load_dataset(
            slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47]
        )
        ds = ds.map(add_choices)
        task = ds["test"][nr]
        return "masakhane/uhura-truthfulqa", task, "human"
    elif language_bcp_47 in tags_truthfulqa_autotranslated:
        # Load from the pre-translated dataset (same samples as the batch translation job)
        ds = _load_dataset(slug_truthfulqa_autotranslated, language_bcp_47)
        test_split = ds["test"] if "test" in ds else ds
        if nr < len(test_split):
            task = test_split[nr]
            return slug_truthfulqa_autotranslated, task, "machine"
        # If requested index exceeds stored sample count, fall back to on-the-fly
        return await load_truthfulqa_translated(language_bcp_47, nr)
    else:
        # Fallback to on-the-fly translation for missing languages/samples
        return await load_truthfulqa_translated(language_bcp_47, nr)

async def load_truthfulqa_translated(language_bcp_47, nr):
    """
    Load TruthfulQA data with on-the-fly Google translation.
    """
    supported_languages = get_google_supported_languages()
    if language_bcp_47 not in supported_languages:
        return None, None, None

    print(f"πŸ”„ Translating TruthfulQA data to {language_bcp_47} on-the-fly...")

    try:
        # Load English TruthfulQA data
        ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"])
        ds = ds.map(add_choices)
        
        # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
        if nr < 20:
            task = ds["test"][nr]  # Direct mapping to same sample
        else:
            # Wrap around if nr exceeds the evaluation sample count
            task = ds["test"][nr % len(ds["test"])]

        # Translate question and choices
        question_translated = await translate_google(task["question"], "en", language_bcp_47)
        choices_translated = []
        for choice in task["choices"]:
            choice_translated = await translate_google(choice, "en", language_bcp_47)
            choices_translated.append(choice_translated)

        translated_task = {
            "question": question_translated,
            "choices": choices_translated,
            "labels": task["labels"], # Keep original labels
        }

        return f"truthfulqa-translated-{language_bcp_47}", translated_task, "machine"

    except Exception as e:
        print(f"❌ Translation failed for {language_bcp_47}: {e}")
        return None, None, None



def translate_truthfulqa(languages):
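    """
    Machine-translate the English TruthfulQA train/test subsets into the top
    languages that lack a human translation and push each split to the Hub.
    """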
    human_translated = list(tags_uhura_truthfulqa)
    untranslated = [
        lang
        for lang in languages["bcp_47"].values[:150]
        if lang not in human_translated and lang in get_google_supported_languages()
    ]
    n_samples = 20

    slug = "fair-forward/truthfulqa-autotranslated"
    for lang in tqdm(untranslated):
        # Skip languages whose translation already exists on the Hub
        try:
            load_dataset(slug, lang)
        except Exception:
            print(f"Translating {lang}...")
            for split in ["train", "test"]:
                ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"], split=split)
                samples = []
                if split == "train":
                    samples.extend(ds)
                else:
                    # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
                    for i in range(min(n_samples, len(ds))):
                        task = ds[i]
                        samples.append(task)
                
                # Translate questions
                questions_tr = [
                    translate_google(s["question"], "en", lang) for s in samples
                ]
                questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
                
                # Translate choices for each sample
                all_choices_tr = []
                all_labels = []
                
                for s in samples:
                    # Get choices from mc1_targets
                    choices = s["mc1_targets"]["choices"]
                    labels = s["mc1_targets"]["labels"]
                    
                    # Translate choices
                    choices_tr = [
                        translate_google(choice, "en", lang) for choice in choices
                    ]
                    choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
                    
                    all_choices_tr.append(choices_tr)
                    all_labels.append(labels)

                ds_lang = Dataset.from_dict(
                    {
                        "question": questions_tr,
                        "choices": all_choices_tr,
                        "labels": all_labels,
                    }
                )
                ds_lang.push_to_hub(
                    slug,
                    split=split,
                    config_name=lang,
                    token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
                )
                ds_lang.to_json(
                    f"data/translations/truthfulqa/{lang}_{split}.json",
                    lines=False,
                    force_ascii=False,
                    indent=2,
                )