
Upload from GitHub Actions: Add Todos for using existing machine-translated datasets rather than our own ones
56adaa2
verified
import random | |
from collections import Counter, defaultdict | |
from langcodes import Language, standardize_tag | |
from rich import print | |
from tqdm import tqdm | |
import asyncio | |
from tqdm.asyncio import tqdm_asyncio | |
import os | |
from datasets import Dataset, load_dataset | |
from models import translate_google, get_google_supported_languages | |
from datasets_.util import _get_dataset_config_names, _load_dataset | |
slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa" | |
slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated" | |
tags_uhura_truthfulqa = { | |
standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa) | |
if a.endswith("multiple_choice") | |
} | |
# Get available auto-translated languages | |
try: | |
tags_truthfulqa_autotranslated = { | |
standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_truthfulqa_autotranslated) | |
} | |
except Exception: | |
tags_truthfulqa_autotranslated = {} | |
def add_choices(row): | |
row["choices"] = row["mc1_targets"]["choices"] | |
row["labels"] = row["mc1_targets"]["labels"] | |
return row | |
async def load_truthfulqa(language_bcp_47, nr): | |
if language_bcp_47 in tags_uhura_truthfulqa.keys(): | |
ds = _load_dataset( | |
slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47] | |
) | |
ds = ds.map(add_choices) | |
task = ds["test"][nr] | |
return "masakhane/uhura-truthfulqa", task, "human" | |
elif language_bcp_47 in tags_truthfulqa_autotranslated.keys(): | |
# Load from auto-translated dataset (same samples as translation) | |
ds = _load_dataset(slug_truthfulqa_autotranslated, language_bcp_47) | |
test_split = ds["test"] if "test" in ds else ds | |
task = test_split[nr] | |
return slug_truthfulqa_autotranslated, task, "machine" | |
# TODO: add Okapi, TruthfulQA-X @Jonas | |
else: | |
return None, None, None | |
def translate_truthfulqa(languages): | |
human_translated = [*tags_uhura_truthfulqa.keys()] | |
untranslated = [ | |
lang | |
for lang in languages["bcp_47"].values[:150] | |
if lang not in human_translated and lang in get_google_supported_languages() | |
] | |
n_samples = 20 | |
# Set fixed seed for consistent sample selection across all languages | |
random.seed(42) | |
slug = "fair-forward/truthfulqa-autotranslated" | |
for lang in tqdm(untranslated): | |
# check if already exists on hub | |
try: | |
ds_lang = load_dataset(slug, lang) | |
except (ValueError, Exception): | |
print(f"Translating {lang}...") | |
for split in ["train", "test"]: | |
ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"], split=split) | |
samples = [] | |
if split == "train": | |
samples.extend(ds) | |
else: | |
# Use the same 20 samples that the evaluation pipeline uses (indices 0-19) | |
for i in range(min(n_samples, len(ds))): | |
task = ds[i] | |
samples.append(task) | |
# Translate questions | |
questions_tr = [ | |
translate_google(s["question"], "en", lang) for s in samples | |
] | |
questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr)) | |
# Translate choices for each sample | |
all_choices_tr = [] | |
all_labels = [] | |
for s in samples: | |
# Get choices from mc1_targets | |
choices = s["mc1_targets"]["choices"] | |
labels = s["mc1_targets"]["labels"] | |
# Translate choices | |
choices_tr = [ | |
translate_google(choice, "en", lang) for choice in choices | |
] | |
choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr)) | |
all_choices_tr.append(choices_tr) | |
all_labels.append(labels) | |
ds_lang = Dataset.from_dict( | |
{ | |
"question": questions_tr, | |
"choices": all_choices_tr, | |
"labels": all_labels, | |
} | |
) | |
ds_lang.push_to_hub( | |
slug, | |
split=split, | |
config_name=lang, | |
token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"), | |
) | |
ds_lang.to_json( | |
f"data/translations/truthfulqa/{lang}_{split}.json", | |
lines=False, | |
force_ascii=False, | |
indent=2, | |
) | |