David Pomerenke
committed on
Commit
·
723f963
1
Parent(s):
b4a0c57
Process data for country map
Browse files- evals/countries.py +50 -0
- evals/languages.py +0 -9
- evals/main.py +39 -7
- frontend/public/results.json +0 -0
- results.json +21 -21
evals/countries.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import xml.etree.ElementTree as ET
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
|
| 5 |
+
from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
|
| 6 |
+
from language_data.util import data_filename
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def get_population_data(filename=None):
    """Return the population of every territory in a supplemental-data XML file.

    Args:
        filename: Path of the XML file to read. When omitted, the
            ``supplementalData.xml`` file located through ``data_filename``
            is used, which is the original behaviour.

    Returns:
        A dict mapping the ``type`` attribute of each
        ``./territoryInfo/territory`` element to its ``population``
        attribute converted to ``float``.

    Raises:
        KeyError: If a territory element lacks ``type`` or ``population``.
        ValueError: If a ``population`` attribute is not numeric.
    """
    if filename is None:
        filename = data_filename("supplementalData.xml")
    # ET.parse reads the file as bytes, honours the encoding declared in the
    # XML prolog and closes the handle itself; the previous
    # ``open(filename).read()`` leaked the handle and decoded with the
    # platform's locale encoding.
    root = ET.parse(filename).getroot()
    return {
        territory.attrib["type"]: float(territory.attrib["population"])
        for territory in root.findall("./territoryInfo/territory")
    }
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def population(bcp_47, populations=None):
    """Return the per-region speaker counts recorded for one language tag.

    Args:
        bcp_47: Language tag without a region, e.g. ``"de"`` or ``"sr-Latn"``.
        populations: Mapping of language tags to speaker counts. Defaults to
            ``LANGUAGE_SPEAKING_POPULATION``, which is the original behaviour.

    Returns:
        A dict mapping each two-letter upper-case region code to the speaker
        count stored under the key ``"<bcp_47>-<REGION>"``. Keys that are not
        exactly the tag followed by one region subtag are ignored.
    """
    if populations is None:
        populations = LANGUAGE_SPEAKING_POPULATION
    # Escape the tag so that it is matched literally, and capture the region
    # directly: stripping only the leading ``[a-z]+-`` subtag left script
    # subtags in the key (``sr-Latn-RS`` became ``Latn-RS`` instead of ``RS``).
    region_tag = re.compile(rf"{re.escape(str(bcp_47))}-([A-Z]{{2}})")
    regions = {}
    for tag, speakers in populations.items():
        found = region_tag.fullmatch(tag)
        if found:
            regions[found.group(1)] = speakers
    return regions
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def make_country_table(language_table):
    """Aggregate per-language scores into one population-weighted score per country.

    Args:
        language_table: Object whose ``itertuples()`` yields rows exposing the
            attributes ``bcp_47``, ``language_name`` and ``average``.

    Returns:
        A plain dict keyed by the region codes returned by ``population``.
        Each value is a dict with:
            ``"score"``: the average of the languages' ``average`` values
                weighted by their population in that country, or ``None``
                when the summed population is 0.
            ``"languages"``: a list of dicts with the keys ``"name"``,
                ``"bcp_47"``, ``"population"`` and ``"score"``.
    """
    by_country = defaultdict(list)
    for lang in language_table.itertuples():
        for country, pop in population(lang.bcp_47).items():
            by_country[country].append(
                {
                    "name": lang.language_name,
                    "bcp_47": lang.bcp_47,
                    "population": pop,
                    "score": lang.average,
                }
            )

    # Build a fresh plain dict instead of overwriting the defaultdict's values:
    # a returned defaultdict(list) would invent ``[]`` entries on any lookup of
    # an unknown country.
    table = {}
    for country, languages in by_country.items():
        total = sum(entry["population"] for entry in languages)
        weighted = sum(entry["score"] * entry["population"] for entry in languages)
        table[country] = {
            # Guard against a zero total, which previously raised
            # ZeroDivisionError and aborted the whole export.
            # NOTE(review): a missing (NaN) language score still propagates
            # into the country score - confirm whether that is intended.
            "score": weighted / total if total else None,
            "languages": languages,
        }
    return table
|
evals/languages.py
CHANGED
|
@@ -46,15 +46,6 @@ scripts = pd.read_csv("data/ScriptCodes.csv").rename(
|
|
| 46 |
columns={"Code": "iso15924", "English Name": "script_name"}
|
| 47 |
)
|
| 48 |
|
| 49 |
-
|
| 50 |
-
def population(bcp_47):
|
| 51 |
-
items = {
|
| 52 |
-
re.sub(r"^[a-z]+-", "", lang): pop
|
| 53 |
-
for lang, pop in LANGUAGE_SPEAKING_POPULATION.items()
|
| 54 |
-
if re.match(rf"^{bcp_47}-[A-Z]{{2}}$", lang)
|
| 55 |
-
}
|
| 56 |
-
return items
|
| 57 |
-
|
| 58 |
def script_name(iso15924):
|
| 59 |
return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
|
| 60 |
|
|
|
|
| 46 |
columns={"Code": "iso15924", "English Name": "script_name"}
|
| 47 |
)
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
def script_name(iso15924):
|
| 50 |
return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
|
| 51 |
|
evals/main.py
CHANGED
|
@@ -3,11 +3,12 @@ import json
|
|
| 3 |
|
| 4 |
import numpy as np
|
| 5 |
import pandas as pd
|
| 6 |
-
from
|
| 7 |
-
from tqdm.asyncio import tqdm_asyncio
|
| 8 |
from languages import languages
|
|
|
|
|
|
|
| 9 |
from tasks import tasks
|
| 10 |
-
from
|
| 11 |
|
| 12 |
# ===== config =====
|
| 13 |
|
|
@@ -91,7 +92,20 @@ def make_model_table(df):
|
|
| 91 |
df["provider"] = df["model"].str.split("/").str[0].apply(fmt_name)
|
| 92 |
df["model"] = df["model"].str.split("/").str[1].apply(fmt_name)
|
| 93 |
df["rank"] = df.index + 1
|
| 94 |
-
df = df[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
return df
|
| 96 |
|
| 97 |
|
|
@@ -99,15 +113,31 @@ def make_language_table(df):
|
|
| 99 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
| 100 |
df = df.drop(columns=["task", "metric"])
|
| 101 |
task_metrics = df["task_metric"].unique()
|
| 102 |
-
df =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
df["average"] = df[task_metrics].mean(axis=1)
|
| 104 |
for row in [*task_metrics, "average"]:
|
| 105 |
df[row] = df[row].round(2)
|
| 106 |
df = pd.merge(languages, df, on="bcp_47", how="outer")
|
| 107 |
df = df.sort_values(by="speakers", ascending=False)
|
| 108 |
-
df = df[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
return df
|
| 110 |
|
|
|
|
| 111 |
async def main():
|
| 112 |
results = await evaluate()
|
| 113 |
results, lang_results, model_results, task_results = aggregate(results)
|
|
@@ -121,10 +151,12 @@ async def main():
|
|
| 121 |
json.dump(all_results, f, indent=2, ensure_ascii=False)
|
| 122 |
|
| 123 |
datasets_df = pd.read_json("data/datasets.json")
|
|
|
|
| 124 |
all_tables = {
|
| 125 |
"model_table": serialize(make_model_table(model_results)),
|
| 126 |
-
"language_table": serialize(
|
| 127 |
"dataset_table": serialize(datasets_df),
|
|
|
|
| 128 |
}
|
| 129 |
with open("frontend/public/results.json", "w") as f:
|
| 130 |
json.dump(all_tables, f, indent=2, ensure_ascii=False)
|
|
|
|
| 3 |
|
| 4 |
import numpy as np
|
| 5 |
import pandas as pd
|
| 6 |
+
from countries import make_country_table
|
|
|
|
| 7 |
from languages import languages
|
| 8 |
+
from models import model_fast, models
|
| 9 |
+
from rich import print
|
| 10 |
from tasks import tasks
|
| 11 |
+
from tqdm.asyncio import tqdm_asyncio
|
| 12 |
|
| 13 |
# ===== config =====
|
| 14 |
|
|
|
|
| 92 |
df["provider"] = df["model"].str.split("/").str[0].apply(fmt_name)
|
| 93 |
df["model"] = df["model"].str.split("/").str[1].apply(fmt_name)
|
| 94 |
df["rank"] = df.index + 1
|
| 95 |
+
df = df[
|
| 96 |
+
[
|
| 97 |
+
"rank",
|
| 98 |
+
"provider",
|
| 99 |
+
"model",
|
| 100 |
+
"hf_id",
|
| 101 |
+
"creation_date",
|
| 102 |
+
"size",
|
| 103 |
+
"type",
|
| 104 |
+
"license",
|
| 105 |
+
"average",
|
| 106 |
+
*task_metrics,
|
| 107 |
+
]
|
| 108 |
+
]
|
| 109 |
return df
|
| 110 |
|
| 111 |
|
|
|
|
| 113 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
| 114 |
df = df.drop(columns=["task", "metric"])
|
| 115 |
task_metrics = df["task_metric"].unique()
|
| 116 |
+
df = (
|
| 117 |
+
df.pivot(index="bcp_47", columns="task_metric", values="score")
|
| 118 |
+
.fillna(0)
|
| 119 |
+
.reset_index()
|
| 120 |
+
)
|
| 121 |
df["average"] = df[task_metrics].mean(axis=1)
|
| 122 |
for row in [*task_metrics, "average"]:
|
| 123 |
df[row] = df[row].round(2)
|
| 124 |
df = pd.merge(languages, df, on="bcp_47", how="outer")
|
| 125 |
df = df.sort_values(by="speakers", ascending=False)
|
| 126 |
+
df = df[
|
| 127 |
+
[
|
| 128 |
+
"bcp_47",
|
| 129 |
+
"language_name",
|
| 130 |
+
"autonym",
|
| 131 |
+
"speakers",
|
| 132 |
+
"family",
|
| 133 |
+
"average",
|
| 134 |
+
"in_benchmark",
|
| 135 |
+
*task_metrics,
|
| 136 |
+
]
|
| 137 |
+
]
|
| 138 |
return df
|
| 139 |
|
| 140 |
+
|
| 141 |
async def main():
|
| 142 |
results = await evaluate()
|
| 143 |
results, lang_results, model_results, task_results = aggregate(results)
|
|
|
|
| 151 |
json.dump(all_results, f, indent=2, ensure_ascii=False)
|
| 152 |
|
| 153 |
datasets_df = pd.read_json("data/datasets.json")
|
| 154 |
+
language_table = make_language_table(lang_results)
|
| 155 |
all_tables = {
|
| 156 |
"model_table": serialize(make_model_table(model_results)),
|
| 157 |
+
"language_table": serialize(language_table),
|
| 158 |
"dataset_table": serialize(datasets_df),
|
| 159 |
+
"countries": make_country_table(language_table),
|
| 160 |
}
|
| 161 |
with open("frontend/public/results.json", "w") as f:
|
| 162 |
json.dump(all_tables, f, indent=2, ensure_ascii=False)
|
frontend/public/results.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results.json
CHANGED
|
@@ -616,7 +616,7 @@
|
|
| 616 |
"family": "Indo-European",
|
| 617 |
"flores_path": null,
|
| 618 |
"fleurs_tag": null,
|
| 619 |
-
"commonvoice_hours":
|
| 620 |
"commonvoice_locale": "an",
|
| 621 |
"in_benchmark": false,
|
| 622 |
"task": null,
|
|
@@ -1992,7 +1992,7 @@
|
|
| 1992 |
"family": "Indo-European",
|
| 1993 |
"flores_path": "cat_Latn",
|
| 1994 |
"fleurs_tag": "ca_es",
|
| 1995 |
-
"commonvoice_hours":
|
| 1996 |
"commonvoice_locale": "ca",
|
| 1997 |
"in_benchmark": true,
|
| 1998 |
"task": null,
|
|
@@ -2584,7 +2584,7 @@
|
|
| 2584 |
"family": "Indo-European",
|
| 2585 |
"flores_path": "deu_Latn",
|
| 2586 |
"fleurs_tag": "de_de",
|
| 2587 |
-
"commonvoice_hours":
|
| 2588 |
"commonvoice_locale": "de",
|
| 2589 |
"in_benchmark": true,
|
| 2590 |
"task": null,
|
|
@@ -2904,7 +2904,7 @@
|
|
| 2904 |
"family": "Indo-European",
|
| 2905 |
"flores_path": "eng_Latn",
|
| 2906 |
"fleurs_tag": "en_us",
|
| 2907 |
-
"commonvoice_hours":
|
| 2908 |
"commonvoice_locale": "en",
|
| 2909 |
"in_benchmark": true,
|
| 2910 |
"task": "classification",
|
|
@@ -2920,7 +2920,7 @@
|
|
| 2920 |
"family": "Indo-European",
|
| 2921 |
"flores_path": "eng_Latn",
|
| 2922 |
"fleurs_tag": "en_us",
|
| 2923 |
-
"commonvoice_hours":
|
| 2924 |
"commonvoice_locale": "en",
|
| 2925 |
"in_benchmark": true,
|
| 2926 |
"task": "language_modeling",
|
|
@@ -2936,7 +2936,7 @@
|
|
| 2936 |
"family": "Indo-European",
|
| 2937 |
"flores_path": "eng_Latn",
|
| 2938 |
"fleurs_tag": "en_us",
|
| 2939 |
-
"commonvoice_hours":
|
| 2940 |
"commonvoice_locale": "en",
|
| 2941 |
"in_benchmark": true,
|
| 2942 |
"task": "translation",
|
|
@@ -2952,7 +2952,7 @@
|
|
| 2952 |
"family": "Indo-European",
|
| 2953 |
"flores_path": "eng_Latn",
|
| 2954 |
"fleurs_tag": "en_us",
|
| 2955 |
-
"commonvoice_hours":
|
| 2956 |
"commonvoice_locale": "en",
|
| 2957 |
"in_benchmark": true,
|
| 2958 |
"task": "translation",
|
|
@@ -3320,7 +3320,7 @@
|
|
| 3320 |
"family": "Indo-European",
|
| 3321 |
"flores_path": "fra_Latn",
|
| 3322 |
"fleurs_tag": "fr_fr",
|
| 3323 |
-
"commonvoice_hours":
|
| 3324 |
"commonvoice_locale": "fr",
|
| 3325 |
"in_benchmark": true,
|
| 3326 |
"task": "classification",
|
|
@@ -3336,7 +3336,7 @@
|
|
| 3336 |
"family": "Indo-European",
|
| 3337 |
"flores_path": "fra_Latn",
|
| 3338 |
"fleurs_tag": "fr_fr",
|
| 3339 |
-
"commonvoice_hours":
|
| 3340 |
"commonvoice_locale": "fr",
|
| 3341 |
"in_benchmark": true,
|
| 3342 |
"task": "language_modeling",
|
|
@@ -3352,7 +3352,7 @@
|
|
| 3352 |
"family": "Indo-European",
|
| 3353 |
"flores_path": "fra_Latn",
|
| 3354 |
"fleurs_tag": "fr_fr",
|
| 3355 |
-
"commonvoice_hours":
|
| 3356 |
"commonvoice_locale": "fr",
|
| 3357 |
"in_benchmark": true,
|
| 3358 |
"task": "translation",
|
|
@@ -3368,7 +3368,7 @@
|
|
| 3368 |
"family": "Indo-European",
|
| 3369 |
"flores_path": "fra_Latn",
|
| 3370 |
"fleurs_tag": "fr_fr",
|
| 3371 |
-
"commonvoice_hours":
|
| 3372 |
"commonvoice_locale": "fr",
|
| 3373 |
"in_benchmark": true,
|
| 3374 |
"task": "translation",
|
|
@@ -4952,7 +4952,7 @@
|
|
| 4952 |
"family": "Abkhaz-Adyge",
|
| 4953 |
"flores_path": null,
|
| 4954 |
"fleurs_tag": null,
|
| 4955 |
-
"commonvoice_hours":
|
| 4956 |
"commonvoice_locale": "kbd",
|
| 4957 |
"in_benchmark": false,
|
| 4958 |
"task": null,
|
|
@@ -5752,7 +5752,7 @@
|
|
| 5752 |
"family": "Indo-European",
|
| 5753 |
"flores_path": null,
|
| 5754 |
"fleurs_tag": null,
|
| 5755 |
-
"commonvoice_hours":
|
| 5756 |
"commonvoice_locale": "kw",
|
| 5757 |
"in_benchmark": false,
|
| 5758 |
"task": null,
|
|
@@ -6296,7 +6296,7 @@
|
|
| 6296 |
"family": "Indo-European",
|
| 6297 |
"flores_path": "ltg_Latn",
|
| 6298 |
"fleurs_tag": null,
|
| 6299 |
-
"commonvoice_hours":
|
| 6300 |
"commonvoice_locale": "ltg",
|
| 6301 |
"in_benchmark": true,
|
| 6302 |
"task": null,
|
|
@@ -8360,7 +8360,7 @@
|
|
| 8360 |
"family": "Indo-European",
|
| 8361 |
"flores_path": null,
|
| 8362 |
"fleurs_tag": "ps_af",
|
| 8363 |
-
"commonvoice_hours":
|
| 8364 |
"commonvoice_locale": "ps",
|
| 8365 |
"in_benchmark": false,
|
| 8366 |
"task": null,
|
|
@@ -8952,7 +8952,7 @@
|
|
| 8952 |
"family": "Turkic",
|
| 8953 |
"flores_path": null,
|
| 8954 |
"fleurs_tag": null,
|
| 8955 |
-
"commonvoice_hours":
|
| 8956 |
"commonvoice_locale": "sah",
|
| 8957 |
"in_benchmark": false,
|
| 8958 |
"task": null,
|
|
@@ -9224,7 +9224,7 @@
|
|
| 9224 |
"family": null,
|
| 9225 |
"flores_path": null,
|
| 9226 |
"fleurs_tag": null,
|
| 9227 |
-
"commonvoice_hours": 1.
|
| 9228 |
"commonvoice_locale": "sei",
|
| 9229 |
"in_benchmark": false,
|
| 9230 |
"task": null,
|
|
@@ -9352,7 +9352,7 @@
|
|
| 9352 |
"family": "Indo-European",
|
| 9353 |
"flores_path": "slk_Latn",
|
| 9354 |
"fleurs_tag": "sk_sk",
|
| 9355 |
-
"commonvoice_hours":
|
| 9356 |
"commonvoice_locale": "sk",
|
| 9357 |
"in_benchmark": true,
|
| 9358 |
"task": null,
|
|
@@ -10200,7 +10200,7 @@
|
|
| 10200 |
"family": "Afro-Asiatic",
|
| 10201 |
"flores_path": null,
|
| 10202 |
"fleurs_tag": null,
|
| 10203 |
-
"commonvoice_hours":
|
| 10204 |
"commonvoice_locale": "tig",
|
| 10205 |
"in_benchmark": false,
|
| 10206 |
"task": null,
|
|
@@ -10712,7 +10712,7 @@
|
|
| 10712 |
"family": "Turkic",
|
| 10713 |
"flores_path": "uig_Arab",
|
| 10714 |
"fleurs_tag": null,
|
| 10715 |
-
"commonvoice_hours":
|
| 10716 |
"commonvoice_locale": "ug",
|
| 10717 |
"in_benchmark": true,
|
| 10718 |
"task": null,
|
|
@@ -10728,7 +10728,7 @@
|
|
| 10728 |
"family": "Indo-European",
|
| 10729 |
"flores_path": "ukr_Cyrl",
|
| 10730 |
"fleurs_tag": "uk_ua",
|
| 10731 |
-
"commonvoice_hours":
|
| 10732 |
"commonvoice_locale": "uk",
|
| 10733 |
"in_benchmark": true,
|
| 10734 |
"task": null,
|
|
|
|
| 616 |
"family": "Indo-European",
|
| 617 |
"flores_path": null,
|
| 618 |
"fleurs_tag": null,
|
| 619 |
+
"commonvoice_hours": 14.0,
|
| 620 |
"commonvoice_locale": "an",
|
| 621 |
"in_benchmark": false,
|
| 622 |
"task": null,
|
|
|
|
| 1992 |
"family": "Indo-European",
|
| 1993 |
"flores_path": "cat_Latn",
|
| 1994 |
"fleurs_tag": "ca_es",
|
| 1995 |
+
"commonvoice_hours": 2844.0,
|
| 1996 |
"commonvoice_locale": "ca",
|
| 1997 |
"in_benchmark": true,
|
| 1998 |
"task": null,
|
|
|
|
| 2584 |
"family": "Indo-European",
|
| 2585 |
"flores_path": "deu_Latn",
|
| 2586 |
"fleurs_tag": "de_de",
|
| 2587 |
+
"commonvoice_hours": 1360.0,
|
| 2588 |
"commonvoice_locale": "de",
|
| 2589 |
"in_benchmark": true,
|
| 2590 |
"task": null,
|
|
|
|
| 2904 |
"family": "Indo-European",
|
| 2905 |
"flores_path": "eng_Latn",
|
| 2906 |
"fleurs_tag": "en_us",
|
| 2907 |
+
"commonvoice_hours": 2653.0,
|
| 2908 |
"commonvoice_locale": "en",
|
| 2909 |
"in_benchmark": true,
|
| 2910 |
"task": "classification",
|
|
|
|
| 2920 |
"family": "Indo-European",
|
| 2921 |
"flores_path": "eng_Latn",
|
| 2922 |
"fleurs_tag": "en_us",
|
| 2923 |
+
"commonvoice_hours": 2653.0,
|
| 2924 |
"commonvoice_locale": "en",
|
| 2925 |
"in_benchmark": true,
|
| 2926 |
"task": "language_modeling",
|
|
|
|
| 2936 |
"family": "Indo-European",
|
| 2937 |
"flores_path": "eng_Latn",
|
| 2938 |
"fleurs_tag": "en_us",
|
| 2939 |
+
"commonvoice_hours": 2653.0,
|
| 2940 |
"commonvoice_locale": "en",
|
| 2941 |
"in_benchmark": true,
|
| 2942 |
"task": "translation",
|
|
|
|
| 2952 |
"family": "Indo-European",
|
| 2953 |
"flores_path": "eng_Latn",
|
| 2954 |
"fleurs_tag": "en_us",
|
| 2955 |
+
"commonvoice_hours": 2653.0,
|
| 2956 |
"commonvoice_locale": "en",
|
| 2957 |
"in_benchmark": true,
|
| 2958 |
"task": "translation",
|
|
|
|
| 3320 |
"family": "Indo-European",
|
| 3321 |
"flores_path": "fra_Latn",
|
| 3322 |
"fleurs_tag": "fr_fr",
|
| 3323 |
+
"commonvoice_hours": 1053.0,
|
| 3324 |
"commonvoice_locale": "fr",
|
| 3325 |
"in_benchmark": true,
|
| 3326 |
"task": "classification",
|
|
|
|
| 3336 |
"family": "Indo-European",
|
| 3337 |
"flores_path": "fra_Latn",
|
| 3338 |
"fleurs_tag": "fr_fr",
|
| 3339 |
+
"commonvoice_hours": 1053.0,
|
| 3340 |
"commonvoice_locale": "fr",
|
| 3341 |
"in_benchmark": true,
|
| 3342 |
"task": "language_modeling",
|
|
|
|
| 3352 |
"family": "Indo-European",
|
| 3353 |
"flores_path": "fra_Latn",
|
| 3354 |
"fleurs_tag": "fr_fr",
|
| 3355 |
+
"commonvoice_hours": 1053.0,
|
| 3356 |
"commonvoice_locale": "fr",
|
| 3357 |
"in_benchmark": true,
|
| 3358 |
"task": "translation",
|
|
|
|
| 3368 |
"family": "Indo-European",
|
| 3369 |
"flores_path": "fra_Latn",
|
| 3370 |
"fleurs_tag": "fr_fr",
|
| 3371 |
+
"commonvoice_hours": 1053.0,
|
| 3372 |
"commonvoice_locale": "fr",
|
| 3373 |
"in_benchmark": true,
|
| 3374 |
"task": "translation",
|
|
|
|
| 4952 |
"family": "Abkhaz-Adyge",
|
| 4953 |
"flores_path": null,
|
| 4954 |
"fleurs_tag": null,
|
| 4955 |
+
"commonvoice_hours": 18.0,
|
| 4956 |
"commonvoice_locale": "kbd",
|
| 4957 |
"in_benchmark": false,
|
| 4958 |
"task": null,
|
|
|
|
| 5752 |
"family": "Indo-European",
|
| 5753 |
"flores_path": null,
|
| 5754 |
"fleurs_tag": null,
|
| 5755 |
+
"commonvoice_hours": 3.4,
|
| 5756 |
"commonvoice_locale": "kw",
|
| 5757 |
"in_benchmark": false,
|
| 5758 |
"task": null,
|
|
|
|
| 6296 |
"family": "Indo-European",
|
| 6297 |
"flores_path": "ltg_Latn",
|
| 6298 |
"fleurs_tag": null,
|
| 6299 |
+
"commonvoice_hours": 29.0,
|
| 6300 |
"commonvoice_locale": "ltg",
|
| 6301 |
"in_benchmark": true,
|
| 6302 |
"task": null,
|
|
|
|
| 8360 |
"family": "Indo-European",
|
| 8361 |
"flores_path": null,
|
| 8362 |
"fleurs_tag": "ps_af",
|
| 8363 |
+
"commonvoice_hours": 80.0,
|
| 8364 |
"commonvoice_locale": "ps",
|
| 8365 |
"in_benchmark": false,
|
| 8366 |
"task": null,
|
|
|
|
| 8952 |
"family": "Turkic",
|
| 8953 |
"flores_path": null,
|
| 8954 |
"fleurs_tag": null,
|
| 8955 |
+
"commonvoice_hours": 11.0,
|
| 8956 |
"commonvoice_locale": "sah",
|
| 8957 |
"in_benchmark": false,
|
| 8958 |
"task": null,
|
|
|
|
| 9224 |
"family": null,
|
| 9225 |
"flores_path": null,
|
| 9226 |
"fleurs_tag": null,
|
| 9227 |
+
"commonvoice_hours": 1.4,
|
| 9228 |
"commonvoice_locale": "sei",
|
| 9229 |
"in_benchmark": false,
|
| 9230 |
"task": null,
|
|
|
|
| 9352 |
"family": "Indo-European",
|
| 9353 |
"flores_path": "slk_Latn",
|
| 9354 |
"fleurs_tag": "sk_sk",
|
| 9355 |
+
"commonvoice_hours": 47.0,
|
| 9356 |
"commonvoice_locale": "sk",
|
| 9357 |
"in_benchmark": true,
|
| 9358 |
"task": null,
|
|
|
|
| 10200 |
"family": "Afro-Asiatic",
|
| 10201 |
"flores_path": null,
|
| 10202 |
"fleurs_tag": null,
|
| 10203 |
+
"commonvoice_hours": 11.0,
|
| 10204 |
"commonvoice_locale": "tig",
|
| 10205 |
"in_benchmark": false,
|
| 10206 |
"task": null,
|
|
|
|
| 10712 |
"family": "Turkic",
|
| 10713 |
"flores_path": "uig_Arab",
|
| 10714 |
"fleurs_tag": null,
|
| 10715 |
+
"commonvoice_hours": 365.0,
|
| 10716 |
"commonvoice_locale": "ug",
|
| 10717 |
"in_benchmark": true,
|
| 10718 |
"task": null,
|
|
|
|
| 10728 |
"family": "Indo-European",
|
| 10729 |
"flores_path": "ukr_Cyrl",
|
| 10730 |
"fleurs_tag": "uk_ua",
|
| 10731 |
+
"commonvoice_hours": 99.0,
|
| 10732 |
"commonvoice_locale": "uk",
|
| 10733 |
"in_benchmark": true,
|
| 10734 |
"task": null,
|