Upload from GitHub Actions: Use FLORES+ via Huggingface
Browse files- README.md +10 -1
- evals/backend.py +1 -1
- evals/datasets_/flores.py +16 -17
- evals/download_data.py +0 -83
- evals/main.py +5 -6
- evals/models.py +17 -9
- evals/tasks.py +16 -25
- languages.json +196 -196
- models.json +0 -55
- results.json +0 -0
README.md
CHANGED
|
@@ -43,6 +43,15 @@ For tag meaning, see https://huggingface.co/spaces/leaderboards/LeaderboardsExpl
|
|
| 43 |
|
| 44 |
_Tracking language proficiency of AI models for every language_
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
```bash
|
| 47 |
-
uv run evals/
|
|
|
|
| 48 |
```
|
|
|
|
| 43 |
|
| 44 |
_Tracking language proficiency of AI models for every language_
|
| 45 |
|
| 46 |
+
## Evaluate
|
| 47 |
+
|
| 48 |
+
```bash
|
| 49 |
+
uv run --extra dev evals/main.py
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
## Explore
|
| 53 |
+
|
| 54 |
```bash
|
| 55 |
+
uv run evals/backend.py
|
| 56 |
+
cd frontend && npm i && npm start
|
| 57 |
```
|
evals/backend.py
CHANGED
|
@@ -23,7 +23,7 @@ def mean(lst):
|
|
| 23 |
task_metrics = [
|
| 24 |
"translation_from_bleu",
|
| 25 |
"translation_to_bleu",
|
| 26 |
-
|
| 27 |
"mmlu_accuracy",
|
| 28 |
]
|
| 29 |
|
|
|
|
| 23 |
task_metrics = [
|
| 24 |
"translation_from_bleu",
|
| 25 |
"translation_to_bleu",
|
| 26 |
+
"classification_accuracy",
|
| 27 |
"mmlu_accuracy",
|
| 28 |
]
|
| 29 |
|
evals/datasets_/flores.py
CHANGED
|
@@ -1,15 +1,19 @@
|
|
| 1 |
-
from langcodes import Language, standardize_tag
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import os
|
| 4 |
import re
|
| 5 |
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
except FileNotFoundError:
|
| 12 |
return None
|
|
|
|
|
|
|
| 13 |
|
| 14 |
def aggregate_flores_paths(flores_paths):
|
| 15 |
# takes a list of paths from the same language but different scripts
|
|
@@ -22,20 +26,15 @@ def aggregate_flores_paths(flores_paths):
|
|
| 22 |
]
|
| 23 |
return flores_paths.values[populations.index(max(populations))]
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
columns=["flores_path"],
|
| 28 |
-
)
|
| 29 |
flores["bcp_47"] = flores["flores_path"].apply(
|
| 30 |
lambda x: standardize_tag(x, macro=True),
|
| 31 |
)
|
| 32 |
# ignore script (language is language)
|
| 33 |
flores["bcp_47"] = flores["bcp_47"].apply(
|
| 34 |
-
lambda x: re.sub(r"-[A-Z][a-
|
| 35 |
)
|
| 36 |
flores = (
|
| 37 |
-
flores.groupby("bcp_47")
|
| 38 |
-
.agg({"flores_path": aggregate_flores_paths})
|
| 39 |
-
.reset_index()
|
| 40 |
)
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from datasets_.util import _get_dataset_config_names, _load_dataset
|
| 5 |
+
from langcodes import Language, standardize_tag
|
| 6 |
+
|
| 7 |
+
slug = "openlanguagedata/flores_plus"
|
| 8 |
+
splits = _get_dataset_config_names(slug)
|
| 9 |
+
splits.remove("default")
|
| 10 |
|
| 11 |
+
|
| 12 |
+
def flores_sentences(language) -> pd.DataFrame | None:
|
| 13 |
+
if language.flores_path not in splits:
|
|
|
|
| 14 |
return None
|
| 15 |
+
return _load_dataset(slug, subset=language.flores_path, split="dev").to_pandas()
|
| 16 |
+
|
| 17 |
|
| 18 |
def aggregate_flores_paths(flores_paths):
|
| 19 |
# takes a list of paths from the same language but different scripts
|
|
|
|
| 26 |
]
|
| 27 |
return flores_paths.values[populations.index(max(populations))]
|
| 28 |
|
| 29 |
+
|
| 30 |
+
flores = pd.DataFrame(splits, columns=["flores_path"])
|
|
|
|
|
|
|
| 31 |
flores["bcp_47"] = flores["flores_path"].apply(
|
| 32 |
lambda x: standardize_tag(x, macro=True),
|
| 33 |
)
|
| 34 |
# ignore script (language is language)
|
| 35 |
flores["bcp_47"] = flores["bcp_47"].apply(
|
| 36 |
+
lambda x: re.sub(r"-[A-Z][a-z0-9\-]+$", "", x)
|
| 37 |
)
|
| 38 |
flores = (
|
| 39 |
+
flores.groupby("bcp_47").agg({"flores_path": aggregate_flores_paths}).reset_index()
|
|
|
|
|
|
|
| 40 |
)
|
|
|
evals/download_data.py
CHANGED
|
@@ -24,9 +24,6 @@ DATA_DIR = project_root / "data"
|
|
| 24 |
FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
|
| 25 |
FLEURS_TARGET_DIR = DATA_DIR / "fleurs"
|
| 26 |
|
| 27 |
-
FLORES_PLUS_HF_ID = "openlanguagedata/flores_plus"
|
| 28 |
-
FLORES_TARGET_DIR = DATA_DIR / "floresp-v2.0-rc.3" / "dev_parquet" # Note: Saving as parquet
|
| 29 |
-
|
| 30 |
GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip" # Assumed direct link from https://glottolog.org/meta/downloads
|
| 31 |
GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
|
| 32 |
GLOTTOLOG_CSV_NAME = "languoid.csv"
|
|
@@ -142,37 +139,6 @@ def download_fleurs_data():
|
|
| 142 |
else:
|
| 143 |
print(f"Found extracted audio: {audio_extracted_marker}")
|
| 144 |
|
| 145 |
-
def download_flores_plus_data():
|
| 146 |
-
"""Downloads Flores+ data using Hugging Face datasets library."""
|
| 147 |
-
print("\n--- Downloading Flores+ Data (requires HF login & accepted terms) ---")
|
| 148 |
-
FLORES_TARGET_DIR.mkdir(parents=True, exist_ok=True)
|
| 149 |
-
|
| 150 |
-
try:
|
| 151 |
-
# Check login status first
|
| 152 |
-
token = huggingface_hub.HfFolder.get_token()
|
| 153 |
-
if not token:
|
| 154 |
-
print("Hugging Face token not found. Please log in using `huggingface-cli login`.")
|
| 155 |
-
print("You also need to accept the terms for 'openlanguagedata/flores_plus' on the HF website.")
|
| 156 |
-
return
|
| 157 |
-
|
| 158 |
-
print(f"Attempting to download '{FLORES_PLUS_HF_ID}' (dev split)...")
|
| 159 |
-
# Load only the 'dev' split
|
| 160 |
-
ds = load_dataset(FLORES_PLUS_HF_ID, split='dev', verification_mode='no_checks')
|
| 161 |
-
|
| 162 |
-
# Save as parquet files, potentially one per language if needed later
|
| 163 |
-
# For simplicity now, save the whole dev split as one parquet file
|
| 164 |
-
target_file = FLORES_TARGET_DIR / "dev_split.parquet"
|
| 165 |
-
print(f"Saving dev split to {target_file}...")
|
| 166 |
-
ds.to_parquet(target_file)
|
| 167 |
-
print("Flores+ dev split downloaded and saved as parquet.")
|
| 168 |
-
|
| 169 |
-
except huggingface_hub.utils.GatedRepoError:
|
| 170 |
-
print(f"Error: Access to '{FLORES_PLUS_HF_ID}' is gated.")
|
| 171 |
-
print("Please ensure you are logged in (`huggingface-cli login`) and have accepted the terms ")
|
| 172 |
-
print(f"on the dataset page: https://huggingface.co/datasets/{FLORES_PLUS_HF_ID}")
|
| 173 |
-
except Exception as e:
|
| 174 |
-
print(f"An error occurred downloading or saving Flores+: {e}")
|
| 175 |
-
|
| 176 |
|
| 177 |
def download_glottolog_data():
|
| 178 |
"""Downloads and extracts Glottolog languoid CSV."""
|
|
@@ -227,53 +193,6 @@ def download_spbleu_data():
|
|
| 227 |
else:
|
| 228 |
print(f"Found: {target_dict_file}")
|
| 229 |
|
| 230 |
-
# --- Conversion Function ---
|
| 231 |
-
|
| 232 |
-
def convert_flores_parquet_to_text():
|
| 233 |
-
"""Converts the downloaded Flores+ parquet dev split to text files."""
|
| 234 |
-
print("\n--- Converting Flores+ Parquet to Text Files ---")
|
| 235 |
-
parquet_file = FLORES_TARGET_DIR / "dev_split.parquet"
|
| 236 |
-
text_dir = project_root / "data" / "floresp-v2.0-rc.3" / "dev" # Original expected dir
|
| 237 |
-
|
| 238 |
-
if not parquet_file.exists():
|
| 239 |
-
print(f"Parquet file not found: {parquet_file}. Skipping conversion.")
|
| 240 |
-
return
|
| 241 |
-
|
| 242 |
-
try:
|
| 243 |
-
print(f"Reading parquet file: {parquet_file}")
|
| 244 |
-
df = pd.read_parquet(parquet_file)
|
| 245 |
-
print(f"Read {len(df)} rows from parquet.")
|
| 246 |
-
|
| 247 |
-
if not all(col in df.columns for col in ['iso_639_3', 'iso_15924', 'text']):
|
| 248 |
-
print("Error: Parquet file missing required columns (iso_639_3, iso_15924, text).")
|
| 249 |
-
return
|
| 250 |
-
|
| 251 |
-
text_dir.mkdir(parents=True, exist_ok=True)
|
| 252 |
-
print(f"Target directory for text files: {text_dir}")
|
| 253 |
-
|
| 254 |
-
# Group by language and script to create individual files
|
| 255 |
-
grouped = df.groupby(['iso_639_3', 'iso_15924'])
|
| 256 |
-
count = 0
|
| 257 |
-
for (lang, script), group in grouped:
|
| 258 |
-
target_filename = f"dev.{lang}_{script}"
|
| 259 |
-
target_path = text_dir / target_filename
|
| 260 |
-
print(f"Writing {len(group)} sentences to {target_path}...")
|
| 261 |
-
try:
|
| 262 |
-
with open(target_path, 'w', encoding='utf-8') as f:
|
| 263 |
-
for sentence in group['text']:
|
| 264 |
-
f.write(sentence + '\n')
|
| 265 |
-
count += 1
|
| 266 |
-
except Exception as e:
|
| 267 |
-
print(f"Error writing file {target_path}: {e}")
|
| 268 |
-
|
| 269 |
-
print(f"Successfully wrote {count} language/script files to {text_dir}.")
|
| 270 |
-
|
| 271 |
-
except ImportError:
|
| 272 |
-
print("Error: pandas or pyarrow might be missing. Cannot read parquet.")
|
| 273 |
-
print("Please install them: pip install pandas pyarrow")
|
| 274 |
-
except Exception as e:
|
| 275 |
-
print(f"An error occurred during parquet conversion: {e}")
|
| 276 |
-
|
| 277 |
|
| 278 |
# --- Main Execution ---
|
| 279 |
|
|
@@ -282,8 +201,6 @@ def main():
|
|
| 282 |
print("Starting data download process...")
|
| 283 |
DATA_DIR.mkdir(exist_ok=True)
|
| 284 |
|
| 285 |
-
download_flores_plus_data()
|
| 286 |
-
convert_flores_parquet_to_text()
|
| 287 |
#download_fleurs_data()
|
| 288 |
download_glottolog_data()
|
| 289 |
download_scriptcodes_data()
|
|
|
|
| 24 |
FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
|
| 25 |
FLEURS_TARGET_DIR = DATA_DIR / "fleurs"
|
| 26 |
|
|
|
|
|
|
|
|
|
|
| 27 |
GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip" # Assumed direct link from https://glottolog.org/meta/downloads
|
| 28 |
GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
|
| 29 |
GLOTTOLOG_CSV_NAME = "languoid.csv"
|
|
|
|
| 139 |
else:
|
| 140 |
print(f"Found extracted audio: {audio_extracted_marker}")
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
def download_glottolog_data():
|
| 144 |
"""Downloads and extracts Glottolog languoid CSV."""
|
|
|
|
| 193 |
else:
|
| 194 |
print(f"Found: {target_dict_file}")
|
| 195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
# --- Main Execution ---
|
| 198 |
|
|
|
|
| 201 |
print("Starting data download process...")
|
| 202 |
DATA_DIR.mkdir(exist_ok=True)
|
| 203 |
|
|
|
|
|
|
|
| 204 |
#download_fleurs_data()
|
| 205 |
download_glottolog_data()
|
| 206 |
download_scriptcodes_data()
|
evals/main.py
CHANGED
|
@@ -9,8 +9,8 @@ from tqdm.asyncio import tqdm_asyncio
|
|
| 9 |
# ===== config =====
|
| 10 |
|
| 11 |
n_sentences = 10
|
| 12 |
-
n_languages =
|
| 13 |
-
n_models =
|
| 14 |
|
| 15 |
# ===== run evaluation and aggregate results =====
|
| 16 |
|
|
@@ -31,9 +31,8 @@ async def evaluate():
|
|
| 31 |
]
|
| 32 |
# filter out combinations that have already been evaluated
|
| 33 |
combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
|
| 34 |
-
combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
|
| 35 |
-
combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
|
| 36 |
-
print(combis["model"].unique())
|
| 37 |
# run evaluations
|
| 38 |
results = [
|
| 39 |
tasks[task_name](model, bcp_47, i)
|
|
@@ -51,7 +50,7 @@ async def evaluate():
|
|
| 51 |
.reset_index()
|
| 52 |
)
|
| 53 |
# save results
|
| 54 |
-
results = pd.concat([old_results, results])
|
| 55 |
results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
|
| 56 |
results.to_json("results.json", **args)
|
| 57 |
|
|
|
|
| 9 |
# ===== config =====
|
| 10 |
|
| 11 |
n_sentences = 10
|
| 12 |
+
n_languages = 10
|
| 13 |
+
n_models = 10
|
| 14 |
|
| 15 |
# ===== run evaluation and aggregate results =====
|
| 16 |
|
|
|
|
| 31 |
]
|
| 32 |
# filter out combinations that have already been evaluated
|
| 33 |
combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
|
| 34 |
+
# combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
|
| 35 |
+
# combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
|
|
|
|
| 36 |
# run evaluations
|
| 37 |
results = [
|
| 38 |
tasks[task_name](model, bcp_47, i)
|
|
|
|
| 50 |
.reset_index()
|
| 51 |
)
|
| 52 |
# save results
|
| 53 |
+
# results = pd.concat([old_results, results])
|
| 54 |
results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
|
| 55 |
results.to_json("results.json", **args)
|
| 56 |
|
evals/models.py
CHANGED
|
@@ -12,6 +12,7 @@ from huggingface_hub import AsyncInferenceClient, HfApi
|
|
| 12 |
from joblib.memory import Memory
|
| 13 |
from openai import AsyncOpenAI
|
| 14 |
from requests import HTTPError, get
|
|
|
|
| 15 |
|
| 16 |
# for development purposes, all languages will be evaluated on the fast models
|
| 17 |
# and only a sample of languages will be evaluated on all models
|
|
@@ -111,11 +112,17 @@ huggingface_rate_limit = AsyncLimiter(max_rate=5, time_period=1)
|
|
| 111 |
@cache
|
| 112 |
async def complete(**kwargs):
|
| 113 |
async with openrouter_rate_limit:
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
if not response.choices:
|
| 116 |
raise Exception(response)
|
| 117 |
-
return response
|
| 118 |
-
|
| 119 |
|
| 120 |
@cache
|
| 121 |
async def transcribe_elevenlabs(path, model):
|
|
@@ -199,12 +206,13 @@ def get_cost(row):
|
|
| 199 |
|
| 200 |
@cache
|
| 201 |
def load_models(date: date):
|
| 202 |
-
popular_models = (
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
)
|
| 206 |
-
popular_models = [m["slug"] for m in popular_models]
|
| 207 |
-
models = set(important_models + popular_models) - set(blocklist)
|
|
|
|
| 208 |
models = pd.DataFrame(sorted(list(models)), columns=["id"])
|
| 209 |
or_metadata = models["id"].apply(get_or_metadata)
|
| 210 |
hf_metadata = or_metadata.apply(get_hf_metadata)
|
|
|
|
| 12 |
from joblib.memory import Memory
|
| 13 |
from openai import AsyncOpenAI
|
| 14 |
from requests import HTTPError, get
|
| 15 |
+
from openai import PermissionDeniedError
|
| 16 |
|
| 17 |
# for development purposes, all languages will be evaluated on the fast models
|
| 18 |
# and only a sample of languages will be evaluated on all models
|
|
|
|
| 112 |
@cache
|
| 113 |
async def complete(**kwargs):
|
| 114 |
async with openrouter_rate_limit:
|
| 115 |
+
try:
|
| 116 |
+
response = await client.chat.completions.create(**kwargs)
|
| 117 |
+
except PermissionDeniedError as e:
|
| 118 |
+
if e["error"]["metadata"]["reason"] in ["violence", "hate", "sexual", "self-harm", "harassment"]:
|
| 119 |
+
print(e)
|
| 120 |
+
return None
|
| 121 |
+
else:
|
| 122 |
+
raise e
|
| 123 |
if not response.choices:
|
| 124 |
raise Exception(response)
|
| 125 |
+
return response.choices[0].message.content.strip()
|
|
|
|
| 126 |
|
| 127 |
@cache
|
| 128 |
async def transcribe_elevenlabs(path, model):
|
|
|
|
| 206 |
|
| 207 |
@cache
|
| 208 |
def load_models(date: date):
|
| 209 |
+
# popular_models = (
|
| 210 |
+
# get_historical_popular_models(date.today())[:15]
|
| 211 |
+
# + get_current_popular_models(date.today())[:15]
|
| 212 |
+
# )
|
| 213 |
+
# popular_models = [m["slug"] for m in popular_models]
|
| 214 |
+
# models = set(important_models + popular_models) - set(blocklist)
|
| 215 |
+
models = set(important_models) - set(blocklist)
|
| 216 |
models = pd.DataFrame(sorted(list(models)), columns=["id"])
|
| 217 |
or_metadata = models["id"].apply(get_or_metadata)
|
| 218 |
hf_metadata = or_metadata.apply(get_hf_metadata)
|
evals/tasks.py
CHANGED
|
@@ -30,12 +30,12 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
|
|
| 30 |
pass
|
| 31 |
case "to":
|
| 32 |
original_language, target_language = target_language, original_language
|
| 33 |
-
if
|
| 34 |
return []
|
| 35 |
-
original_sentence = flores_sentences(original_language)[sentence_nr].strip()
|
| 36 |
-
target_sentence = flores_sentences(target_language)[sentence_nr].strip()
|
| 37 |
script = script_name(target_language.flores_path.split("_")[1])
|
| 38 |
-
|
| 39 |
model=model,
|
| 40 |
messages=[
|
| 41 |
{
|
|
@@ -46,8 +46,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
|
|
| 46 |
temperature=0,
|
| 47 |
max_tokens=1024,
|
| 48 |
)
|
| 49 |
-
prediction
|
| 50 |
-
if prediction.strip():
|
| 51 |
bleu_score = bleu.compute(
|
| 52 |
predictions=[prediction],
|
| 53 |
references=[target_sentence],
|
|
@@ -71,21 +70,15 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
|
|
| 71 |
)
|
| 72 |
]
|
| 73 |
|
| 74 |
-
|
| 75 |
-
# metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
|
| 76 |
-
|
| 77 |
-
|
| 78 |
async def classify_and_evaluate(model, bcp_47, nr):
|
| 79 |
language = languages[languages["bcp_47"] == bcp_47].iloc[0]
|
| 80 |
sentences = flores_sentences(language)
|
| 81 |
-
if
|
| 82 |
return []
|
| 83 |
-
sentences = pd.DataFrame(sentences, columns=["text"])
|
| 84 |
-
sentences = pd.concat([metadata, sentences], axis=1)
|
| 85 |
sentences = sentences.dropna(subset=["topic"])
|
| 86 |
sentences["topic"] = sentences["topic"].str.lower()
|
| 87 |
paragraphs = (
|
| 88 |
-
sentences.groupby("
|
| 89 |
)
|
| 90 |
top_topics = paragraphs.value_counts("topic").head(5).index
|
| 91 |
paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
|
|
@@ -95,7 +88,7 @@ async def classify_and_evaluate(model, bcp_47, nr):
|
|
| 95 |
for t in top_topics
|
| 96 |
]
|
| 97 |
).sample(frac=1, random_state=nr)
|
| 98 |
-
test_paragraphs = paragraphs[~paragraphs["
|
| 99 |
frac=1, random_state=42
|
| 100 |
)
|
| 101 |
test_paragraph = test_paragraphs.iloc[nr]
|
|
@@ -112,7 +105,7 @@ async def classify_and_evaluate(model, bcp_47, nr):
|
|
| 112 |
# some models have poor tokenization for some languages, and the prompt for this task is relatively long, so it sometimes exceeds the context window
|
| 113 |
# this is not just to blame on the context window but mostly on the model's tokenization, so we assign 0 accuracy in this case
|
| 114 |
try:
|
| 115 |
-
|
| 116 |
model=model,
|
| 117 |
messages=[
|
| 118 |
*messages,
|
|
@@ -124,12 +117,11 @@ async def classify_and_evaluate(model, bcp_47, nr):
|
|
| 124 |
temperature=0,
|
| 125 |
max_tokens=30,
|
| 126 |
)
|
| 127 |
-
response = reply.choices[0].message.content.strip().lower()
|
| 128 |
true = test_paragraph.topic
|
| 129 |
others = [t for t in top_topics if t != true]
|
| 130 |
acc = int(
|
| 131 |
-
|
| 132 |
-
or (true in
|
| 133 |
)
|
| 134 |
except Exception as e:
|
| 135 |
if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
|
|
@@ -160,7 +152,7 @@ def corrupt_sentence(sentence):
|
|
| 160 |
async def mlm_and_evaluate(model, language_bcp_47, nr):
|
| 161 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
| 162 |
sentences = flores_sentences(language)
|
| 163 |
-
if
|
| 164 |
return []
|
| 165 |
sentences = pd.DataFrame(sentences, columns=["text"])
|
| 166 |
sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
|
|
@@ -175,7 +167,7 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
|
|
| 175 |
{"role": "user", "content": example.corrupt_text},
|
| 176 |
{"role": "assistant", "content": example.text},
|
| 177 |
]
|
| 178 |
-
|
| 179 |
model=model,
|
| 180 |
messages=[
|
| 181 |
*messages,
|
|
@@ -187,7 +179,6 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
|
|
| 187 |
temperature=0,
|
| 188 |
max_tokens=1024,
|
| 189 |
)
|
| 190 |
-
prediction = reply.choices[0].message.content.strip()
|
| 191 |
chrf_score = chrf.compute(predictions=[prediction], references=[test_sentence.text])
|
| 192 |
return [
|
| 193 |
{
|
|
@@ -224,13 +215,13 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
|
|
| 224 |
]
|
| 225 |
messages += [{"role": "user", "content": format_item(task)}]
|
| 226 |
try:
|
| 227 |
-
|
| 228 |
model=model,
|
| 229 |
messages=messages,
|
| 230 |
temperature=0,
|
| 231 |
max_tokens=1,
|
| 232 |
)
|
| 233 |
-
acc = int(
|
| 234 |
except Exception as e:
|
| 235 |
if "ResponsibleAIPolicyViolation" in str(e):
|
| 236 |
acc = 0
|
|
@@ -282,7 +273,7 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
|
|
| 282 |
tasks = {
|
| 283 |
"translation_from": partial(translate_and_evaluate, mode="from"),
|
| 284 |
"translation_to": partial(translate_and_evaluate, mode="to"),
|
| 285 |
-
|
| 286 |
# "mlm": mlm_and_evaluate,
|
| 287 |
"mmlu": mmlu_and_evaluate,
|
| 288 |
# "asr": transcribe_and_evaluate,
|
|
|
|
| 30 |
pass
|
| 31 |
case "to":
|
| 32 |
original_language, target_language = target_language, original_language
|
| 33 |
+
if flores_sentences(original_language) is None or flores_sentences(target_language) is None:
|
| 34 |
return []
|
| 35 |
+
original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
|
| 36 |
+
target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
|
| 37 |
script = script_name(target_language.flores_path.split("_")[1])
|
| 38 |
+
prediction = await complete(
|
| 39 |
model=model,
|
| 40 |
messages=[
|
| 41 |
{
|
|
|
|
| 46 |
temperature=0,
|
| 47 |
max_tokens=1024,
|
| 48 |
)
|
| 49 |
+
if prediction:
|
|
|
|
| 50 |
bleu_score = bleu.compute(
|
| 51 |
predictions=[prediction],
|
| 52 |
references=[target_sentence],
|
|
|
|
| 70 |
)
|
| 71 |
]
|
| 72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
async def classify_and_evaluate(model, bcp_47, nr):
|
| 74 |
language = languages[languages["bcp_47"] == bcp_47].iloc[0]
|
| 75 |
sentences = flores_sentences(language)
|
| 76 |
+
if sentences is None:
|
| 77 |
return []
|
|
|
|
|
|
|
| 78 |
sentences = sentences.dropna(subset=["topic"])
|
| 79 |
sentences["topic"] = sentences["topic"].str.lower()
|
| 80 |
paragraphs = (
|
| 81 |
+
sentences.groupby("url").agg({"text": " ".join, "topic": "first"}).reset_index()
|
| 82 |
)
|
| 83 |
top_topics = paragraphs.value_counts("topic").head(5).index
|
| 84 |
paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
|
|
|
|
| 88 |
for t in top_topics
|
| 89 |
]
|
| 90 |
).sample(frac=1, random_state=nr)
|
| 91 |
+
test_paragraphs = paragraphs[~paragraphs["url"].isin(examples["url"])].sample(
|
| 92 |
frac=1, random_state=42
|
| 93 |
)
|
| 94 |
test_paragraph = test_paragraphs.iloc[nr]
|
|
|
|
| 105 |
# some models have poor tokenization for some languages, and the prompt for this task is relatively long, so it sometimes exceeds the context window
|
| 106 |
# this is not just to blame on the context window but mostly on the model's tokenization, so we assign 0 accuracy in this case
|
| 107 |
try:
|
| 108 |
+
pred = await complete(
|
| 109 |
model=model,
|
| 110 |
messages=[
|
| 111 |
*messages,
|
|
|
|
| 117 |
temperature=0,
|
| 118 |
max_tokens=30,
|
| 119 |
)
|
|
|
|
| 120 |
true = test_paragraph.topic
|
| 121 |
others = [t for t in top_topics if t != true]
|
| 122 |
acc = int(
|
| 123 |
+
pred.startswith(true)
|
| 124 |
+
or (true in pred and not any(o in pred for o in others))
|
| 125 |
)
|
| 126 |
except Exception as e:
|
| 127 |
if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
|
|
|
|
| 152 |
async def mlm_and_evaluate(model, language_bcp_47, nr):
|
| 153 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
| 154 |
sentences = flores_sentences(language)
|
| 155 |
+
if sentences is None:
|
| 156 |
return []
|
| 157 |
sentences = pd.DataFrame(sentences, columns=["text"])
|
| 158 |
sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
|
|
|
|
| 167 |
{"role": "user", "content": example.corrupt_text},
|
| 168 |
{"role": "assistant", "content": example.text},
|
| 169 |
]
|
| 170 |
+
prediction = await complete(
|
| 171 |
model=model,
|
| 172 |
messages=[
|
| 173 |
*messages,
|
|
|
|
| 179 |
temperature=0,
|
| 180 |
max_tokens=1024,
|
| 181 |
)
|
|
|
|
| 182 |
chrf_score = chrf.compute(predictions=[prediction], references=[test_sentence.text])
|
| 183 |
return [
|
| 184 |
{
|
|
|
|
| 215 |
]
|
| 216 |
messages += [{"role": "user", "content": format_item(task)}]
|
| 217 |
try:
|
| 218 |
+
response = await complete(
|
| 219 |
model=model,
|
| 220 |
messages=messages,
|
| 221 |
temperature=0,
|
| 222 |
max_tokens=1,
|
| 223 |
)
|
| 224 |
+
acc = int(response[:1].strip() == task["answer"])
|
| 225 |
except Exception as e:
|
| 226 |
if "ResponsibleAIPolicyViolation" in str(e):
|
| 227 |
acc = 0
|
|
|
|
| 273 |
tasks = {
|
| 274 |
"translation_from": partial(translate_and_evaluate, mode="from"),
|
| 275 |
"translation_to": partial(translate_and_evaluate, mode="to"),
|
| 276 |
+
"classification": classify_and_evaluate,
|
| 277 |
# "mlm": mlm_and_evaluate,
|
| 278 |
"mmlu": mmlu_and_evaluate,
|
| 279 |
# "asr": transcribe_and_evaluate,
|
languages.json
CHANGED
|
@@ -485,7 +485,7 @@
|
|
| 485 |
"language_name":"North Levantine Arabic",
|
| 486 |
"autonym":"العامية",
|
| 487 |
"family":"Afro-Asiatic",
|
| 488 |
-
"flores_path":"
|
| 489 |
"fleurs_tag":null,
|
| 490 |
"commonvoice_hours":null,
|
| 491 |
"commonvoice_locale":null,
|
|
@@ -876,10 +876,10 @@
|
|
| 876 |
"in_benchmark":true
|
| 877 |
},
|
| 878 |
{
|
| 879 |
-
"bcp_47":"
|
| 880 |
"speakers":15913080,
|
| 881 |
-
"language_name":"
|
| 882 |
-
"autonym":"
|
| 883 |
"family":"Indo-European",
|
| 884 |
"flores_path":null,
|
| 885 |
"fleurs_tag":null,
|
|
@@ -888,10 +888,10 @@
|
|
| 888 |
"in_benchmark":false
|
| 889 |
},
|
| 890 |
{
|
| 891 |
-
"bcp_47":"
|
| 892 |
"speakers":15913080,
|
| 893 |
-
"language_name":"
|
| 894 |
-
"autonym":"
|
| 895 |
"family":"Indo-European",
|
| 896 |
"flores_path":null,
|
| 897 |
"fleurs_tag":null,
|
|
@@ -1073,7 +1073,7 @@
|
|
| 1073 |
"language_name":"Akan",
|
| 1074 |
"autonym":"Akan",
|
| 1075 |
"family":"Atlantic-Congo",
|
| 1076 |
-
"flores_path":"
|
| 1077 |
"fleurs_tag":null,
|
| 1078 |
"commonvoice_hours":0.2,
|
| 1079 |
"commonvoice_locale":"tw",
|
|
@@ -1171,7 +1171,7 @@
|
|
| 1171 |
"family":"Afro-Asiatic",
|
| 1172 |
"flores_path":"tir_Ethi",
|
| 1173 |
"fleurs_tag":null,
|
| 1174 |
-
"commonvoice_hours":0.
|
| 1175 |
"commonvoice_locale":"ti",
|
| 1176 |
"in_benchmark":true
|
| 1177 |
},
|
|
@@ -1195,7 +1195,7 @@
|
|
| 1195 |
"family":"Atlantic-Congo",
|
| 1196 |
"flores_path":"lua_Latn",
|
| 1197 |
"fleurs_tag":null,
|
| 1198 |
-
"commonvoice_hours":
|
| 1199 |
"commonvoice_locale":"lua",
|
| 1200 |
"in_benchmark":true
|
| 1201 |
},
|
|
@@ -1955,18 +1955,6 @@
|
|
| 1955 |
"commonvoice_locale":"gom",
|
| 1956 |
"in_benchmark":true
|
| 1957 |
},
|
| 1958 |
-
{
|
| 1959 |
-
"bcp_47":"kln",
|
| 1960 |
-
"speakers":4068120,
|
| 1961 |
-
"language_name":"Kalenjin",
|
| 1962 |
-
"autonym":"Kalenjin",
|
| 1963 |
-
"family":"Nilotic",
|
| 1964 |
-
"flores_path":null,
|
| 1965 |
-
"fleurs_tag":null,
|
| 1966 |
-
"commonvoice_hours":43.0,
|
| 1967 |
-
"commonvoice_locale":"kln",
|
| 1968 |
-
"in_benchmark":false
|
| 1969 |
-
},
|
| 1970 |
{
|
| 1971 |
"bcp_47":"kam",
|
| 1972 |
"speakers":4068120,
|
|
@@ -1979,6 +1967,18 @@
|
|
| 1979 |
"commonvoice_locale":"kam",
|
| 1980 |
"in_benchmark":true
|
| 1981 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1982 |
{
|
| 1983 |
"bcp_47":"bjn",
|
| 1984 |
"speakers":4010288,
|
|
@@ -2124,10 +2124,10 @@
|
|
| 2124 |
"in_benchmark":true
|
| 2125 |
},
|
| 2126 |
{
|
| 2127 |
-
"bcp_47":"
|
| 2128 |
"speakers":3580443,
|
| 2129 |
-
"language_name":"
|
| 2130 |
-
"autonym":"
|
| 2131 |
"family":"Indo-European",
|
| 2132 |
"flores_path":null,
|
| 2133 |
"fleurs_tag":null,
|
|
@@ -2136,10 +2136,10 @@
|
|
| 2136 |
"in_benchmark":false
|
| 2137 |
},
|
| 2138 |
{
|
| 2139 |
-
"bcp_47":"
|
| 2140 |
"speakers":3580443,
|
| 2141 |
-
"language_name":"
|
| 2142 |
-
"autonym":"
|
| 2143 |
"family":"Indo-European",
|
| 2144 |
"flores_path":null,
|
| 2145 |
"fleurs_tag":null,
|
|
@@ -2352,27 +2352,27 @@
|
|
| 2352 |
"in_benchmark":true
|
| 2353 |
},
|
| 2354 |
{
|
| 2355 |
-
"bcp_47":"
|
| 2356 |
"speakers":2996392,
|
| 2357 |
-
"language_name":"
|
| 2358 |
-
"autonym":"
|
| 2359 |
"family":"Atlantic-Congo",
|
| 2360 |
"flores_path":null,
|
| 2361 |
"fleurs_tag":null,
|
| 2362 |
-
"commonvoice_hours":
|
| 2363 |
-
"commonvoice_locale":
|
| 2364 |
"in_benchmark":false
|
| 2365 |
},
|
| 2366 |
{
|
| 2367 |
-
"bcp_47":"
|
| 2368 |
"speakers":2996392,
|
| 2369 |
-
"language_name":"
|
| 2370 |
-
"autonym":"
|
| 2371 |
"family":"Atlantic-Congo",
|
| 2372 |
"flores_path":null,
|
| 2373 |
"fleurs_tag":null,
|
| 2374 |
-
"commonvoice_hours":
|
| 2375 |
-
"commonvoice_locale":
|
| 2376 |
"in_benchmark":false
|
| 2377 |
},
|
| 2378 |
{
|
|
@@ -2544,11 +2544,11 @@
|
|
| 2544 |
"in_benchmark":false
|
| 2545 |
},
|
| 2546 |
{
|
| 2547 |
-
"bcp_47":"
|
| 2548 |
"speakers":2386962,
|
| 2549 |
-
"language_name":"
|
| 2550 |
-
"autonym":"
|
| 2551 |
-
"family":"
|
| 2552 |
"flores_path":null,
|
| 2553 |
"fleurs_tag":null,
|
| 2554 |
"commonvoice_hours":null,
|
|
@@ -2556,11 +2556,11 @@
|
|
| 2556 |
"in_benchmark":false
|
| 2557 |
},
|
| 2558 |
{
|
| 2559 |
-
"bcp_47":"
|
| 2560 |
"speakers":2386962,
|
| 2561 |
-
"language_name":"
|
| 2562 |
-
"autonym":"
|
| 2563 |
-
"family":"
|
| 2564 |
"flores_path":null,
|
| 2565 |
"fleurs_tag":null,
|
| 2566 |
"commonvoice_hours":null,
|
|
@@ -2724,10 +2724,10 @@
|
|
| 2724 |
"in_benchmark":false
|
| 2725 |
},
|
| 2726 |
{
|
| 2727 |
-
"bcp_47":"
|
| 2728 |
"speakers":1989135,
|
| 2729 |
-
"language_name":"
|
| 2730 |
-
"autonym":"
|
| 2731 |
"family":"Indo-European",
|
| 2732 |
"flores_path":null,
|
| 2733 |
"fleurs_tag":null,
|
|
@@ -2748,10 +2748,10 @@
|
|
| 2748 |
"in_benchmark":false
|
| 2749 |
},
|
| 2750 |
{
|
| 2751 |
-
"bcp_47":"
|
| 2752 |
"speakers":1989135,
|
| 2753 |
-
"language_name":"
|
| 2754 |
-
"autonym":"
|
| 2755 |
"family":"Indo-European",
|
| 2756 |
"flores_path":null,
|
| 2757 |
"fleurs_tag":null,
|
|
@@ -3535,7 +3535,7 @@
|
|
| 3535 |
"family":null,
|
| 3536 |
"flores_path":"eus_Latn",
|
| 3537 |
"fleurs_tag":null,
|
| 3538 |
-
"commonvoice_hours":
|
| 3539 |
"commonvoice_locale":"eu",
|
| 3540 |
"in_benchmark":true
|
| 3541 |
},
|
|
@@ -3559,7 +3559,7 @@
|
|
| 3559 |
"family":"Abkhaz-Adyge",
|
| 3560 |
"flores_path":null,
|
| 3561 |
"fleurs_tag":null,
|
| 3562 |
-
"commonvoice_hours":
|
| 3563 |
"commonvoice_locale":"kbd",
|
| 3564 |
"in_benchmark":false
|
| 3565 |
},
|
|
@@ -3684,10 +3684,10 @@
|
|
| 3684 |
"in_benchmark":true
|
| 3685 |
},
|
| 3686 |
{
|
| 3687 |
-
"bcp_47":"
|
| 3688 |
"speakers":995398,
|
| 3689 |
-
"language_name":"
|
| 3690 |
-
"autonym":"
|
| 3691 |
"family":"Atlantic-Congo",
|
| 3692 |
"flores_path":null,
|
| 3693 |
"fleurs_tag":null,
|
|
@@ -3696,10 +3696,10 @@
|
|
| 3696 |
"in_benchmark":false
|
| 3697 |
},
|
| 3698 |
{
|
| 3699 |
-
"bcp_47":"
|
| 3700 |
"speakers":995398,
|
| 3701 |
-
"language_name":"
|
| 3702 |
-
"autonym":"
|
| 3703 |
"family":"Atlantic-Congo",
|
| 3704 |
"flores_path":null,
|
| 3705 |
"fleurs_tag":null,
|
|
@@ -4512,27 +4512,27 @@
|
|
| 4512 |
"in_benchmark":false
|
| 4513 |
},
|
| 4514 |
{
|
| 4515 |
-
"bcp_47":"
|
| 4516 |
"speakers":501735,
|
| 4517 |
-
"language_name":"
|
| 4518 |
-
"autonym":"
|
| 4519 |
"family":"Uto-Aztecan",
|
| 4520 |
"flores_path":null,
|
| 4521 |
"fleurs_tag":null,
|
| 4522 |
-
"commonvoice_hours":
|
| 4523 |
-
"commonvoice_locale":
|
| 4524 |
"in_benchmark":false
|
| 4525 |
},
|
| 4526 |
{
|
| 4527 |
-
"bcp_47":"
|
| 4528 |
"speakers":501735,
|
| 4529 |
-
"language_name":"
|
| 4530 |
-
"autonym":"
|
| 4531 |
"family":"Uto-Aztecan",
|
| 4532 |
"flores_path":null,
|
| 4533 |
"fleurs_tag":null,
|
| 4534 |
-
"commonvoice_hours":
|
| 4535 |
-
"commonvoice_locale":
|
| 4536 |
"in_benchmark":false
|
| 4537 |
},
|
| 4538 |
{
|
|
@@ -4553,11 +4553,11 @@
|
|
| 4553 |
"language_name":"Kara-Kalpak",
|
| 4554 |
"autonym":"Kara-Kalpak",
|
| 4555 |
"family":"Turkic",
|
| 4556 |
-
"flores_path":
|
| 4557 |
"fleurs_tag":null,
|
| 4558 |
"commonvoice_hours":0.0,
|
| 4559 |
"commonvoice_locale":"kaa",
|
| 4560 |
-
"in_benchmark":
|
| 4561 |
},
|
| 4562 |
{
|
| 4563 |
"bcp_47":"gju",
|
|
@@ -4715,18 +4715,6 @@
|
|
| 4715 |
"commonvoice_locale":null,
|
| 4716 |
"in_benchmark":false
|
| 4717 |
},
|
| 4718 |
-
{
|
| 4719 |
-
"bcp_47":"jmc",
|
| 4720 |
-
"speakers":433291,
|
| 4721 |
-
"language_name":"Machame",
|
| 4722 |
-
"autonym":"Kimachame",
|
| 4723 |
-
"family":"Atlantic-Congo",
|
| 4724 |
-
"flores_path":null,
|
| 4725 |
-
"fleurs_tag":null,
|
| 4726 |
-
"commonvoice_hours":null,
|
| 4727 |
-
"commonvoice_locale":null,
|
| 4728 |
-
"in_benchmark":false
|
| 4729 |
-
},
|
| 4730 |
{
|
| 4731 |
"bcp_47":"vun",
|
| 4732 |
"speakers":433291,
|
|
@@ -4747,10 +4735,22 @@
|
|
| 4747 |
"family":"Atlantic-Congo",
|
| 4748 |
"flores_path":null,
|
| 4749 |
"fleurs_tag":null,
|
| 4750 |
-
"commonvoice_hours":
|
| 4751 |
"commonvoice_locale":"rof",
|
| 4752 |
"in_benchmark":false
|
| 4753 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4754 |
{
|
| 4755 |
"bcp_47":"kjg",
|
| 4756 |
"speakers":431949,
|
|
@@ -5124,27 +5124,27 @@
|
|
| 5124 |
"in_benchmark":false
|
| 5125 |
},
|
| 5126 |
{
|
| 5127 |
-
"bcp_47":"
|
| 5128 |
"speakers":332940,
|
| 5129 |
-
"language_name":"
|
| 5130 |
-
"autonym":"
|
| 5131 |
"family":"Atlantic-Congo",
|
| 5132 |
"flores_path":null,
|
| 5133 |
"fleurs_tag":null,
|
| 5134 |
-
"commonvoice_hours":
|
| 5135 |
-
"commonvoice_locale":"
|
| 5136 |
"in_benchmark":false
|
| 5137 |
},
|
| 5138 |
{
|
| 5139 |
-
"bcp_47":"
|
| 5140 |
"speakers":332940,
|
| 5141 |
-
"language_name":"
|
| 5142 |
-
"autonym":"
|
| 5143 |
"family":"Atlantic-Congo",
|
| 5144 |
"flores_path":null,
|
| 5145 |
"fleurs_tag":null,
|
| 5146 |
-
"commonvoice_hours":
|
| 5147 |
-
"commonvoice_locale":"
|
| 5148 |
"in_benchmark":false
|
| 5149 |
},
|
| 5150 |
{
|
|
@@ -5232,11 +5232,11 @@
|
|
| 5232 |
"in_benchmark":false
|
| 5233 |
},
|
| 5234 |
{
|
| 5235 |
-
"bcp_47":"
|
| 5236 |
"speakers":305001,
|
| 5237 |
-
"language_name":"
|
| 5238 |
-
"autonym":"
|
| 5239 |
-
"family":"
|
| 5240 |
"flores_path":null,
|
| 5241 |
"fleurs_tag":null,
|
| 5242 |
"commonvoice_hours":null,
|
|
@@ -5244,11 +5244,11 @@
|
|
| 5244 |
"in_benchmark":false
|
| 5245 |
},
|
| 5246 |
{
|
| 5247 |
-
"bcp_47":"
|
| 5248 |
"speakers":305001,
|
| 5249 |
-
"language_name":"
|
| 5250 |
-
"autonym":"
|
| 5251 |
-
"family":"
|
| 5252 |
"flores_path":null,
|
| 5253 |
"fleurs_tag":null,
|
| 5254 |
"commonvoice_hours":null,
|
|
@@ -5388,10 +5388,10 @@
|
|
| 5388 |
"in_benchmark":false
|
| 5389 |
},
|
| 5390 |
{
|
| 5391 |
-
"bcp_47":"
|
| 5392 |
"speakers":264864,
|
| 5393 |
-
"language_name":"
|
| 5394 |
-
"autonym":"
|
| 5395 |
"family":"Tai-Kadai",
|
| 5396 |
"flores_path":null,
|
| 5397 |
"fleurs_tag":null,
|
|
@@ -5400,10 +5400,10 @@
|
|
| 5400 |
"in_benchmark":false
|
| 5401 |
},
|
| 5402 |
{
|
| 5403 |
-
"bcp_47":"
|
| 5404 |
"speakers":264864,
|
| 5405 |
-
"language_name":"
|
| 5406 |
-
"autonym":"
|
| 5407 |
"family":"Tai-Kadai",
|
| 5408 |
"flores_path":null,
|
| 5409 |
"fleurs_tag":null,
|
|
@@ -5508,10 +5508,10 @@
|
|
| 5508 |
"in_benchmark":true
|
| 5509 |
},
|
| 5510 |
{
|
| 5511 |
-
"bcp_47":"
|
| 5512 |
"speakers":245664,
|
| 5513 |
-
"language_name":"
|
| 5514 |
-
"autonym":"
|
| 5515 |
"family":"Austronesian",
|
| 5516 |
"flores_path":null,
|
| 5517 |
"fleurs_tag":null,
|
|
@@ -5520,10 +5520,10 @@
|
|
| 5520 |
"in_benchmark":false
|
| 5521 |
},
|
| 5522 |
{
|
| 5523 |
-
"bcp_47":"
|
| 5524 |
"speakers":245664,
|
| 5525 |
-
"language_name":"
|
| 5526 |
-
"autonym":"
|
| 5527 |
"family":"Austronesian",
|
| 5528 |
"flores_path":null,
|
| 5529 |
"fleurs_tag":null,
|
|
@@ -5904,10 +5904,10 @@
|
|
| 5904 |
"in_benchmark":false
|
| 5905 |
},
|
| 5906 |
{
|
| 5907 |
-
"bcp_47":"
|
| 5908 |
"speakers":149823,
|
| 5909 |
-
"language_name":"
|
| 5910 |
-
"autonym":"
|
| 5911 |
"family":"Atlantic-Congo",
|
| 5912 |
"flores_path":null,
|
| 5913 |
"fleurs_tag":null,
|
|
@@ -5916,10 +5916,10 @@
|
|
| 5916 |
"in_benchmark":false
|
| 5917 |
},
|
| 5918 |
{
|
| 5919 |
-
"bcp_47":"
|
| 5920 |
"speakers":149823,
|
| 5921 |
-
"language_name":"
|
| 5922 |
-
"autonym":"
|
| 5923 |
"family":"Atlantic-Congo",
|
| 5924 |
"flores_path":null,
|
| 5925 |
"fleurs_tag":null,
|
|
@@ -6367,7 +6367,7 @@
|
|
| 6367 |
"family":"Indo-European",
|
| 6368 |
"flores_path":null,
|
| 6369 |
"fleurs_tag":null,
|
| 6370 |
-
"commonvoice_hours":
|
| 6371 |
"commonvoice_locale":"btv",
|
| 6372 |
"in_benchmark":false
|
| 6373 |
},
|
|
@@ -7272,11 +7272,11 @@
|
|
| 7272 |
"in_benchmark":false
|
| 7273 |
},
|
| 7274 |
{
|
| 7275 |
-
"bcp_47":"
|
| 7276 |
"speakers":7970,
|
| 7277 |
-
"language_name":"
|
| 7278 |
-
"autonym":"
|
| 7279 |
-
"family":"
|
| 7280 |
"flores_path":null,
|
| 7281 |
"fleurs_tag":null,
|
| 7282 |
"commonvoice_hours":null,
|
|
@@ -7284,11 +7284,11 @@
|
|
| 7284 |
"in_benchmark":false
|
| 7285 |
},
|
| 7286 |
{
|
| 7287 |
-
"bcp_47":"
|
| 7288 |
"speakers":7970,
|
| 7289 |
-
"language_name":"
|
| 7290 |
-
"autonym":"
|
| 7291 |
-
"family":"
|
| 7292 |
"flores_path":null,
|
| 7293 |
"fleurs_tag":null,
|
| 7294 |
"commonvoice_hours":null,
|
|
@@ -7836,11 +7836,11 @@
|
|
| 7836 |
"in_benchmark":false
|
| 7837 |
},
|
| 7838 |
{
|
| 7839 |
-
"bcp_47":"
|
| 7840 |
"speakers":377,
|
| 7841 |
-
"language_name":"
|
| 7842 |
-
"autonym":"
|
| 7843 |
-
"family":"
|
| 7844 |
"flores_path":null,
|
| 7845 |
"fleurs_tag":null,
|
| 7846 |
"commonvoice_hours":null,
|
|
@@ -7848,11 +7848,11 @@
|
|
| 7848 |
"in_benchmark":false
|
| 7849 |
},
|
| 7850 |
{
|
| 7851 |
-
"bcp_47":"
|
| 7852 |
"speakers":377,
|
| 7853 |
-
"language_name":"
|
| 7854 |
-
"autonym":"
|
| 7855 |
-
"family":"
|
| 7856 |
"flores_path":null,
|
| 7857 |
"fleurs_tag":null,
|
| 7858 |
"commonvoice_hours":null,
|
|
@@ -7968,11 +7968,11 @@
|
|
| 7968 |
"in_benchmark":false
|
| 7969 |
},
|
| 7970 |
{
|
| 7971 |
-
"bcp_47":"
|
| 7972 |
"speakers":0,
|
| 7973 |
-
"language_name":"
|
| 7974 |
-
"autonym":"
|
| 7975 |
-
"family":"
|
| 7976 |
"flores_path":null,
|
| 7977 |
"fleurs_tag":null,
|
| 7978 |
"commonvoice_hours":null,
|
|
@@ -7980,11 +7980,11 @@
|
|
| 7980 |
"in_benchmark":false
|
| 7981 |
},
|
| 7982 |
{
|
| 7983 |
-
"bcp_47":"
|
| 7984 |
"speakers":0,
|
| 7985 |
-
"language_name":"
|
| 7986 |
-
"autonym":"
|
| 7987 |
-
"family":"
|
| 7988 |
"flores_path":null,
|
| 7989 |
"fleurs_tag":null,
|
| 7990 |
"commonvoice_hours":null,
|
|
@@ -7992,22 +7992,22 @@
|
|
| 7992 |
"in_benchmark":false
|
| 7993 |
},
|
| 7994 |
{
|
| 7995 |
-
"bcp_47":"
|
| 7996 |
"speakers":0,
|
| 7997 |
-
"language_name":"
|
| 7998 |
-
"autonym":"
|
| 7999 |
-
"family":"
|
| 8000 |
"flores_path":null,
|
| 8001 |
"fleurs_tag":null,
|
| 8002 |
-
"commonvoice_hours":
|
| 8003 |
-
"commonvoice_locale":
|
| 8004 |
"in_benchmark":false
|
| 8005 |
},
|
| 8006 |
{
|
| 8007 |
-
"bcp_47":"
|
| 8008 |
"speakers":0,
|
| 8009 |
-
"language_name":"
|
| 8010 |
-
"autonym":"
|
| 8011 |
"family":"Indo-European",
|
| 8012 |
"flores_path":null,
|
| 8013 |
"fleurs_tag":null,
|
|
@@ -8016,23 +8016,23 @@
|
|
| 8016 |
"in_benchmark":false
|
| 8017 |
},
|
| 8018 |
{
|
| 8019 |
-
"bcp_47":"
|
| 8020 |
"speakers":0,
|
| 8021 |
-
"language_name":"
|
| 8022 |
-
"autonym":"
|
| 8023 |
-
"family":"
|
| 8024 |
"flores_path":null,
|
| 8025 |
"fleurs_tag":null,
|
| 8026 |
-
"commonvoice_hours":
|
| 8027 |
-
"commonvoice_locale":
|
| 8028 |
"in_benchmark":false
|
| 8029 |
},
|
| 8030 |
{
|
| 8031 |
-
"bcp_47":"
|
| 8032 |
"speakers":0,
|
| 8033 |
-
"language_name":"
|
| 8034 |
-
"autonym":"
|
| 8035 |
-
"family":"
|
| 8036 |
"flores_path":null,
|
| 8037 |
"fleurs_tag":null,
|
| 8038 |
"commonvoice_hours":null,
|
|
@@ -8040,23 +8040,23 @@
|
|
| 8040 |
"in_benchmark":false
|
| 8041 |
},
|
| 8042 |
{
|
| 8043 |
-
"bcp_47":"
|
| 8044 |
"speakers":0,
|
| 8045 |
-
"language_name":"
|
| 8046 |
-
"autonym":"
|
| 8047 |
-
"family":"
|
| 8048 |
"flores_path":null,
|
| 8049 |
"fleurs_tag":null,
|
| 8050 |
-
"commonvoice_hours":
|
| 8051 |
-
"commonvoice_locale":
|
| 8052 |
"in_benchmark":false
|
| 8053 |
},
|
| 8054 |
{
|
| 8055 |
-
"bcp_47":"
|
| 8056 |
"speakers":0,
|
| 8057 |
-
"language_name":"
|
| 8058 |
-
"autonym":"
|
| 8059 |
-
"family":"
|
| 8060 |
"flores_path":null,
|
| 8061 |
"fleurs_tag":null,
|
| 8062 |
"commonvoice_hours":null,
|
|
@@ -8064,10 +8064,10 @@
|
|
| 8064 |
"in_benchmark":false
|
| 8065 |
},
|
| 8066 |
{
|
| 8067 |
-
"bcp_47":"
|
| 8068 |
"speakers":0,
|
| 8069 |
-
"language_name":"
|
| 8070 |
-
"autonym":"
|
| 8071 |
"family":"Indo-European",
|
| 8072 |
"flores_path":null,
|
| 8073 |
"fleurs_tag":null,
|
|
@@ -8076,11 +8076,11 @@
|
|
| 8076 |
"in_benchmark":false
|
| 8077 |
},
|
| 8078 |
{
|
| 8079 |
-
"bcp_47":"
|
| 8080 |
"speakers":0,
|
| 8081 |
-
"language_name":"
|
| 8082 |
-
"autonym":"
|
| 8083 |
-
"family":"
|
| 8084 |
"flores_path":null,
|
| 8085 |
"fleurs_tag":null,
|
| 8086 |
"commonvoice_hours":null,
|
|
@@ -8088,11 +8088,11 @@
|
|
| 8088 |
"in_benchmark":false
|
| 8089 |
},
|
| 8090 |
{
|
| 8091 |
-
"bcp_47":"
|
| 8092 |
"speakers":0,
|
| 8093 |
-
"language_name":"
|
| 8094 |
-
"autonym":"
|
| 8095 |
-
"family":"
|
| 8096 |
"flores_path":null,
|
| 8097 |
"fleurs_tag":null,
|
| 8098 |
"commonvoice_hours":null,
|
|
@@ -8112,23 +8112,23 @@
|
|
| 8112 |
"in_benchmark":false
|
| 8113 |
},
|
| 8114 |
{
|
| 8115 |
-
"bcp_47":"
|
| 8116 |
"speakers":0,
|
| 8117 |
-
"language_name":"
|
| 8118 |
-
"autonym":"
|
| 8119 |
-
"family":"
|
| 8120 |
"flores_path":null,
|
| 8121 |
"fleurs_tag":null,
|
| 8122 |
-
"commonvoice_hours":
|
| 8123 |
-
"commonvoice_locale":
|
| 8124 |
"in_benchmark":false
|
| 8125 |
},
|
| 8126 |
{
|
| 8127 |
-
"bcp_47":"
|
| 8128 |
"speakers":0,
|
| 8129 |
-
"language_name":"
|
| 8130 |
-
"autonym":"
|
| 8131 |
-
"family":"
|
| 8132 |
"flores_path":null,
|
| 8133 |
"fleurs_tag":null,
|
| 8134 |
"commonvoice_hours":null,
|
|
|
|
| 485 |
"language_name":"North Levantine Arabic",
|
| 486 |
"autonym":"العامية",
|
| 487 |
"family":"Afro-Asiatic",
|
| 488 |
+
"flores_path":"apc_Arab_nort3139",
|
| 489 |
"fleurs_tag":null,
|
| 490 |
"commonvoice_hours":null,
|
| 491 |
"commonvoice_locale":null,
|
|
|
|
| 876 |
"in_benchmark":true
|
| 877 |
},
|
| 878 |
{
|
| 879 |
+
"bcp_47":"bgc",
|
| 880 |
"speakers":15913080,
|
| 881 |
+
"language_name":"Haryanvi",
|
| 882 |
+
"autonym":"हरियाणवी",
|
| 883 |
"family":"Indo-European",
|
| 884 |
"flores_path":null,
|
| 885 |
"fleurs_tag":null,
|
|
|
|
| 888 |
"in_benchmark":false
|
| 889 |
},
|
| 890 |
{
|
| 891 |
+
"bcp_47":"mwr",
|
| 892 |
"speakers":15913080,
|
| 893 |
+
"language_name":"Marwari",
|
| 894 |
+
"autonym":"Marwari",
|
| 895 |
"family":"Indo-European",
|
| 896 |
"flores_path":null,
|
| 897 |
"fleurs_tag":null,
|
|
|
|
| 1073 |
"language_name":"Akan",
|
| 1074 |
"autonym":"Akan",
|
| 1075 |
"family":"Atlantic-Congo",
|
| 1076 |
+
"flores_path":"twi_Latn_akua1239",
|
| 1077 |
"fleurs_tag":null,
|
| 1078 |
"commonvoice_hours":0.2,
|
| 1079 |
"commonvoice_locale":"tw",
|
|
|
|
| 1171 |
"family":"Afro-Asiatic",
|
| 1172 |
"flores_path":"tir_Ethi",
|
| 1173 |
"fleurs_tag":null,
|
| 1174 |
+
"commonvoice_hours":0.1,
|
| 1175 |
"commonvoice_locale":"ti",
|
| 1176 |
"in_benchmark":true
|
| 1177 |
},
|
|
|
|
| 1195 |
"family":"Atlantic-Congo",
|
| 1196 |
"flores_path":"lua_Latn",
|
| 1197 |
"fleurs_tag":null,
|
| 1198 |
+
"commonvoice_hours":2.2,
|
| 1199 |
"commonvoice_locale":"lua",
|
| 1200 |
"in_benchmark":true
|
| 1201 |
},
|
|
|
|
| 1955 |
"commonvoice_locale":"gom",
|
| 1956 |
"in_benchmark":true
|
| 1957 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1958 |
{
|
| 1959 |
"bcp_47":"kam",
|
| 1960 |
"speakers":4068120,
|
|
|
|
| 1967 |
"commonvoice_locale":"kam",
|
| 1968 |
"in_benchmark":true
|
| 1969 |
},
|
| 1970 |
+
{
|
| 1971 |
+
"bcp_47":"kln",
|
| 1972 |
+
"speakers":4068120,
|
| 1973 |
+
"language_name":"Kalenjin",
|
| 1974 |
+
"autonym":"Kalenjin",
|
| 1975 |
+
"family":"Nilotic",
|
| 1976 |
+
"flores_path":null,
|
| 1977 |
+
"fleurs_tag":null,
|
| 1978 |
+
"commonvoice_hours":43.0,
|
| 1979 |
+
"commonvoice_locale":"kln",
|
| 1980 |
+
"in_benchmark":false
|
| 1981 |
+
},
|
| 1982 |
{
|
| 1983 |
"bcp_47":"bjn",
|
| 1984 |
"speakers":4010288,
|
|
|
|
| 2124 |
"in_benchmark":true
|
| 2125 |
},
|
| 2126 |
{
|
| 2127 |
+
"bcp_47":"lmn",
|
| 2128 |
"speakers":3580443,
|
| 2129 |
+
"language_name":"Lambadi",
|
| 2130 |
+
"autonym":"Lambadi",
|
| 2131 |
"family":"Indo-European",
|
| 2132 |
"flores_path":null,
|
| 2133 |
"fleurs_tag":null,
|
|
|
|
| 2136 |
"in_benchmark":false
|
| 2137 |
},
|
| 2138 |
{
|
| 2139 |
+
"bcp_47":"gbm",
|
| 2140 |
"speakers":3580443,
|
| 2141 |
+
"language_name":"Garhwali",
|
| 2142 |
+
"autonym":"Garhwali",
|
| 2143 |
"family":"Indo-European",
|
| 2144 |
"flores_path":null,
|
| 2145 |
"fleurs_tag":null,
|
|
|
|
| 2352 |
"in_benchmark":true
|
| 2353 |
},
|
| 2354 |
{
|
| 2355 |
+
"bcp_47":"ibb",
|
| 2356 |
"speakers":2996392,
|
| 2357 |
+
"language_name":"Ibibio",
|
| 2358 |
+
"autonym":"Ibibio",
|
| 2359 |
"family":"Atlantic-Congo",
|
| 2360 |
"flores_path":null,
|
| 2361 |
"fleurs_tag":null,
|
| 2362 |
+
"commonvoice_hours":11.0,
|
| 2363 |
+
"commonvoice_locale":"ibb",
|
| 2364 |
"in_benchmark":false
|
| 2365 |
},
|
| 2366 |
{
|
| 2367 |
+
"bcp_47":"efi",
|
| 2368 |
"speakers":2996392,
|
| 2369 |
+
"language_name":"Efik",
|
| 2370 |
+
"autonym":"Efik",
|
| 2371 |
"family":"Atlantic-Congo",
|
| 2372 |
"flores_path":null,
|
| 2373 |
"fleurs_tag":null,
|
| 2374 |
+
"commonvoice_hours":null,
|
| 2375 |
+
"commonvoice_locale":null,
|
| 2376 |
"in_benchmark":false
|
| 2377 |
},
|
| 2378 |
{
|
|
|
|
| 2544 |
"in_benchmark":false
|
| 2545 |
},
|
| 2546 |
{
|
| 2547 |
+
"bcp_47":"sck",
|
| 2548 |
"speakers":2386962,
|
| 2549 |
+
"language_name":"Sadri",
|
| 2550 |
+
"autonym":"Sadri",
|
| 2551 |
+
"family":"Indo-European",
|
| 2552 |
"flores_path":null,
|
| 2553 |
"fleurs_tag":null,
|
| 2554 |
"commonvoice_hours":null,
|
|
|
|
| 2556 |
"in_benchmark":false
|
| 2557 |
},
|
| 2558 |
{
|
| 2559 |
+
"bcp_47":"wbq",
|
| 2560 |
"speakers":2386962,
|
| 2561 |
+
"language_name":"Waddar",
|
| 2562 |
+
"autonym":"Waddar",
|
| 2563 |
+
"family":"Dravidian",
|
| 2564 |
"flores_path":null,
|
| 2565 |
"fleurs_tag":null,
|
| 2566 |
"commonvoice_hours":null,
|
|
|
|
| 2724 |
"in_benchmark":false
|
| 2725 |
},
|
| 2726 |
{
|
| 2727 |
+
"bcp_47":"wbr",
|
| 2728 |
"speakers":1989135,
|
| 2729 |
+
"language_name":"Wagdi",
|
| 2730 |
+
"autonym":"Wagdi",
|
| 2731 |
"family":"Indo-European",
|
| 2732 |
"flores_path":null,
|
| 2733 |
"fleurs_tag":null,
|
|
|
|
| 2748 |
"in_benchmark":false
|
| 2749 |
},
|
| 2750 |
{
|
| 2751 |
+
"bcp_47":"khn",
|
| 2752 |
"speakers":1989135,
|
| 2753 |
+
"language_name":"Khandesi",
|
| 2754 |
+
"autonym":"Khandesi",
|
| 2755 |
"family":"Indo-European",
|
| 2756 |
"flores_path":null,
|
| 2757 |
"fleurs_tag":null,
|
|
|
|
| 3535 |
"family":null,
|
| 3536 |
"flores_path":"eus_Latn",
|
| 3537 |
"fleurs_tag":null,
|
| 3538 |
+
"commonvoice_hours":379.0,
|
| 3539 |
"commonvoice_locale":"eu",
|
| 3540 |
"in_benchmark":true
|
| 3541 |
},
|
|
|
|
| 3559 |
"family":"Abkhaz-Adyge",
|
| 3560 |
"flores_path":null,
|
| 3561 |
"fleurs_tag":null,
|
| 3562 |
+
"commonvoice_hours":62.0,
|
| 3563 |
"commonvoice_locale":"kbd",
|
| 3564 |
"in_benchmark":false
|
| 3565 |
},
|
|
|
|
| 3684 |
"in_benchmark":true
|
| 3685 |
},
|
| 3686 |
{
|
| 3687 |
+
"bcp_47":"bez",
|
| 3688 |
"speakers":995398,
|
| 3689 |
+
"language_name":"Bena",
|
| 3690 |
+
"autonym":"Hibena",
|
| 3691 |
"family":"Atlantic-Congo",
|
| 3692 |
"flores_path":null,
|
| 3693 |
"fleurs_tag":null,
|
|
|
|
| 3696 |
"in_benchmark":false
|
| 3697 |
},
|
| 3698 |
{
|
| 3699 |
+
"bcp_47":"ksb",
|
| 3700 |
"speakers":995398,
|
| 3701 |
+
"language_name":"Shambala",
|
| 3702 |
+
"autonym":"Kishambaa",
|
| 3703 |
"family":"Atlantic-Congo",
|
| 3704 |
"flores_path":null,
|
| 3705 |
"fleurs_tag":null,
|
|
|
|
| 4512 |
"in_benchmark":false
|
| 4513 |
},
|
| 4514 |
{
|
| 4515 |
+
"bcp_47":"nhe",
|
| 4516 |
"speakers":501735,
|
| 4517 |
+
"language_name":"Eastern Huasteca Nahuatl",
|
| 4518 |
+
"autonym":"Eastern Huasteca Nahuatl",
|
| 4519 |
"family":"Uto-Aztecan",
|
| 4520 |
"flores_path":null,
|
| 4521 |
"fleurs_tag":null,
|
| 4522 |
+
"commonvoice_hours":0.0,
|
| 4523 |
+
"commonvoice_locale":"nhe",
|
| 4524 |
"in_benchmark":false
|
| 4525 |
},
|
| 4526 |
{
|
| 4527 |
+
"bcp_47":"nhw",
|
| 4528 |
"speakers":501735,
|
| 4529 |
+
"language_name":"Western Huasteca Nahuatl",
|
| 4530 |
+
"autonym":"Western Huasteca Nahuatl",
|
| 4531 |
"family":"Uto-Aztecan",
|
| 4532 |
"flores_path":null,
|
| 4533 |
"fleurs_tag":null,
|
| 4534 |
+
"commonvoice_hours":null,
|
| 4535 |
+
"commonvoice_locale":null,
|
| 4536 |
"in_benchmark":false
|
| 4537 |
},
|
| 4538 |
{
|
|
|
|
| 4553 |
"language_name":"Kara-Kalpak",
|
| 4554 |
"autonym":"Kara-Kalpak",
|
| 4555 |
"family":"Turkic",
|
| 4556 |
+
"flores_path":"kaa_Latn",
|
| 4557 |
"fleurs_tag":null,
|
| 4558 |
"commonvoice_hours":0.0,
|
| 4559 |
"commonvoice_locale":"kaa",
|
| 4560 |
+
"in_benchmark":true
|
| 4561 |
},
|
| 4562 |
{
|
| 4563 |
"bcp_47":"gju",
|
|
|
|
| 4715 |
"commonvoice_locale":null,
|
| 4716 |
"in_benchmark":false
|
| 4717 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4718 |
{
|
| 4719 |
"bcp_47":"vun",
|
| 4720 |
"speakers":433291,
|
|
|
|
| 4735 |
"family":"Atlantic-Congo",
|
| 4736 |
"flores_path":null,
|
| 4737 |
"fleurs_tag":null,
|
| 4738 |
+
"commonvoice_hours":2.5,
|
| 4739 |
"commonvoice_locale":"rof",
|
| 4740 |
"in_benchmark":false
|
| 4741 |
},
|
| 4742 |
+
{
|
| 4743 |
+
"bcp_47":"jmc",
|
| 4744 |
+
"speakers":433291,
|
| 4745 |
+
"language_name":"Machame",
|
| 4746 |
+
"autonym":"Kimachame",
|
| 4747 |
+
"family":"Atlantic-Congo",
|
| 4748 |
+
"flores_path":null,
|
| 4749 |
+
"fleurs_tag":null,
|
| 4750 |
+
"commonvoice_hours":null,
|
| 4751 |
+
"commonvoice_locale":null,
|
| 4752 |
+
"in_benchmark":false
|
| 4753 |
+
},
|
| 4754 |
{
|
| 4755 |
"bcp_47":"kjg",
|
| 4756 |
"speakers":431949,
|
|
|
|
| 5124 |
"in_benchmark":false
|
| 5125 |
},
|
| 5126 |
{
|
| 5127 |
+
"bcp_47":"bax",
|
| 5128 |
"speakers":332940,
|
| 5129 |
+
"language_name":"Bamun",
|
| 5130 |
+
"autonym":"Bamun",
|
| 5131 |
"family":"Atlantic-Congo",
|
| 5132 |
"flores_path":null,
|
| 5133 |
"fleurs_tag":null,
|
| 5134 |
+
"commonvoice_hours":11.0,
|
| 5135 |
+
"commonvoice_locale":"bax",
|
| 5136 |
"in_benchmark":false
|
| 5137 |
},
|
| 5138 |
{
|
| 5139 |
+
"bcp_47":"bas",
|
| 5140 |
"speakers":332940,
|
| 5141 |
+
"language_name":"Basaa",
|
| 5142 |
+
"autonym":"Ɓàsàa",
|
| 5143 |
"family":"Atlantic-Congo",
|
| 5144 |
"flores_path":null,
|
| 5145 |
"fleurs_tag":null,
|
| 5146 |
+
"commonvoice_hours":12.0,
|
| 5147 |
+
"commonvoice_locale":"bas",
|
| 5148 |
"in_benchmark":false
|
| 5149 |
},
|
| 5150 |
{
|
|
|
|
| 5232 |
"in_benchmark":false
|
| 5233 |
},
|
| 5234 |
{
|
| 5235 |
+
"bcp_47":"njo",
|
| 5236 |
"speakers":305001,
|
| 5237 |
+
"language_name":"Ao Naga",
|
| 5238 |
+
"autonym":"Ao Naga",
|
| 5239 |
+
"family":"Sino-Tibetan",
|
| 5240 |
"flores_path":null,
|
| 5241 |
"fleurs_tag":null,
|
| 5242 |
"commonvoice_hours":null,
|
|
|
|
| 5244 |
"in_benchmark":false
|
| 5245 |
},
|
| 5246 |
{
|
| 5247 |
+
"bcp_47":"bfq",
|
| 5248 |
"speakers":305001,
|
| 5249 |
+
"language_name":"Badaga",
|
| 5250 |
+
"autonym":"Badaga",
|
| 5251 |
+
"family":"Dravidian",
|
| 5252 |
"flores_path":null,
|
| 5253 |
"fleurs_tag":null,
|
| 5254 |
"commonvoice_hours":null,
|
|
|
|
| 5388 |
"in_benchmark":false
|
| 5389 |
},
|
| 5390 |
{
|
| 5391 |
+
"bcp_47":"khb",
|
| 5392 |
"speakers":264864,
|
| 5393 |
+
"language_name":"Lü",
|
| 5394 |
+
"autonym":"Lü",
|
| 5395 |
"family":"Tai-Kadai",
|
| 5396 |
"flores_path":null,
|
| 5397 |
"fleurs_tag":null,
|
|
|
|
| 5400 |
"in_benchmark":false
|
| 5401 |
},
|
| 5402 |
{
|
| 5403 |
+
"bcp_47":"tdd",
|
| 5404 |
"speakers":264864,
|
| 5405 |
+
"language_name":"Tai Nüa",
|
| 5406 |
+
"autonym":"Tai Nüa",
|
| 5407 |
"family":"Tai-Kadai",
|
| 5408 |
"flores_path":null,
|
| 5409 |
"fleurs_tag":null,
|
|
|
|
| 5508 |
"in_benchmark":true
|
| 5509 |
},
|
| 5510 |
{
|
| 5511 |
+
"bcp_47":"mdr",
|
| 5512 |
"speakers":245664,
|
| 5513 |
+
"language_name":"Mandar",
|
| 5514 |
+
"autonym":"Mandar",
|
| 5515 |
"family":"Austronesian",
|
| 5516 |
"flores_path":null,
|
| 5517 |
"fleurs_tag":null,
|
|
|
|
| 5520 |
"in_benchmark":false
|
| 5521 |
},
|
| 5522 |
{
|
| 5523 |
+
"bcp_47":"sxn",
|
| 5524 |
"speakers":245664,
|
| 5525 |
+
"language_name":"Sangir",
|
| 5526 |
+
"autonym":"Sangir",
|
| 5527 |
"family":"Austronesian",
|
| 5528 |
"flores_path":null,
|
| 5529 |
"fleurs_tag":null,
|
|
|
|
| 5904 |
"in_benchmark":false
|
| 5905 |
},
|
| 5906 |
{
|
| 5907 |
+
"bcp_47":"kkj",
|
| 5908 |
"speakers":149823,
|
| 5909 |
+
"language_name":"Kako",
|
| 5910 |
+
"autonym":"Kakɔ",
|
| 5911 |
"family":"Atlantic-Congo",
|
| 5912 |
"flores_path":null,
|
| 5913 |
"fleurs_tag":null,
|
|
|
|
| 5916 |
"in_benchmark":false
|
| 5917 |
},
|
| 5918 |
{
|
| 5919 |
+
"bcp_47":"bss",
|
| 5920 |
"speakers":149823,
|
| 5921 |
+
"language_name":"Akoose",
|
| 5922 |
+
"autonym":"Akoose",
|
| 5923 |
"family":"Atlantic-Congo",
|
| 5924 |
"flores_path":null,
|
| 5925 |
"fleurs_tag":null,
|
|
|
|
| 6367 |
"family":"Indo-European",
|
| 6368 |
"flores_path":null,
|
| 6369 |
"fleurs_tag":null,
|
| 6370 |
+
"commonvoice_hours":4.6,
|
| 6371 |
"commonvoice_locale":"btv",
|
| 6372 |
"in_benchmark":false
|
| 6373 |
},
|
|
|
|
| 7272 |
"in_benchmark":false
|
| 7273 |
},
|
| 7274 |
{
|
| 7275 |
+
"bcp_47":"twq",
|
| 7276 |
"speakers":7970,
|
| 7277 |
+
"language_name":"Tasawaq",
|
| 7278 |
+
"autonym":"Tasawaq Senni",
|
| 7279 |
+
"family":"Songhay",
|
| 7280 |
"flores_path":null,
|
| 7281 |
"fleurs_tag":null,
|
| 7282 |
"commonvoice_hours":null,
|
|
|
|
| 7284 |
"in_benchmark":false
|
| 7285 |
},
|
| 7286 |
{
|
| 7287 |
+
"bcp_47":"bku",
|
| 7288 |
"speakers":7970,
|
| 7289 |
+
"language_name":"Buhid",
|
| 7290 |
+
"autonym":"Buhid",
|
| 7291 |
+
"family":"Austronesian",
|
| 7292 |
"flores_path":null,
|
| 7293 |
"fleurs_tag":null,
|
| 7294 |
"commonvoice_hours":null,
|
|
|
|
| 7836 |
"in_benchmark":false
|
| 7837 |
},
|
| 7838 |
{
|
| 7839 |
+
"bcp_47":"kwk",
|
| 7840 |
"speakers":377,
|
| 7841 |
+
"language_name":"Kwakʼwala",
|
| 7842 |
+
"autonym":"KwakʼWala",
|
| 7843 |
+
"family":"Wakashan",
|
| 7844 |
"flores_path":null,
|
| 7845 |
"fleurs_tag":null,
|
| 7846 |
"commonvoice_hours":null,
|
|
|
|
| 7848 |
"in_benchmark":false
|
| 7849 |
},
|
| 7850 |
{
|
| 7851 |
+
"bcp_47":"crl",
|
| 7852 |
"speakers":377,
|
| 7853 |
+
"language_name":"Northern East Cree",
|
| 7854 |
+
"autonym":"Northern East Cree",
|
| 7855 |
+
"family":"Algic",
|
| 7856 |
"flores_path":null,
|
| 7857 |
"fleurs_tag":null,
|
| 7858 |
"commonvoice_hours":null,
|
|
|
|
| 7968 |
"in_benchmark":false
|
| 7969 |
},
|
| 7970 |
{
|
| 7971 |
+
"bcp_47":"lzh",
|
| 7972 |
"speakers":0,
|
| 7973 |
+
"language_name":"Literary Chinese",
|
| 7974 |
+
"autonym":"Literary Chinese",
|
| 7975 |
+
"family":"Sino-Tibetan",
|
| 7976 |
"flores_path":null,
|
| 7977 |
"fleurs_tag":null,
|
| 7978 |
"commonvoice_hours":null,
|
|
|
|
| 7980 |
"in_benchmark":false
|
| 7981 |
},
|
| 7982 |
{
|
| 7983 |
+
"bcp_47":"io",
|
| 7984 |
"speakers":0,
|
| 7985 |
+
"language_name":"Ido",
|
| 7986 |
+
"autonym":"Ido",
|
| 7987 |
+
"family":"Artificial Language",
|
| 7988 |
"flores_path":null,
|
| 7989 |
"fleurs_tag":null,
|
| 7990 |
"commonvoice_hours":null,
|
|
|
|
| 7992 |
"in_benchmark":false
|
| 7993 |
},
|
| 7994 |
{
|
| 7995 |
+
"bcp_47":"jbo",
|
| 7996 |
"speakers":0,
|
| 7997 |
+
"language_name":"Lojban",
|
| 7998 |
+
"autonym":"La .Lojban.",
|
| 7999 |
+
"family":"Artificial Language",
|
| 8000 |
"flores_path":null,
|
| 8001 |
"fleurs_tag":null,
|
| 8002 |
+
"commonvoice_hours":0.0,
|
| 8003 |
+
"commonvoice_locale":"jbo",
|
| 8004 |
"in_benchmark":false
|
| 8005 |
},
|
| 8006 |
{
|
| 8007 |
+
"bcp_47":"jut",
|
| 8008 |
"speakers":0,
|
| 8009 |
+
"language_name":"Jutish",
|
| 8010 |
+
"autonym":"Jutish",
|
| 8011 |
"family":"Indo-European",
|
| 8012 |
"flores_path":null,
|
| 8013 |
"fleurs_tag":null,
|
|
|
|
| 8016 |
"in_benchmark":false
|
| 8017 |
},
|
| 8018 |
{
|
| 8019 |
+
"bcp_47":"vot",
|
| 8020 |
"speakers":0,
|
| 8021 |
+
"language_name":"Votic",
|
| 8022 |
+
"autonym":"Votic",
|
| 8023 |
+
"family":"Uralic",
|
| 8024 |
"flores_path":null,
|
| 8025 |
"fleurs_tag":null,
|
| 8026 |
+
"commonvoice_hours":0.1,
|
| 8027 |
+
"commonvoice_locale":"vot",
|
| 8028 |
"in_benchmark":false
|
| 8029 |
},
|
| 8030 |
{
|
| 8031 |
+
"bcp_47":"gez",
|
| 8032 |
"speakers":0,
|
| 8033 |
+
"language_name":"Geez",
|
| 8034 |
+
"autonym":"Geez",
|
| 8035 |
+
"family":"Afro-Asiatic",
|
| 8036 |
"flores_path":null,
|
| 8037 |
"fleurs_tag":null,
|
| 8038 |
"commonvoice_hours":null,
|
|
|
|
| 8040 |
"in_benchmark":false
|
| 8041 |
},
|
| 8042 |
{
|
| 8043 |
+
"bcp_47":"osa",
|
| 8044 |
"speakers":0,
|
| 8045 |
+
"language_name":"Osage",
|
| 8046 |
+
"autonym":"𐓏𐓘𐓻𐓘𐓻𐓟",
|
| 8047 |
+
"family":"Siouan",
|
| 8048 |
"flores_path":null,
|
| 8049 |
"fleurs_tag":null,
|
| 8050 |
+
"commonvoice_hours":null,
|
| 8051 |
+
"commonvoice_locale":null,
|
| 8052 |
"in_benchmark":false
|
| 8053 |
},
|
| 8054 |
{
|
| 8055 |
+
"bcp_47":"rgn",
|
| 8056 |
"speakers":0,
|
| 8057 |
+
"language_name":"Romagnol",
|
| 8058 |
+
"autonym":"Romagnol",
|
| 8059 |
+
"family":"Indo-European",
|
| 8060 |
"flores_path":null,
|
| 8061 |
"fleurs_tag":null,
|
| 8062 |
"commonvoice_hours":null,
|
|
|
|
| 8064 |
"in_benchmark":false
|
| 8065 |
},
|
| 8066 |
{
|
| 8067 |
+
"bcp_47":"cu",
|
| 8068 |
"speakers":0,
|
| 8069 |
+
"language_name":"Church Slavic",
|
| 8070 |
+
"autonym":"Church Slavic",
|
| 8071 |
"family":"Indo-European",
|
| 8072 |
"flores_path":null,
|
| 8073 |
"fleurs_tag":null,
|
|
|
|
| 8076 |
"in_benchmark":false
|
| 8077 |
},
|
| 8078 |
{
|
| 8079 |
+
"bcp_47":"sgs",
|
| 8080 |
"speakers":0,
|
| 8081 |
+
"language_name":"Samogitian",
|
| 8082 |
+
"autonym":"Samogitian",
|
| 8083 |
+
"family":"Indo-European",
|
| 8084 |
"flores_path":null,
|
| 8085 |
"fleurs_tag":null,
|
| 8086 |
"commonvoice_hours":null,
|
|
|
|
| 8088 |
"in_benchmark":false
|
| 8089 |
},
|
| 8090 |
{
|
| 8091 |
+
"bcp_47":"ann",
|
| 8092 |
"speakers":0,
|
| 8093 |
+
"language_name":"Obolo",
|
| 8094 |
+
"autonym":"Obolo",
|
| 8095 |
+
"family":"Atlantic-Congo",
|
| 8096 |
"flores_path":null,
|
| 8097 |
"fleurs_tag":null,
|
| 8098 |
"commonvoice_hours":null,
|
|
|
|
| 8112 |
"in_benchmark":false
|
| 8113 |
},
|
| 8114 |
{
|
| 8115 |
+
"bcp_47":"cad",
|
| 8116 |
"speakers":0,
|
| 8117 |
+
"language_name":"Caddo",
|
| 8118 |
+
"autonym":"Caddo",
|
| 8119 |
+
"family":"Caddoan",
|
| 8120 |
"flores_path":null,
|
| 8121 |
"fleurs_tag":null,
|
| 8122 |
+
"commonvoice_hours":null,
|
| 8123 |
+
"commonvoice_locale":null,
|
| 8124 |
"in_benchmark":false
|
| 8125 |
},
|
| 8126 |
{
|
| 8127 |
+
"bcp_47":"pfl",
|
| 8128 |
"speakers":0,
|
| 8129 |
+
"language_name":"Palatine German",
|
| 8130 |
+
"autonym":"Palatine German",
|
| 8131 |
+
"family":"Indo-European",
|
| 8132 |
"flores_path":null,
|
| 8133 |
"fleurs_tag":null,
|
| 8134 |
"commonvoice_hours":null,
|
models.json
CHANGED
|
@@ -32,28 +32,6 @@
|
|
| 32 |
"license":"Mit",
|
| 33 |
"creation_date":1742774400000
|
| 34 |
},
|
| 35 |
-
{
|
| 36 |
-
"id":"deepseek\/deepseek-r1",
|
| 37 |
-
"name":"R1 (free)",
|
| 38 |
-
"provider_name":"DeepSeek",
|
| 39 |
-
"cost":0.0,
|
| 40 |
-
"hf_id":"deepseek-ai\/DeepSeek-R1",
|
| 41 |
-
"size":684531386000.0,
|
| 42 |
-
"type":"Open",
|
| 43 |
-
"license":"Mit",
|
| 44 |
-
"creation_date":1737331200000
|
| 45 |
-
},
|
| 46 |
-
{
|
| 47 |
-
"id":"google\/gemini-2.0-flash-001",
|
| 48 |
-
"name":"Gemini 2.0 Flash",
|
| 49 |
-
"provider_name":"Google",
|
| 50 |
-
"cost":0.4,
|
| 51 |
-
"hf_id":null,
|
| 52 |
-
"size":null,
|
| 53 |
-
"type":"Commercial",
|
| 54 |
-
"license":null,
|
| 55 |
-
"creation_date":1738713600000
|
| 56 |
-
},
|
| 57 |
{
|
| 58 |
"id":"google\/gemini-2.0-flash-lite-001",
|
| 59 |
"name":"Gemini 2.0 Flash Lite",
|
|
@@ -76,28 +54,6 @@
|
|
| 76 |
"license":null,
|
| 77 |
"creation_date":1744848000000
|
| 78 |
},
|
| 79 |
-
{
|
| 80 |
-
"id":"google\/gemini-flash-1.5",
|
| 81 |
-
"name":"Gemini 1.5 Flash ",
|
| 82 |
-
"provider_name":"Google",
|
| 83 |
-
"cost":0.3,
|
| 84 |
-
"hf_id":null,
|
| 85 |
-
"size":null,
|
| 86 |
-
"type":"Commercial",
|
| 87 |
-
"license":null,
|
| 88 |
-
"creation_date":1715644800000
|
| 89 |
-
},
|
| 90 |
-
{
|
| 91 |
-
"id":"google\/gemini-flash-1.5-8b",
|
| 92 |
-
"name":"Gemini 1.5 Flash 8B",
|
| 93 |
-
"provider_name":"Google",
|
| 94 |
-
"cost":0.15,
|
| 95 |
-
"hf_id":null,
|
| 96 |
-
"size":null,
|
| 97 |
-
"type":"Commercial",
|
| 98 |
-
"license":null,
|
| 99 |
-
"creation_date":1727913600000
|
| 100 |
-
},
|
| 101 |
{
|
| 102 |
"id":"google\/gemma-3-27b-it",
|
| 103 |
"name":"Gemma 3 27B (free)",
|
|
@@ -109,17 +65,6 @@
|
|
| 109 |
"license":"Gemma",
|
| 110 |
"creation_date":1740787200000
|
| 111 |
},
|
| 112 |
-
{
|
| 113 |
-
"id":"gryphe\/mythomax-l2-13b",
|
| 114 |
-
"name":"MythoMax 13B",
|
| 115 |
-
"provider_name":"MythoMax 13B",
|
| 116 |
-
"cost":0.07,
|
| 117 |
-
"hf_id":"Gryphe\/MythoMax-L2-13b",
|
| 118 |
-
"size":null,
|
| 119 |
-
"type":"Open",
|
| 120 |
-
"license":"Other",
|
| 121 |
-
"creation_date":1691625600000
|
| 122 |
-
},
|
| 123 |
{
|
| 124 |
"id":"meta-llama\/llama-3-70b-instruct",
|
| 125 |
"name":"Llama 3 70B Instruct",
|
|
|
|
| 32 |
"license":"Mit",
|
| 33 |
"creation_date":1742774400000
|
| 34 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
{
|
| 36 |
"id":"google\/gemini-2.0-flash-lite-001",
|
| 37 |
"name":"Gemini 2.0 Flash Lite",
|
|
|
|
| 54 |
"license":null,
|
| 55 |
"creation_date":1744848000000
|
| 56 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
{
|
| 58 |
"id":"google\/gemma-3-27b-it",
|
| 59 |
"name":"Gemma 3 27B (free)",
|
|
|
|
| 65 |
"license":"Gemma",
|
| 66 |
"creation_date":1740787200000
|
| 67 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
{
|
| 69 |
"id":"meta-llama\/llama-3-70b-instruct",
|
| 70 |
"name":"Llama 3 70B Instruct",
|
results.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|