David Pomerenke committed
Commit e9a19be · 1 parent: 040dc35

Separate overall scores for T2T / S2T
Files changed:
- app.py (+60, -37)
- evals.py (+5, -4)
- results.json (+0, -0)
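The commit replaces the single overall score with two aggregates: t2t_score for the text-to-text tasks (translation, classification, masked language modeling) and s2t_score for the speech-to-text task (ASR). Below is a minimal sketch of the two formulas exactly as they appear in the evals.py hunks further down; the wrapper functions and example values are illustrative only and not part of the commit:

def t2t_score(mt_chrf, cls_acc, mlm_chrf):
    # The division by 100 suggests ChrF is stored on a 0-100 scale while
    # accuracy is already 0-1; the three text-to-text metrics are averaged.
    return (mt_chrf / 100 + cls_acc + mlm_chrf / 100) / 3

def s2t_score(asr_wer, asr_chrf):
    # Plain average of the two speech-to-text metrics, as written in evals.py.
    return (asr_wer + asr_chrf) / 2

print(t2t_score(45.0, 0.60, 60.0))  # (0.45 + 0.60 + 0.60) / 3 = 0.55
print(s2t_score(0.30, 0.70))        # (0.30 + 0.70) / 2 = 0.50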
app.py
CHANGED
@@ -8,20 +8,24 @@ import plotly.graph_objects as go
 import pycountry
 
 with open("results.json") as f:
-    …
+    languages = json.load(f)
+
+languages_with_scores = [
+    lang for lang in languages if lang["t2t_score"] is not None
+]
 
 # Global constants for metric mappings
-METRICS = {
-    …
-        "display_name": "Overall Performance",
-        "field_name": "…
-        "label": "Overall…
+METRICS = [
+    {
+        "display_name": "Overall Text-to-Text Performance",
+        "field_name": "t2t_score",
+        "label": "Overall Score",
         "explanation": """
-        **Overall Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
+        **Overall Score for Text-to-Text Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
         Higher scores indicate better overall language capabilities.
         """,
     },
-    …
+    {
         "display_name": "Translation (BLEU)",
         "field_name": "mt_bleu",
         "label": "BLEU Score",
@@ -30,7 +34,7 @@ METRICS = {
         It calculates n-gram precision and applies a brevity penalty. Scores range from 0 to 1, with higher values indicating better translation quality.
         """,
     },
-    …
+    {
         "display_name": "Translation (ChrF)",
         "field_name": "mt_chrf",
         "label": "ChrF Score",
@@ -40,7 +44,7 @@ METRICS = {
         Higher scores (0-1) indicate better translations.
         """,
     },
-    …
+    {
         "display_name": "Classification (Accuracy)",
         "field_name": "cls_acc",
         "label": "Classification Accuracy",
@@ -50,7 +54,7 @@ METRICS = {
         Reported as a percentage where higher values indicate better classification performance.
         """,
     },
-    …
+    {
         "display_name": "Masked Language Modeling (ChrF)",
         "field_name": "mlm_chrf",
         "label": "MLM ChrF Score",
@@ -60,7 +64,16 @@ METRICS = {
         between predicted and actual text. Higher scores indicate better language understanding.
         """,
     },
-    …
+    {
+        "display_name": "Overall Speech-to-Text Performance",
+        "field_name": "s2t_score",
+        "label": "Overall Score",
+        "explanation": """
+        **Overall Score for Speech-to-Text Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
+        Higher scores indicate better overall language capabilities.
+        """,
+    },
+    {
         "display_name": "Automatic Speech Recognition (WER)",
         "field_name": "asr_wer",
         "label": "WER",
@@ -71,7 +84,7 @@ METRICS = {
         Lower scores indicate better performance, with 0 being perfect transcription.
         """,
     },
-    …
+    {
         "display_name": "Automatic Speech Recognition ChrF",
         "field_name": "asr_chrf",
         "label": "ChrF",
@@ -80,8 +93,8 @@ METRICS = {
         This metric is particularly valuable for morphologically rich languages and can better capture partial word matches.
         Higher scores (0-1) indicate better translations.
         """,
-    },
-}
+    },
+]
 
 
 def mean(lst):
@@ -91,7 +104,7 @@ def mean(lst):
 def create_leaderboard_df(metric):
     # Sort languages by average BLEU to determine resource categories
     langs_with_score = [
-        lang for lang in…
+        lang for lang in languages_with_scores if lang[metric["field_name"]] is not None
     ]
     sorted_langs = sorted(
         langs_with_score, key=lambda x: x[metric["field_name"]], reverse=True
@@ -106,7 +119,7 @@ def create_leaderboard_df(metric):
 
     # Get all model scores with categorization
     model_scores = {}
-    for lang in…
+    for lang in languages_with_scores:
         category = (
             "High-Resource"
             if lang["language_name"] in high_resource
@@ -205,7 +218,7 @@ def create_leaderboard_df(metric):
 
 
 def create_model_comparison_plot(metric):
-    top_languages = sorted(…
+    top_languages = sorted(languages_with_scores, key=lambda x: x["speakers"], reverse=True)[:10]
 
     # Create appropriate title and y-axis label based on metric
     title = f"{metric['display_name']} by Model and Language"
@@ -251,14 +264,14 @@ def create_language_stats_df(metric):
     # Create a list to store flattened data
     flat_data = []
 
-    for lang in…
+    for lang in languages:
         # Find the best model and its BLEU score
         best_model = max(
-            lang["scores"] or [{"…
-            key=lambda x: x…
-        )
+            lang["scores"] or [{"t2t_score": None, "model": None}],
+            key=lambda x: x.get("t2t_score", 0),
+        ) if lang["t2t_score"] is not None else None
 
-        model = best_model["model"]
+        model = best_model["model"] if best_model else None
         model_name = model.split("/")[-1] if model else "N/A"
         model_link = (
             f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>"
@@ -274,9 +287,9 @@ def create_language_stats_df(metric):
                 "Language": f"**{lang['language_name']}**",
                 "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
                 # "Models Tested": len(lang["scores"]),
-                "Overall": round(lang["overall_score"], 3)
-                if lang["overall_score"] is not None
-                else "N/A",
+                # "Overall": round(lang["overall_score"], 3)
+                # if lang["overall_score"] is not None
+                # else "N/A",
                 "Translation": round(lang["mt_bleu"], 3)
                 if lang["mt_bleu"] is not None
                 else "N/A",
@@ -286,9 +299,7 @@ def create_language_stats_df(metric):
                 "MLM": round(lang["mlm_chrf"], 3)
                 if lang["mlm_chrf"] is not None
                 else "N/A",
-                "ASR": round(lang["asr_wer"], 3)
-                if lang["asr_wer"] is not None
-                else "N/A",
+                "ASR": round(lang["asr_wer"], 3) if lang["asr_wer"] is not None else "N/A",
                 "Best Model": model_link,
                 "CommonVoice Hours": commonvoice_link,
             }
@@ -296,9 +307,22 @@ def create_language_stats_df(metric):
 
     df = pd.DataFrame(flat_data)
     return gr.DataFrame(
-        value=df,
+        value=df,
         label="Language Results",
        show_search="search",
+        pinned_columns=1,
+        column_widths=[
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+        ],
         datatype=[
             "markdown",  # Language
             "number",  # Speakers
@@ -316,7 +340,7 @@ def create_language_stats_df(metric):
 
 def create_scatter_plot(metric):
     # Filter results to include only languages with sufficient speakers
-    filtered_results = [lang for lang in…
+    filtered_results = [lang for lang in languages_with_scores if lang["speakers"] >= 10_000]
 
     # Create a list to store data for the scatter plot
     scatter_data = []
@@ -434,7 +458,7 @@ def create_world_map(metric):
     # Collect all country data
     population_data = get_population_data()
     country_data = {}
-    for lang in…
+    for lang in languages:
         # Skip languages without the required data
         if "population" not in lang or lang[metric["field_name"]] is None:
             continue
@@ -585,10 +609,10 @@ def create_metric_explanation(metric):
 with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     gr.Markdown("# AI Language Proficiency Benchmark")
     gr.Markdown("Comparing language proficiency across different models and languages.")
-    start_metric = METRICS[…
+    start_metric = METRICS[0]
 
     metric = gr.Dropdown(
-        choices=[metric_info["display_name"] for metric_info in METRICS…
+        choices=[metric_info["display_name"] for metric_info in METRICS],
         value=start_metric["display_name"],
         label="Select Metric",
         interactive=True,
@@ -596,7 +620,7 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     metric_explanation = create_metric_explanation(start_metric)
 
     gr.Markdown("## Model Comparison")
-    create_leaderboard_df(start_metric)
+    # create_leaderboard_df(start_metric)
     model_comparison_plot = gr.Plot(
         value=create_model_comparison_plot(start_metric),
         label="Model Comparison",
@@ -652,10 +676,9 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     )
 
     def update_component(fn, metric_choice):
-        metric = [m for m in METRICS…
+        metric = [m for m in METRICS if m["display_name"] == metric_choice][0]
         return fn(metric)
 
-
     metric.change(
         fn=partial(update_component, create_metric_explanation),
         inputs=metric,
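The explanation strings in METRICS describe BLEU, ChrF and WER, but this diff does not show how the per-sample scores are produced. For reference only, a hypothetical sketch using sacrebleu and jiwer; the library choice and the sentences are assumptions, not taken from this commit:

import sacrebleu  # assumption: scoring library not shown in this commit
import jiwer      # assumption: scoring library not shown in this commit

reference = "the cat sat on the mat"
hypothesis = "the cat is on the mat"

# Sentence-level BLEU and ChrF; sacrebleu reports both on a 0-100 scale.
bleu = sacrebleu.sentence_bleu(hypothesis, [reference]).score / 100
chrf = sacrebleu.sentence_chrf(hypothesis, [reference]).score / 100

# Word error rate; 0 means a perfect transcription.
wer = jiwer.wer(reference, hypothesis)

print(round(bleu, 3), round(chrf, 3), round(wer, 3))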
evals.py
CHANGED
@@ -522,7 +522,7 @@ async def main():
             mt_chrf = mean([s["mt_chrf"] for s in scores_mt])
             cls_acc = mean([s["true"] == s["pred"] for s in scores_cls])
             mlm_chrf = mean([s["mlm_chrf"] for s in scores_mlm])
-            …
+            t2t_score = (mt_chrf / 100 + cls_acc + mlm_chrf / 100) / 3
             results.append(
                 {
                     "model": model,
@@ -531,7 +531,7 @@ async def main():
                     "mt_chrf": mt_chrf,
                     "cls_acc": cls_acc,
                     "mlm_chrf": mlm_chrf,
-                    "…
+                    "t2t_score": t2t_score,
                 }
             )
         for model in transcription_models:
@@ -550,7 +550,7 @@ async def main():
                     "model_type": "speech-to-text",
                     "asr_wer": asr_wer,
                     "asr_chrf": asr_chrf,
-                    "…
+                    "s2t_score": (asr_wer + asr_chrf) / 2,
                 }
             )
         language_results = {
@@ -574,7 +574,8 @@ async def main():
             "mlm_chrf",
             "asr_wer",
             "asr_chrf",
-            "…
+            "t2t_score",
+            "s2t_score",
         ]:
             language_results[score] = mean(
                 [s[score] for s in results if score in s]
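The results.json diff itself is too large to render below, but the fields written by evals.py and read by app.py imply a per-language record roughly shaped like the following; every value here is a hypothetical placeholder:

example_language = {
    "language_name": "Swahili",
    "speakers": 71_000_000,
    "mt_bleu": 0.21,
    "mt_chrf": 38.5,
    "cls_acc": 0.55,
    "mlm_chrf": 52.0,
    "asr_wer": 0.41,
    "asr_chrf": 0.63,
    "t2t_score": 0.49,  # new aggregate, replaces the old overall score
    "s2t_score": 0.52,  # new aggregate
    "scores": [  # per-model results; app.py picks the best model by "t2t_score"
        {"model": "example-org/example-model", "t2t_score": 0.51},
    ],
    # an optional "population" mapping is also checked by create_world_map
}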
results.json
CHANGED
The diff for this file is too large to render. See raw diff.