David Pomerenke
committed on
Commit
·
4746aca
1
Parent(s):
678e066
Rough draft of individual language view
Browse files
app.py
CHANGED
|
@@ -798,6 +798,7 @@ with gr.Blocks(title="AI Language Proficiency Benchmark", css=css, head=shortcut
|
|
| 798 |
|
| 799 |
for lang in tqdm(languages[:20], desc="Generating pages"):
|
| 800 |
with demo.route(lang['language_name'], f"/{lang['bcp_47']}"):
|
|
|
|
| 801 |
url = f"hf.co/spaces/datenlaborbmz/ai-language-monitor?lang={lang['bcp_47']}"
|
| 802 |
gr.Markdown(
|
| 803 |
f'''
|
|
@@ -808,6 +809,167 @@ for lang in tqdm(languages[:20], desc="Generating pages"):
|
|
| 808 |
''',
|
| 809 |
sanitize_html=False
|
| 810 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 811 |
|
| 812 |
|
| 813 |
demo.launch()
|
|
|
|
| 798 |
|
| 799 |
for lang in tqdm(languages[:20], desc="Generating pages"):
|
| 800 |
with demo.route(lang['language_name'], f"/{lang['bcp_47']}"):
|
| 801 |
+
gr.Button("← Back to Main Dashboard", link="/")
|
| 802 |
url = f"hf.co/spaces/datenlaborbmz/ai-language-monitor?lang={lang['bcp_47']}"
|
| 803 |
gr.Markdown(
|
| 804 |
f'''
|
|
|
|
| 809 |
''',
|
| 810 |
sanitize_html=False
|
| 811 |
)
|
| 812 |
+
|
| 813 |
+
# Language overview section
|
| 814 |
+
with gr.Row():
|
| 815 |
+
with gr.Column(scale=2):
|
| 816 |
+
gr.Markdown(f"""
|
| 817 |
+
## Language Overview
|
| 818 |
+
- **Native name**: {lang.get('native_name', 'N/A')}
|
| 819 |
+
- **Language family**: {lang.get('language_family', 'N/A')}
|
| 820 |
+
- **BCP-47 code**: `{lang['bcp_47']}`
|
| 821 |
+
- **ISO 639-3 code**: `{lang.get('iso_639_3', 'N/A')}`
|
| 822 |
+
- **Number of speakers**: {format_number(lang['speakers'])}
|
| 823 |
+
- **Script**: {lang.get('script', 'N/A')}
|
| 824 |
+
- **CommonVoice hours**: {round(lang.get('commonvoice_hours', 0) or 0)}
|
| 825 |
+
""")
|
| 826 |
+
|
| 827 |
+
# Resource links
|
| 828 |
+
resource_links = []
|
| 829 |
+
if lang.get('commonvoice_locale'):
|
| 830 |
+
resource_links.append(f"[CommonVoice Dataset](https://commonvoice.mozilla.org/{lang['commonvoice_locale']})")
|
| 831 |
+
if lang.get('wikipedia_code'):
|
| 832 |
+
resource_links.append(f"[Wikipedia](https://{lang['wikipedia_code']}.wikipedia.org)")
|
| 833 |
+
if lang.get('bcp_47'):
|
| 834 |
+
resource_links.append(f"[FLORES+ Dataset](https://huggingface.co/datasets/openlanguagedata/flores_plus/viewer/all/{lang['bcp_47']})")
|
| 835 |
+
|
| 836 |
+
if resource_links:
|
| 837 |
+
gr.Markdown("### Resources\n" + "\n".join(resource_links))
|
| 838 |
+
|
| 839 |
+
with gr.Column(scale=3):
|
| 840 |
+
# Create a mini-map showing where the language is spoken
|
| 841 |
+
country_data = {}
|
| 842 |
+
if "population" in lang:
|
| 843 |
+
for country_code, speakers in lang["population"].items():
|
| 844 |
+
try:
|
| 845 |
+
country = pycountry.countries.get(alpha_2=country_code)
|
| 846 |
+
if country:
|
| 847 |
+
country_data[country.alpha_3] = speakers / lang["speakers"]
|
| 848 |
+
except (KeyError, AttributeError):
|
| 849 |
+
continue
|
| 850 |
+
|
| 851 |
+
locations = list(country_data.keys())
|
| 852 |
+
values = list(country_data.values())
|
| 853 |
+
|
| 854 |
+
if locations:
|
| 855 |
+
fig = go.Figure(data=go.Choropleth(
|
| 856 |
+
locations=locations,
|
| 857 |
+
z=values,
|
| 858 |
+
locationmode="ISO-3",
|
| 859 |
+
colorscale="Blues",
|
| 860 |
+
marker_line_color='white',
|
| 861 |
+
marker_line_width=0.5,
|
| 862 |
+
colorbar_title="Speaker %"
|
| 863 |
+
))
|
| 864 |
+
|
| 865 |
+
fig.update_layout(
|
| 866 |
+
title_text=f"Distribution of {lang['language_name']} Speakers",
|
| 867 |
+
geo=dict(
|
| 868 |
+
showframe=False,
|
| 869 |
+
showcoastlines=True,
|
| 870 |
+
projection_type='natural earth'
|
| 871 |
+
),
|
| 872 |
+
height=300,
|
| 873 |
+
margin={"r":0,"t":30,"l":0,"b":0}
|
| 874 |
+
)
|
| 875 |
+
|
| 876 |
+
gr.Plot(value=fig)
|
| 877 |
+
else:
|
| 878 |
+
gr.Markdown("*Geographic data not available*")
|
| 879 |
+
|
| 880 |
+
# Performance metrics section
|
| 881 |
+
gr.Markdown("## AI Model Performance")
|
| 882 |
+
|
| 883 |
+
with gr.Row():
|
| 884 |
+
with gr.Column():
|
| 885 |
+
# Create metrics dashboard for this language
|
| 886 |
+
metrics_data = []
|
| 887 |
+
for metric_key, display_name in [
|
| 888 |
+
("t2t_score", "Overall Text Performance"),
|
| 889 |
+
("mt_bleu", "Translation (BLEU)"),
|
| 890 |
+
("mt_chrf", "Translation (ChrF)"),
|
| 891 |
+
("cls_acc", "Classification"),
|
| 892 |
+
("mlm_chrf", "Masked Language Modeling"),
|
| 893 |
+
("s2t_score", "Overall Speech Performance"),
|
| 894 |
+
("asr_wer", "Speech Recognition (WER)"),
|
| 895 |
+
("asr_chrf", "Speech Recognition (ChrF)")
|
| 896 |
+
]:
|
| 897 |
+
if metric_key in lang and lang[metric_key] is not None:
|
| 898 |
+
value = lang[metric_key]
|
| 899 |
+
color = "green" if value > 0.5 else "orange" if value > 0.25 else "red"
|
| 900 |
+
|
| 901 |
+
# For WER, lower is better, so invert the color logic
|
| 902 |
+
if metric_key == "asr_wer":
|
| 903 |
+
color = "green" if value < 0.3 else "orange" if value < 0.6 else "red"
|
| 904 |
+
|
| 905 |
+
metrics_data.append({
|
| 906 |
+
"Metric": display_name,
|
| 907 |
+
"Value": round(value, 3),
|
| 908 |
+
"Visual": make_colored_bar(value if metric_key != "asr_wer" else 1 - value)
|
| 909 |
+
})
|
| 910 |
+
|
| 911 |
+
if metrics_data:
|
| 912 |
+
gr.DataFrame(
|
| 913 |
+
pd.DataFrame(metrics_data),
|
| 914 |
+
label=f"Performance Metrics for {lang['language_name']}",
|
| 915 |
+
show_search=False
|
| 916 |
+
)
|
| 917 |
+
else:
|
| 918 |
+
gr.Markdown("*No performance metrics available*")
|
| 919 |
+
|
| 920 |
+
# Model comparison table
|
| 921 |
+
gr.Markdown("## Model Comparison")
|
| 922 |
+
|
| 923 |
+
with gr.Row():
|
| 924 |
+
models_data = []
|
| 925 |
+
for score in lang["scores"]:
|
| 926 |
+
if score.get("t2t_score") is not None:
|
| 927 |
+
model_name = score["model"].split("/")[-1]
|
| 928 |
+
models_data.append({
|
| 929 |
+
"Model": model_name,
|
| 930 |
+
"Overall": round(score.get("t2t_score", 0), 3),
|
| 931 |
+
"Translation": round(score.get("mt_chrf", 0), 3),
|
| 932 |
+
"Classification": round(score.get("cls_acc", 0), 3),
|
| 933 |
+
"Lang Model": round(score.get("mlm_chrf", 0), 3),
|
| 934 |
+
"Speech": round(score.get("asr_chrf", 0), 3) if "asr_chrf" in score else "N/A"
|
| 935 |
+
})
|
| 936 |
+
|
| 937 |
+
if models_data:
|
| 938 |
+
df = pd.DataFrame(models_data).sort_values("Overall", ascending=False)
|
| 939 |
+
gr.DataFrame(
|
| 940 |
+
df,
|
| 941 |
+
label=f"Model Performance on {lang['language_name']}",
|
| 942 |
+
show_search=False
|
| 943 |
+
)
|
| 944 |
+
else:
|
| 945 |
+
gr.Markdown("*No model comparison data available*")
|
| 946 |
+
|
| 947 |
+
# Performance comparison with similar languages
|
| 948 |
+
if lang.get("language_family"):
|
| 949 |
+
gr.Markdown("## Comparison with Related Languages")
|
| 950 |
+
|
| 951 |
+
# Find related languages
|
| 952 |
+
related_langs = [l for l in languages if l.get("language_family") == lang["language_family"] and l["t2t_score"] is not None]
|
| 953 |
+
related_langs = sorted(related_langs, key=lambda x: x["t2t_score"], reverse=True)[:10]
|
| 954 |
+
|
| 955 |
+
if len(related_langs) > 1:
|
| 956 |
+
lang_names = [l["language_name"] for l in related_langs]
|
| 957 |
+
t2t_scores = [l["t2t_score"] for l in related_langs]
|
| 958 |
+
|
| 959 |
+
fig = px.bar(
|
| 960 |
+
x=lang_names,
|
| 961 |
+
y=t2t_scores,
|
| 962 |
+
labels={"x": "Language", "y": "Text-to-Text Score"},
|
| 963 |
+
title=f"Performance Across {lang['language_family']} Languages"
|
| 964 |
+
)
|
| 965 |
+
|
| 966 |
+
# Highlight the current language
|
| 967 |
+
for i, name in enumerate(lang_names):
|
| 968 |
+
if name == lang["language_name"]:
|
| 969 |
+
fig.data[0].marker.color = ["lightblue"] * i + ["orange"] + ["lightblue"] * (len(lang_names) - i - 1)
|
| 970 |
+
|
| 971 |
+
fig.update_layout(height=400)
|
| 972 |
+
gr.Plot(value=fig)
|
| 973 |
|
| 974 |
|
| 975 |
demo.launch()
|