# app.py
import subprocess
import sys
import os
from pathlib import Path
def setup_salt():
"""Clone and setup SALT library like in Colab."""
try:
# Check if salt is already available
import salt.dataset
print("β
SALT library already available")
return True
except ImportError:
pass
print("π₯ Setting up SALT library...")
try:
# Clone SALT repo if not exists
salt_dir = Path("salt")
if not salt_dir.exists():
print("π Cloning SALT repository...")
subprocess.check_call([
"git", "clone", "https://github.com/sunbirdai/salt.git"
])
else:
print("π SALT repository already exists")
# Install SALT requirements
salt_requirements = salt_dir / "requirements.txt"
if salt_requirements.exists():
print("π¦ Installing SALT requirements...")
subprocess.check_call([
sys.executable, "-m", "pip", "install", "-q", "-r", str(salt_requirements)
])
# Add SALT directory to Python path
salt_path = str(salt_dir.absolute())
if salt_path not in sys.path:
sys.path.insert(0, salt_path)
print(f"π Added {salt_path} to Python path")
# Test import
import salt.dataset
print("β
SALT library setup completed successfully")
return True
except Exception as e:
print(f"β Failed to setup SALT: {e}")
return False
# Setup SALT on startup
print("π Starting SALT Translation Leaderboard - Scientific Edition...")
if not setup_salt():
print("β Cannot continue without SALT library")
print("π‘ Please check that git is available and GitHub is accessible")
sys.exit(1)
import gradio as gr
import pandas as pd
import json
import traceback
from datetime import datetime
from typing import Optional, Dict, Tuple, List
# Import our enhanced modules
from src.test_set import (
get_public_test_set_scientific,
get_complete_test_set_scientific,
create_test_set_download_scientific,
validate_test_set_integrity_scientific,
get_track_test_set
)
from src.validation import validate_submission_scientific
from src.evaluation import (
evaluate_predictions_scientific,
generate_scientific_report,
compare_models_statistically
)
from src.leaderboard import (
load_scientific_leaderboard,
add_model_to_scientific_leaderboard,
get_scientific_leaderboard_stats,
get_track_leaderboard,
prepare_track_leaderboard_display,
perform_fair_comparison,
export_scientific_leaderboard
)
from src.plotting import (
create_scientific_leaderboard_plot,
create_language_pair_heatmap_scientific,
create_statistical_comparison_plot,
create_category_comparison_plot,
create_adequacy_analysis_plot,
create_cross_track_analysis_plot,
create_scientific_model_detail_plot
)
from src.utils import (
sanitize_model_name,
get_all_language_pairs,
get_google_comparable_pairs,
get_track_language_pairs,
format_metric_value
)
from config import *
# Global variables for caching
current_leaderboard = None
public_test_set = None
complete_test_set = None
test_set_stats = None
def initialize_scientific_data():
"""Initialize scientific test sets and leaderboard data."""
global public_test_set, complete_test_set, current_leaderboard, test_set_stats
try:
print("π¬ Initializing SALT Translation Leaderboard - Scientific Edition...")
# Load scientific test sets
print("π₯ Loading scientific test sets...")
public_test_set = get_public_test_set_scientific()
complete_test_set = get_complete_test_set_scientific()
# Load scientific leaderboard
print("π Loading scientific leaderboard...")
current_leaderboard = load_scientific_leaderboard()
# Validate test set integrity
print("π Validating test set integrity...")
test_set_stats = validate_test_set_integrity_scientific()
print(f"β
Scientific initialization complete!")
print(f" - Test set: {len(public_test_set):,} samples")
print(f" - Integrity score: {test_set_stats.get('integrity_score', 0):.2f}")
print(f" - Scientific adequacy: {test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')}")
print(f" - Current models: {len(current_leaderboard)}")
return True
except Exception as e:
print(f"β Scientific initialization failed: {e}")
traceback.print_exc()
return False
def download_scientific_test_set() -> Tuple[str, str]:
"""Create downloadable scientific test set and return file path and info."""
try:
global public_test_set
if public_test_set is None:
public_test_set = get_public_test_set_scientific()
# Create download file
download_path, stats = create_test_set_download_scientific()
# Create comprehensive info message
adequacy = stats.get('adequacy_assessment', 'unknown')
adequacy_emoji = {
'excellent': '🟢',
'good': '🟡',
'fair': '🟠',
'insufficient': '🔴',
'unknown': '⚪'
}.get(adequacy, '⚪')
info_msg = f"""
## π₯ SALT Scientific Test Set Downloaded Successfully!
### π¬ Scientific Edition Features:
- **Stratified Sampling**: Ensures representative coverage across domains
- **Statistical Weighting**: Samples weighted by track importance
- **Track Balancing**: Optimized for fair cross-track comparison
- **Adequacy Validation**: {adequacy_emoji} Overall adequacy: **{adequacy.title()}**
### π Dataset Statistics:
- **Total Samples**: {stats['total_samples']:,}
- **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
- **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples ({stats.get('google_comparable_rate', 0):.1%})
- **Domains**: {', '.join(stats.get('domains', ['general']))}
### π Track Breakdown:
"""
track_breakdown = stats.get('track_breakdown', {})
for track_name, track_info in track_breakdown.items():
status_emoji = '✅' if track_info.get('statistical_adequacy', False) else '⚠️'
info_msg += f"""
**{status_emoji} {track_info.get('name', track_name)}**:
- Samples: {track_info.get('total_samples', 0):,}
- Language Pairs: {track_info.get('language_pairs', 0)}
- Min Required/Pair: {track_info.get('min_samples_per_pair', 0)}
- Statistical Adequacy: {'Yes' if track_info.get('statistical_adequacy', False) else 'No'}
"""
info_msg += f"""
### π Enhanced File Format:
- `sample_id`: Unique identifier for each sample
- `source_text`: Text to be translated
- `source_language`: Source language code
- `target_language`: Target language code
- `domain`: Content domain (if available)
- `google_comparable`: Whether this pair can be compared with Google Translate
- `tracks_included`: Comma-separated list of tracks that include this sample
- `statistical_weight`: Statistical importance weight (1.0-5.0)
### π¬ Next Steps for Scientific Evaluation:
1. **Run your model** on the source texts to generate translations
2. **Create a predictions file** with columns: `sample_id`, `prediction`
3. **Optional**: Add `category` column to help with model classification
4. **Submit** your predictions using the appropriate track tab
5. **Analyze** results with statistical confidence intervals
### π‘ Tips for Best Results:
- Ensure coverage of all language pairs for chosen track
- Include confidence scores if available
- Provide detailed model description for proper categorization
- Consider submitting to multiple tracks for comprehensive evaluation
"""
return download_path, info_msg
except Exception as e:
error_msg = f"β Error creating scientific test set download: {str(e)}"
return None, error_msg
def validate_scientific_submission(
file, model_name: str, author: str, description: str
) -> Tuple[str, Optional[pd.DataFrame], str]:
"""Validate uploaded prediction file with scientific rigor."""
try:
if file is None:
return "β Please upload a predictions file", None, "community"
if not model_name.strip():
return "β Please provide a model name", None, "community"
# Handle different file input types
if isinstance(file, bytes):
file_content = file
elif isinstance(file, str):
if os.path.exists(file):
with open(file, "rb") as f:
file_content = f.read()
else:
file_content = file.encode("utf-8")
elif hasattr(file, "name") and os.path.exists(file.name):
with open(file.name, "rb") as f:
file_content = f.read()
else:
return "β Could not read uploaded file", None, "community"
# Determine filename
filename = (
getattr(file, "name", None)
or getattr(file, "filename", None)
or "predictions.csv"
)
# Load test set if needed
global complete_test_set
if complete_test_set is None:
complete_test_set = get_complete_test_set_scientific()
# Run enhanced scientific validation
validation_result = validate_submission_scientific(
file_content, filename, complete_test_set, model_name, author, description
)
detected_category = validation_result.get("category", "community")
# Return predictions if evaluation is possible (even with limitations)
if validation_result.get("can_evaluate", False):
return validation_result["report"], validation_result["predictions"], detected_category
else:
return validation_result["report"], None, detected_category
except Exception as e:
return (
f"β Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}",
None,
"community"
)
def evaluate_scientific_submission(
predictions_df: pd.DataFrame,
model_name: str,
author: str,
description: str,
detected_category: str,
validation_info: Dict,
) -> Tuple[str, pd.DataFrame, object, object]:
"""Evaluate validated predictions using scientific methodology."""
try:
if predictions_df is None:
return "β No valid predictions to evaluate", None, None, None
# Get complete test set with targets
global complete_test_set, current_leaderboard
if complete_test_set is None:
complete_test_set = get_complete_test_set_scientific()
# Run scientific evaluation across all tracks
print(f"π¬ Starting scientific evaluation for {model_name}...")
evaluation_results = evaluate_predictions_scientific(
predictions_df, complete_test_set, detected_category
)
if any(track_data.get('error') for track_data in evaluation_results.get('tracks', {}).values()):
errors = [track_data['error'] for track_data in evaluation_results['tracks'].values() if track_data.get('error')]
return f"β Evaluation errors: {'; '.join(errors)}", None, None, None
# Add to scientific leaderboard
print("π Adding to scientific leaderboard...")
updated_leaderboard = add_model_to_scientific_leaderboard(
model_name=sanitize_model_name(model_name),
author=author or "Anonymous",
evaluation_results=evaluation_results,
model_category=detected_category,
description=description or ""
)
# Update global leaderboard
current_leaderboard = updated_leaderboard
# Generate scientific report
report = generate_scientific_report(evaluation_results, model_name)
# Create visualizations
summary_plot = create_adequacy_analysis_plot(updated_leaderboard)
cross_track_plot = create_cross_track_analysis_plot(updated_leaderboard)
# Prepare display leaderboard (Google-comparable track by default)
google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
display_leaderboard = prepare_track_leaderboard_display(google_leaderboard, "google_comparable")
# Format success message with track-specific results
success_msg = f"""
## π Scientific Evaluation Complete!
### π Model Information:
- **Model**: {model_name}
- **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
- **Author**: {author or 'Anonymous'}
### π Track Performance Summary:
"""
tracks = evaluation_results.get('tracks', {})
for track_name, track_data in tracks.items():
if not track_data.get('error'):
track_config = EVALUATION_TRACKS[track_name]
track_averages = track_data.get('track_averages', {})
summary = track_data.get('summary', {})
# Get rank in this track
track_leaderboard = get_track_leaderboard(updated_leaderboard, track_name)
if not track_leaderboard.empty:
model_row = track_leaderboard[track_leaderboard['model_name'] == sanitize_model_name(model_name)]
rank = model_row.index[0] + 1 if not model_row.empty else "N/A"
total_models = len(track_leaderboard)
else:
rank = "N/A"
total_models = 0
quality_score = track_averages.get('quality_score', 0)
bleu_score = track_averages.get('bleu', 0)
samples = summary.get('total_samples', 0)
success_msg += f"""
**π {track_config['name']}**:
- Rank: #{rank} out of {total_models} models
- Quality Score: {quality_score:.4f}
- BLEU: {bleu_score:.2f}
- Samples: {samples:,}
"""
success_msg += f"""
### π¬ Scientific Adequacy:
- **Cross-Track Consistency**: Available in detailed analysis
- **Statistical Confidence**: 95% confidence intervals computed
- **Sample Adequacy**: {validation_info.get('adequacy', {}).get('overall_adequate', 'Unknown')}
{report}
"""
return success_msg, display_leaderboard, summary_plot, cross_track_plot
except Exception as e:
error_msg = f"β Scientific evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
return error_msg, None, None, None
def refresh_track_leaderboard(
track: str,
search_query: str = "",
category_filter: str = "all",
min_adequacy: float = 0.0,
show_ci: bool = True
) -> Tuple[pd.DataFrame, object, object, str]:
"""Refresh leaderboard for a specific track with filters."""
try:
global current_leaderboard
if current_leaderboard is None:
current_leaderboard = load_scientific_leaderboard()
# Get track-specific leaderboard
track_leaderboard = get_track_leaderboard(
current_leaderboard, track, category_filter=category_filter, min_adequacy=min_adequacy
)
# Apply search filter
if search_query:
query_lower = search_query.lower()
mask = (
track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
)
track_leaderboard = track_leaderboard[mask]
# Prepare for display
display_df = prepare_track_leaderboard_display(track_leaderboard, track)
# Create plots
ranking_plot = create_scientific_leaderboard_plot(track_leaderboard, track)
comparison_plot = create_statistical_comparison_plot(track_leaderboard, track)
# Get track statistics
track_stats = get_scientific_leaderboard_stats(track_leaderboard, track)
track_config = EVALUATION_TRACKS[track]
stats_text = f"""
### π {track_config['name']} Statistics
- **Total Models**: {track_stats.get('total_models', 0)}
- **Models by Category**: {', '.join([f"{k}: {v}" for k, v in track_stats.get('models_by_category', {}).items()])}
- **Average Quality Score**: {track_stats.get('track_statistics', {}).get(track, {}).get('avg_quality', 0.0):.4f}
**Best Model**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('name', 'None')}
**Best Score**: {track_stats.get('best_models_by_track', {}).get(track, {}).get('quality', 0.0):.4f}
### π¬ Scientific Notes:
- All metrics include 95% confidence intervals
- Statistical adequacy verified for reliable comparisons
- {track_config['description']}
"""
return display_df, ranking_plot, comparison_plot, stats_text
except Exception as e:
error_msg = f"Error loading {track} leaderboard: {str(e)}"
empty_df = pd.DataFrame()
return empty_df, None, None, error_msg
def get_scientific_model_details(model_name: str, track: str) -> Tuple[str, object, object]:
"""Get detailed scientific analysis for a specific model."""
try:
global current_leaderboard
if current_leaderboard is None:
return "Leaderboard not loaded", None, None
# Find model
model_row = current_leaderboard[current_leaderboard['model_name'] == model_name]
if model_row.empty:
return f"Model '{model_name}' not found", None, None
model_info = model_row.iloc[0]
# Parse detailed metrics for the requested track
try:
detailed_results = json.loads(model_info[f'detailed_{track}'])
except Exception:
detailed_results = {}
# Create detailed plots
detail_plot = create_scientific_model_detail_plot(detailed_results, model_name, track)
# Create language pair heatmap
heatmap_plot = create_language_pair_heatmap_scientific(detailed_results, track)
# Format model details with scientific information
track_config = EVALUATION_TRACKS[track]
category_info = MODEL_CATEGORIES.get(model_info['model_category'], {})
# Extract track-specific metrics
quality_col = f"{track}_quality"
bleu_col = f"{track}_bleu"
chrf_col = f"{track}_chrf"
ci_lower_col = f"{track}_ci_lower"
ci_upper_col = f"{track}_ci_upper"
samples_col = f"{track}_samples"
pairs_col = f"{track}_pairs"
adequate_col = f"{track}_adequate"
details_text = f"""
## π¬ Scientific Model Analysis: {model_name}
### π Basic Information:
- **Author**: {model_info['author']}
- **Category**: {category_info.get('name', 'Unknown')} - {category_info.get('description', '')}
- **Submission Date**: {model_info['submission_date'][:10]}
- **Description**: {model_info['description'] or 'No description provided'}
### π {track_config['name']} Performance:
- **Quality Score**: {format_metric_value(model_info.get(quality_col, 0), 'quality_score', True, model_info.get(ci_lower_col, 0), model_info.get(ci_upper_col, 0))}
- **BLEU**: {format_metric_value(model_info.get(bleu_col, 0), 'bleu')}
- **ChrF**: {format_metric_value(model_info.get(chrf_col, 0), 'chrf')}
### π Coverage Information:
- **Total Samples**: {model_info.get(samples_col, 0):,}
- **Language Pairs Covered**: {model_info.get(pairs_col, 0)}
- **Statistical Adequacy**: {'✅ Yes' if model_info.get(adequate_col, False) else '❌ No'}
### π¬ Statistical Metadata:
- **Confidence Level**: {STATISTICAL_CONFIG['confidence_level']:.0%}
- **Bootstrap Samples**: {STATISTICAL_CONFIG['bootstrap_samples']:,}
- **Scientific Adequacy Score**: {model_info.get('scientific_adequacy_score', 0.0):.3f}
### π Cross-Track Performance:
"""
# Add other track performances for comparison
for other_track in EVALUATION_TRACKS.keys():
if other_track != track:
other_quality_col = f"{other_track}_quality"
other_adequate_col = f"{other_track}_adequate"
if model_info.get(other_adequate_col, False):
other_quality = model_info.get(other_quality_col, 0)
details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: {other_quality:.4f}\n"
else:
details_text += f"- **{EVALUATION_TRACKS[other_track]['name']}**: Not evaluated\n"
details_text += f"""
### π‘ Scientific Interpretation:
- Performance metrics include 95% confidence intervals for reliability
- Statistical adequacy ensures meaningful comparisons with other models
- Cross-track analysis reveals model strengths across different language sets
- Category classification helps contextualize performance expectations
"""
return details_text, detail_plot, heatmap_plot
except Exception as e:
error_msg = f"Error getting model details: {str(e)}"
return error_msg, None, None
def perform_model_comparison(
model_names: List[str], track: str, comparison_type: str = "statistical"
) -> Tuple[str, object]:
"""Perform scientific comparison between selected models."""
try:
global current_leaderboard
if current_leaderboard is None:
return "Leaderboard not loaded", None
if len(model_names) < 2:
return "Please select at least 2 models for comparison", None
# Get models
models = current_leaderboard[current_leaderboard['model_name'].isin(model_names)]
if len(models) < 2:
return "Selected models not found in leaderboard", None
# Perform fair comparison
comparison_result = perform_fair_comparison(current_leaderboard, model_names)
if comparison_result.get('error'):
return f"Comparison error: {comparison_result['error']}", None
# Create comparison visualization
if comparison_type == "statistical":
comparison_plot = create_statistical_comparison_plot(models, track)
else:
comparison_plot = create_category_comparison_plot(models, track)
# Format comparison report
track_config = EVALUATION_TRACKS[track]
comparison_text = f"""
## π¬ Scientific Model Comparison - {track_config['name']}
### π Models Compared:
"""
quality_col = f"{track}_quality"
ci_lower_col = f"{track}_ci_lower"
ci_upper_col = f"{track}_ci_upper"
# Sort models by performance
models_sorted = models.sort_values(quality_col, ascending=False)
for i, (_, model) in enumerate(models_sorted.iterrows(), 1):
category_info = MODEL_CATEGORIES.get(model['model_category'], {})
comparison_text += f"""
**#{i}. {model['model_name']}**
- Category: {category_info.get('name', 'Unknown')}
- Quality Score: {format_metric_value(model[quality_col], 'quality_score', True, model[ci_lower_col], model[ci_upper_col])}
- Author: {model['author']}
"""
# Add statistical analysis
track_comparison = comparison_result.get('track_comparisons', {}).get(track, {})
if track_comparison:
comparison_text += f"""
### π¬ Statistical Analysis:
- **Models with adequate data**: {track_comparison.get('participating_models', 0)}
- **Confidence intervals available**: Yes (95% level)
- **Fair comparison possible**: {'✅ Yes' if comparison_result.get('fair_comparison_possible', False) else '⚠️ Limited'}
"""
# Check for statistical significance (simplified)
quality_scores = list(track_comparison.get('quality_scores', {}).values())
if len(quality_scores) >= 2:
score_range = max(quality_scores) - min(quality_scores)
if score_range > 0.05: # 5% difference threshold
comparison_text += "- **Performance differences**: Potentially significant\n"
else:
comparison_text += "- **Performance differences**: Minimal\n"
# Add recommendations
recommendations = comparison_result.get('recommendations', [])
if recommendations:
comparison_text += "\n### π‘ Recommendations:\n"
for rec in recommendations:
comparison_text += f"- {rec}\n"
return comparison_text, comparison_plot
except Exception as e:
error_msg = f"Error performing comparison: {str(e)}"
return error_msg, None
# Initialize data on startup
print("π Starting SALT Translation Leaderboard - Scientific Edition...")
initialization_success = initialize_scientific_data()
# Create Gradio interface with scientific design
with gr.Blocks(
title=UI_CONFIG["title"],
theme=gr.themes.Soft(),
css="""
.gradio-container {
max-width: 1600px !important;
margin: 0 auto;
}
.scientific-header {
text-align: center;
margin-bottom: 2rem;
padding: 2rem;
background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%);
color: white;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.track-tab {
border-radius: 8px;
margin: 0.5rem;
padding: 1rem;
border: 2px solid transparent;
}
.track-tab.google-comparable {
border-color: #1f77b4;
background: linear-gradient(45deg, #f0f9ff, #e0f2fe);
}
.track-tab.ug40-complete {
border-color: #ff7f0e;
background: linear-gradient(45deg, #fff7ed, #fed7aa);
}
.track-tab.language-pair-matrix {
border-color: #2ca02c;
background: linear-gradient(45deg, #f0fdf4, #dcfce7);
}
.metric-box {
background: #f8fafc;
padding: 1rem;
border-radius: 8px;
margin: 0.5rem 0;
border-left: 4px solid #3b82f6;
}
.scientific-note {
background: #fef3c7;
border: 1px solid #f59e0b;
border-radius: 8px;
padding: 1rem;
margin: 1rem 0;
}
.adequacy-excellent { border-left-color: #22c55e; }
.adequacy-good { border-left-color: #eab308; }
.adequacy-fair { border-left-color: #f97316; }
.adequacy-insufficient { border-left-color: #ef4444; }
"""
) as demo:
# Scientific Header
gr.HTML(f"""
<div class="scientific-header">
<h1>π SALT Translation Leaderboard - Scientific Edition</h1>
<p><strong>Rigorous Evaluation with Statistical Significance Testing</strong></p>
<p>Three-tier evaluation tracks β’ 95% Confidence intervals β’ Research-grade analysis</p>
<p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
</div>
""")
# Status indicator
if initialization_success:
status_msg = "β
Scientific system initialized successfully"
adequacy_info = test_set_stats.get('scientific_adequacy', {}).get('overall_adequacy', 'unknown')
status_msg += f" | Test set adequacy: {adequacy_info.title()}"
else:
status_msg = "β System initialization failed - some features may not work"
gr.Markdown(f"**System Status**: {status_msg}")
# Add scientific overview
gr.Markdown("""
## π¬ Scientific Evaluation Framework
This leaderboard implements rigorous scientific methodology for translation model evaluation:
- **Three Evaluation Tracks**: Fair comparison across different model capabilities
- **Statistical Significance**: 95% confidence intervals and effect size analysis
- **Category-Based Analysis**: Commercial, Research, Baseline, and Community models
- **Cross-Track Consistency**: Validate model performance across language sets
""")
with gr.Tabs():
# Tab 1: Download Test Set
with gr.Tab("π₯ Download Test Set", id="download"):
gr.Markdown("""
## π Get the SALT Scientific Test Set
Download our scientifically designed test set with stratified sampling and statistical weighting.
""")
with gr.Row():
download_btn = gr.Button("π₯ Download Scientific Test Set", variant="primary", size="lg")
with gr.Row():
with gr.Column():
download_file = gr.File(label="π Test Set File", interactive=False)
with gr.Column():
download_info = gr.Markdown(label="βΉοΈ Test Set Information")
# Tab 2: Submit Predictions
with gr.Tab("π Submit Predictions", id="submit"):
gr.Markdown("""
## π― Submit Your Model's Predictions for Scientific Evaluation
Upload predictions for comprehensive evaluation across all three tracks with statistical analysis.
""")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### π Model Information")
model_name_input = gr.Textbox(
label="π€ Model Name",
placeholder="e.g., MyTranslator-v2.0",
info="Unique name for your model"
)
author_input = gr.Textbox(
label="π€ Author/Organization",
placeholder="Your name or organization",
value="Anonymous"
)
description_input = gr.Textbox(
label="π Model Description",
placeholder="Architecture, training data, special features...",
lines=4,
info="Detailed description helps with proper categorization"
)
gr.Markdown("### π€ Upload Predictions")
predictions_file = gr.File(
label="π Predictions File",
file_types=[".csv", ".tsv", ".json"]
)
validate_btn = gr.Button("β
Validate Submission", variant="secondary")
submit_btn = gr.Button("π Submit for Scientific Evaluation", variant="primary", interactive=False)
with gr.Column(scale=1):
gr.Markdown("### π Validation Results")
validation_output = gr.Markdown()
# Results section
gr.Markdown("### π Scientific Evaluation Results")
with gr.Row():
evaluation_output = gr.Markdown()
with gr.Row():
with gr.Column():
submission_plot = gr.Plot(label="π Submission Analysis")
with gr.Column():
cross_track_plot = gr.Plot(label="π Cross-Track Analysis")
with gr.Row():
results_table = gr.Dataframe(label="π Updated Leaderboard (Google-Comparable Track)", interactive=False)
# Tab 3: Google-Comparable Track
with gr.Tab("π€ Google-Comparable Track", id="google_track", elem_classes=["track-tab", "google-comparable"]):
gr.Markdown(f"""
## {UI_CONFIG['tracks']['google_comparable']['tab_name']}
**Fair comparison with commercial translation systems**
This track evaluates models on the {len(get_google_comparable_pairs())} language pairs supported by Google Translate,
enabling direct comparison with commercial baselines.
- **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
- **Purpose**: Commercial system comparison and baseline establishment
- **Statistical Power**: High (optimized sample sizes)
""")
with gr.Row():
with gr.Column(scale=2):
google_search = gr.Textbox(label="π Search Models", placeholder="Search by model name, author...")
with gr.Column(scale=1):
google_category = gr.Dropdown(
label="π·οΈ Category Filter",
choices=["all"] + list(MODEL_CATEGORIES.keys()),
value="all"
)
with gr.Column(scale=1):
google_adequacy = gr.Slider(
label="π Min Adequacy",
minimum=0.0, maximum=1.0, value=0.0, step=0.1
)
with gr.Column(scale=1):
google_refresh = gr.Button("π Refresh", variant="secondary")
with gr.Row():
google_stats = gr.Markdown()
with gr.Row():
with gr.Column():
google_ranking_plot = gr.Plot(label="π Google-Comparable Rankings")
with gr.Column():
google_comparison_plot = gr.Plot(label="π Statistical Comparison")
with gr.Row():
google_leaderboard = gr.Dataframe(label="π Google-Comparable Leaderboard", interactive=False)
# Tab 4: UG40-Complete Track
with gr.Tab("π UG40-Complete Track", id="ug40_track", elem_classes=["track-tab", "ug40-complete"]):
gr.Markdown(f"""
## {UI_CONFIG['tracks']['ug40_complete']['tab_name']}
**Comprehensive evaluation across all Ugandan languages**
This track evaluates models on all {len(get_all_language_pairs())} UG40 language pairs,
providing the most comprehensive assessment of Ugandan language translation capabilities.
- **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
- **Purpose**: Comprehensive Ugandan language capability assessment
- **Coverage**: Complete linguistic landscape of Uganda
""")
with gr.Row():
with gr.Column(scale=2):
ug40_search = gr.Textbox(label="π Search Models", placeholder="Search by model name, author...")
with gr.Column(scale=1):
ug40_category = gr.Dropdown(
label="π·οΈ Category Filter",
choices=["all"] + list(MODEL_CATEGORIES.keys()),
value="all"
)
with gr.Column(scale=1):
ug40_adequacy = gr.Slider(
label="π Min Adequacy",
minimum=0.0, maximum=1.0, value=0.0, step=0.1
)
with gr.Column(scale=1):
ug40_refresh = gr.Button("π Refresh", variant="secondary")
with gr.Row():
ug40_stats = gr.Markdown()
with gr.Row():
with gr.Column():
ug40_ranking_plot = gr.Plot(label="π UG40-Complete Rankings")
with gr.Column():
ug40_comparison_plot = gr.Plot(label="π Statistical Comparison")
with gr.Row():
ug40_leaderboard = gr.Dataframe(label="π UG40-Complete Leaderboard", interactive=False)
# Tab 5: Language-Pair Matrix
with gr.Tab("π Language-Pair Matrix", id="matrix_track", elem_classes=["track-tab", "language-pair-matrix"]):
gr.Markdown(f"""
## {UI_CONFIG['tracks']['language_pair_matrix']['tab_name']}
**Detailed language pair analysis with statistical significance**
This view provides granular analysis of model performance across individual language pairs
with statistical significance testing and effect size analysis.
- **Resolution**: Individual language pair performance
- **Purpose**: Detailed linguistic analysis and model diagnostics
- **Statistics**: Pairwise significance testing available
""")
with gr.Row():
with gr.Column(scale=2):
matrix_search = gr.Textbox(label="π Search Models", placeholder="Search by model name, author...")
with gr.Column(scale=1):
matrix_category = gr.Dropdown(
label="π·οΈ Category Filter",
choices=["all"] + list(MODEL_CATEGORIES.keys()),
value="all"
)
with gr.Column(scale=1):
matrix_adequacy = gr.Slider(
label="π Min Adequacy",
minimum=0.0, maximum=1.0, value=0.0, step=0.1
)
with gr.Column(scale=1):
matrix_refresh = gr.Button("π Refresh", variant="secondary")
with gr.Row():
matrix_stats = gr.Markdown()
with gr.Row():
with gr.Column():
matrix_ranking_plot = gr.Plot(label="π Language-Pair Matrix Rankings")
with gr.Column():
matrix_comparison_plot = gr.Plot(label="π Statistical Comparison")
with gr.Row():
matrix_leaderboard = gr.Dataframe(label="π Language-Pair Matrix Leaderboard", interactive=False)
# Tab 6: Model Analysis
with gr.Tab("π Scientific Model Analysis", id="analysis"):
gr.Markdown("""
## π¬ Detailed Scientific Model Analysis
Comprehensive analysis of individual models with statistical confidence intervals,
cross-track performance, and detailed language pair breakdowns.
""")
with gr.Row():
with gr.Column(scale=2):
model_select = gr.Dropdown(
label="π€ Select Model",
choices=[],
value=None,
info="Choose a model for detailed scientific analysis"
)
with gr.Column(scale=1):
track_select = gr.Dropdown(
label="π Analysis Track",
choices=list(EVALUATION_TRACKS.keys()),
value="google_comparable",
info="Track for detailed analysis"
)
with gr.Column(scale=1):
analyze_btn = gr.Button("π Analyze", variant="primary")
with gr.Row():
model_details = gr.Markdown()
with gr.Row():
with gr.Column():
model_analysis_plot = gr.Plot(label="π Detailed Performance Analysis")
with gr.Column():
model_heatmap_plot = gr.Plot(label="πΊοΈ Language Pair Heatmap")
# Tab 7: Model Comparison
with gr.Tab("βοΈ Scientific Model Comparison", id="comparison"):
gr.Markdown("""
## π¬ Scientific Model Comparison
Compare multiple models with statistical significance testing and fair comparison analysis.
Only models evaluated on the same language pairs are compared for scientific validity.
""")
with gr.Row():
with gr.Column(scale=2):
comparison_models = gr.CheckboxGroup(
label="π€ Select Models to Compare",
choices=[],
value=[],
info="Select 2-6 models for comparison"
)
with gr.Column(scale=1):
comparison_track = gr.Dropdown(
label="π Comparison Track",
choices=list(EVALUATION_TRACKS.keys()),
value="google_comparable"
)
comparison_type = gr.Radio(
label="π Comparison Type",
choices=["statistical", "category"],
value="statistical"
)
compare_btn = gr.Button("βοΈ Compare Models", variant="primary")
with gr.Row():
comparison_output = gr.Markdown()
with gr.Row():
comparison_plot = gr.Plot(label="π Model Comparison Analysis")
# Tab 8: Documentation
with gr.Tab("π Scientific Documentation", id="docs"):
gr.Markdown(f"""
# π SALT Translation Leaderboard - Scientific Edition Documentation
## π― Overview
The SALT Translation Leaderboard Scientific Edition implements rigorous evaluation methodology
for translation models on Ugandan languages, designed for research publication and scientific analysis.
## π¬ Scientific Methodology
### Three-Tier Evaluation System
**1. π€ Google-Comparable Track**
- **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
- **Pairs**: {len(get_google_comparable_pairs())} language pairs
- **Purpose**: Fair comparison with commercial translation systems
- **Statistical Power**: High (β₯200 samples per pair recommended)
**2. π UG40-Complete Track**
- **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
- **Pairs**: {len(get_all_language_pairs())} language pairs
- **Purpose**: Comprehensive Ugandan language capability assessment
- **Statistical Power**: Moderate (β₯100 samples per pair recommended)
**3. π Language-Pair Matrix**
- **Resolution**: Individual language pair analysis
- **Purpose**: Detailed linguistic analysis and model diagnostics
- **Statistics**: Pairwise significance testing with multiple comparison correction
### Statistical Rigor
- **Confidence Intervals**: 95% confidence intervals using bootstrap sampling ({STATISTICAL_CONFIG['bootstrap_samples']:,} resamples)
- **Significance Testing**: Two-tailed t-tests with {STATISTICAL_CONFIG['multiple_testing_correction'].title()} correction
- **Effect Size**: Cohen's d with interpretation (small: {STATISTICAL_CONFIG['effect_size_thresholds']['small']}, medium: {STATISTICAL_CONFIG['effect_size_thresholds']['medium']}, large: {STATISTICAL_CONFIG['effect_size_thresholds']['large']})
- **Statistical Power**: Estimated based on sample sizes and effect sizes
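For illustration, a bootstrap confidence interval of the kind described above can be sketched as follows. This is a minimal example assuming per-sample scores are available as a NumPy array; the leaderboard's internal implementation may differ.
```python
import numpy as np

def bootstrap_ci(scores, n_resamples=1000, confidence=0.95, seed=0):
    # Resample the per-sample scores with replacement and collect the means.
    rng = np.random.default_rng(seed)
    means = [rng.choice(scores, size=len(scores), replace=True).mean()
             for _ in range(n_resamples)]
    # The interval bounds are percentiles of the resampled means.
    lower = np.percentile(means, (1 - confidence) / 2 * 100)
    upper = np.percentile(means, (1 + confidence) / 2 * 100)
    return float(lower), float(upper)
```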
### Model Categories
Models are automatically categorized for fair comparison:
- **π’ Commercial**: Production translation systems (Google Translate, Azure, etc.)
- **π¬ Research**: Academic and research institution models (NLLB, M2M-100, etc.)
- **π Baseline**: Simple baseline and reference models
- **π₯ Community**: User-submitted models and fine-tuned variants
## π Evaluation Metrics
### Primary Metrics
- **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, error rates, and ROUGE
- **BLEU**: Bilingual Evaluation Understudy (0-100)
- **ChrF**: Character-level F-score (0-1)
### Secondary Metrics
- **ROUGE-1/ROUGE-L**: Recall-oriented metrics for content overlap
- **CER/WER**: Character/Word Error Rate (lower is better)
- **Length Ratio**: Prediction/reference length ratio
All metrics include 95% confidence intervals for statistical reliability.
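As a rough sketch, and not necessarily the leaderboard's exact pipeline, corpus-level BLEU and ChrF can be computed with the `sacrebleu` package (assumed to be installed); the sentences below are placeholders.
```python
import sacrebleu

# Hypotheses are your model outputs; references is a list of reference streams,
# one inner list aligned with the hypotheses.
hypotheses = ["Webale nnyo", "Oli otya"]
references = [["Webale nnyo", "Oli otya?"]]

bleu = sacrebleu.corpus_bleu(hypotheses, references)   # reported on a 0-100 scale
chrf = sacrebleu.corpus_chrf(hypotheses, references)   # sacrebleu reports 0-100; divide by 100 for the 0-1 convention above
print(bleu.score, chrf.score)
```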
## π Submission Process
### Step 1: Download Scientific Test Set
1. Click "Download Scientific Test Set" in the first tab
2. Review test set adequacy and track breakdown
3. Save the enhanced test set with statistical weights
### Step 2: Generate Predictions
1. Load the test set in your evaluation pipeline
2. For each row, translate `source_text` from `source_language` to `target_language`
3. Save results as CSV with columns: `sample_id`, `prediction`
4. Optional: Add `category` column for automatic classification
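A minimal sketch of this loop is shown below, assuming the downloaded file is saved as `salt_test_set.csv` and that `translate()` is your own model's inference function (both names are placeholders):
```python
import pandas as pd

test_set = pd.read_csv("salt_test_set.csv")

rows = []
for _, row in test_set.iterrows():
    # translate() is a placeholder for your own inference function.
    output = translate(row["source_text"], row["source_language"], row["target_language"])
    rows.append((row["sample_id"], output))

pd.DataFrame(rows, columns=["sample_id", "prediction"]).to_csv("predictions.csv", index=False)
```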
### Step 3: Submit & Evaluate
1. Fill in detailed model information (improves categorization)
2. Upload your predictions file
3. Review validation report with track-specific adequacy assessment
4. Submit for scientific evaluation across all tracks
## π Enhanced File Formats
### Scientific Test Set Format
```csv
sample_id,source_text,source_language,target_language,domain,google_comparable,tracks_included,statistical_weight
salt_000001,"Hello world",eng,lug,general,true,"google_comparable,ug40_complete",2.5
salt_000002,"How are you?",eng,ach,conversation,true,"google_comparable,ug40_complete",2.5
salt_000003,"Good morning",lgg,teo,greetings,false,"ug40_complete,language_pair_matrix",1.0
```
### Predictions Format
```csv
sample_id,prediction,category
salt_000001,"Amakuru ensi","community"
salt_000002,"Ibino nining?","community"
salt_000003,"Ejok nanu","community"
```
## π Scientific Leaderboard Features
### Fair Comparison
- Models only compared within the same category and track
- Statistical significance testing prevents misleading rankings
- Confidence intervals show measurement uncertainty
### Cross-Track Analysis
- Consistency analysis across evaluation tracks
- Identification of model strengths and weaknesses
- Language-specific performance patterns
### Publication Quality
- All visualizations include error bars and statistical annotations
- Comprehensive methodology documentation
- Reproducible evaluation pipeline
## π¬ Statistical Interpretation Guide
### Confidence Intervals
- **Non-overlapping CIs**: Likely significant difference
- **Overlapping CIs**: May or may not be significant (requires formal testing)
- **Wide CIs**: High uncertainty (need more data)
### Effect Sizes
- **Negligible (< {STATISTICAL_CONFIG['effect_size_thresholds']['small']})**: Practical equivalence
- **Small ({STATISTICAL_CONFIG['effect_size_thresholds']['small']}-{STATISTICAL_CONFIG['effect_size_thresholds']['medium']})**: Noticeable difference
- **Medium ({STATISTICAL_CONFIG['effect_size_thresholds']['medium']}-{STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Substantial difference
- **Large (> {STATISTICAL_CONFIG['effect_size_thresholds']['large']})**: Very large difference
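For reference, Cohen's d between two models can be sketched as follows (pooled-standard-deviation form, assuming per-sample quality scores for each model; shown for interpretation only, not as the leaderboard's exact implementation):
```python
import numpy as np

def cohens_d(scores_a, scores_b):
    a, b = np.asarray(scores_a, dtype=float), np.asarray(scores_b, dtype=float)
    # Pooled variance across the two samples.
    pooled_var = ((len(a) - 1) * a.var(ddof=1) + (len(b) - 1) * b.var(ddof=1)) / (len(a) + len(b) - 2)
    return (a.mean() - b.mean()) / np.sqrt(pooled_var)
```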
### Statistical Adequacy
- **Excellent**: High statistical power (>0.8) for all comparisons
- **Good**: Adequate power for most comparisons
- **Fair**: Limited power, interpret with caution
- **Insufficient**: Results not reliable for scientific conclusions
## π€ Contributing to Science
This leaderboard is designed for the research community. When using results:
1. **Always report confidence intervals** along with point estimates
2. **Acknowledge statistical adequacy** when interpreting results
3. **Use appropriate track** for your comparison (don't compare Google-track vs UG40-track results)
4. **Consider effect sizes** not just statistical significance
## π Citation
If you use this leaderboard in your research, please cite:
```bibtex
@misc{{salt_leaderboard_scientific_2024,
title={{SALT Translation Leaderboard: Scientific Edition - Rigorous Evaluation of Translation Models on Ugandan Languages}},
author={{Sunbird AI}},
year={{2024}},
url={{https://huggingface.co/spaces/Sunbird/salt-translation-leaderboard-scientific}},
note={{Three-tier evaluation system with statistical significance testing}}
}}
```
## π Related Resources
- **SALT Dataset**: [sunbird/salt](https://huggingface.co/datasets/sunbird/salt)
- **Sunbird AI Research**: [sunbird.ai/research](https://sunbird.ai/research)
- **Statistical Methodology**: See our technical paper on rigorous MT evaluation
- **Open Source Code**: Available on GitHub for reproducibility
---
*For questions about scientific methodology or statistical interpretation, contact our research team at [email protected]*
""")
# Event handlers with enhanced scientific functionality
predictions_validated = gr.State(value=None)
validation_info_state = gr.State(value=None)
detected_category_state = gr.State(value="community")
# Download test set
download_btn.click(
fn=download_scientific_test_set,
outputs=[download_file, download_info]
)
# Validate predictions
def handle_scientific_validation(file, model_name, author, description):
report, predictions, category = validate_scientific_submission(file, model_name, author, description)
# Enable button if predictions are available (allows evaluation with limitations)
can_evaluate = predictions is not None
# Add user-friendly button status message to report
if can_evaluate:
if "π **Final Verdict**: Ready for scientific evaluation!" in report:
button_status = "\n\nβ
**Button Status**: Ready to submit for evaluation!"
elif "β οΈ **Final Verdict**: Can be evaluated with limitations" in report:
button_status = "\n\nβ οΈ **Button Status**: Can submit for evaluation (results will include limitations note)"
else:
button_status = "\n\nβ
**Button Status**: Evaluation possible"
else:
button_status = "\n\nβ **Button Status**: Please fix issues above before evaluation"
enhanced_report = report + button_status
return (
enhanced_report,
predictions,
{"category": category, "validation_passed": can_evaluate},
category,
gr.update(interactive=can_evaluate)
)
validate_btn.click(
fn=handle_scientific_validation,
inputs=[predictions_file, model_name_input, author_input, description_input],
outputs=[validation_output, predictions_validated, validation_info_state, detected_category_state, submit_btn]
)
# Submit for evaluation
def handle_scientific_submission(predictions, model_name, author, description, category, validation_info):
if predictions is None:
return "β Please validate your submission first", None, None, None
return evaluate_scientific_submission(
predictions, model_name, author, description, category, validation_info
)
submit_btn.click(
fn=handle_scientific_submission,
inputs=[predictions_validated, model_name_input, author_input, description_input, detected_category_state, validation_info_state],
outputs=[evaluation_output, results_table, submission_plot, cross_track_plot]
)
# Track leaderboard refresh functions
def refresh_google_track(*args):
return refresh_track_leaderboard("google_comparable", *args)
def refresh_ug40_track(*args):
return refresh_track_leaderboard("ug40_complete", *args)
def refresh_matrix_track(*args):
return refresh_track_leaderboard("language_pair_matrix", *args)
# Google-Comparable Track
google_refresh.click(
fn=refresh_google_track,
inputs=[google_search, google_category, google_adequacy],
outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
)
# UG40-Complete Track
ug40_refresh.click(
fn=refresh_ug40_track,
inputs=[ug40_search, ug40_category, ug40_adequacy],
outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
)
# Language-Pair Matrix Track
matrix_refresh.click(
fn=refresh_matrix_track,
inputs=[matrix_search, matrix_category, matrix_adequacy],
outputs=[matrix_leaderboard, matrix_ranking_plot, matrix_comparison_plot, matrix_stats]
)
# Model analysis
analyze_btn.click(
fn=get_scientific_model_details,
inputs=[model_select, track_select],
outputs=[model_details, model_analysis_plot, model_heatmap_plot]
)
# Model comparison
compare_btn.click(
fn=perform_model_comparison,
inputs=[comparison_models, comparison_track, comparison_type],
outputs=[comparison_output, comparison_plot]
)
# Load initial data and update dropdowns
def load_initial_data():
# Load initial Google track data
google_data = refresh_google_track("", "all", 0.0)
# Update dropdown choices
if current_leaderboard is not None and not current_leaderboard.empty:
model_choices = current_leaderboard['model_name'].tolist()
else:
model_choices = []
return (
google_data[0], # google_leaderboard
google_data[1], # google_ranking_plot
google_data[2], # google_comparison_plot
google_data[3], # google_stats
gr.Dropdown(choices=model_choices), # model_select
gr.CheckboxGroup(choices=model_choices) # comparison_models
)
demo.load(
fn=load_initial_data,
outputs=[
google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats,
model_select, comparison_models
]
)
# Launch the scientific application
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False,
show_error=True
)