import React from "react";
import {
Box,
Typography,
Paper,
Link,
Alert,
Divider,
} from "@mui/material";
import PageHeader from "../../components/shared/PageHeader";
/**
 * @typedef {Object} Benchmark
 * @property {string} name - Display name of the benchmark task.
 * @property {string} description - Human-readable summary of the task.
 * @property {string} evaluation - Short description of the scoring method.
 * @property {string} datasetUrl - Link to the dataset on Hugging Face.
 * @property {string} [paperUrl] - Optional link to an associated paper.
 * @property {string} [paperTitle] - Optional label for the paper link.
 */

/**
 * Benchmark tasks shown on the About page, in display order.
 * Frozen so this shared module-level constant cannot be mutated at runtime.
 * @type {ReadonlyArray<Benchmark>}
 */
const benchmarks = Object.freeze([
  {
    name: "WinoGrande-IS",
    description: "The Icelandic WinoGrande task is a human-translated and localized version of the ~1000 test set examples in the WinoGrande task in English. Each example consists of a sentence with a blank, and two answer choices for the blank. The task is to choose the correct answer choice using coreference resolution. The benchmark is designed to test the model's ability to use knowledge and common sense reasoning in Icelandic. For this benchmark, we use 3-shot evaluation.",
    evaluation: "3-shot, exact match",
    datasetUrl: "https://huggingface.co/datasets/mideind/icelandic-winogrande",
    paperUrl: "https://aclanthology.org/2022.lrec-1.464.pdf",
    paperTitle: "IceBERT paper"
  },
  {
    name: "GED",
    description: "This is a benchmark for binary sentence-level Icelandic grammatical error detection, adapted from the Icelandic Error Corpus (IEC) and contains 200 examples. Each example consists of a sentence that may contain one or more grammatical errors, and the task is to predict whether the sentence contains an error.",
    evaluation: "Exact match",
    datasetUrl: "https://huggingface.co/datasets/mideind/icelandic-sentences-gec"
  },
  {
    name: "Inflection",
    description: "The inflection benchmark tests models' ability to generate inflected forms of 300 Icelandic adjective-noun pairs for all four cases, singular and plural.",
    evaluation: "1-shot, exact match",
    datasetUrl: "https://huggingface.co/datasets/mideind/icelandic-inflection-all-flat"
  },
  {
    name: "Belebele (IS)",
    description: "This is the Icelandic subset (900 examples) of the Belebele benchmark, a multiple-choice reading comprehension task. The task is to answer questions about a given passage.",
    evaluation: "Exact match",
    datasetUrl: "https://huggingface.co/datasets/facebook/belebele"
  },
  {
    name: "ARC-Challenge-IS",
    description: "A machine-translated version of the ARC-Challenge multiple-choice question-answering dataset. For this benchmark, we use the test set which contains 1.23k examples.",
    evaluation: "Exact match",
    datasetUrl: "https://huggingface.co/datasets/mideind/icelandic-arc-challenge"
  },
  {
    name: "WikiQA-IS",
    description: "The Icelandic WikiQA dataset is a collection of 1.9k question-answer pairs from the Icelandic Wikipedia, meant to evaluate models' knowledge of Icelandic culture and history. They were collected by making GPT-4o generate questions and answers given Icelandic Wikipedia articles as context. All examples were then manually verified and corrected where necessary. For evaluation, we prompt GPT-4o to compare the generated answer to the original answer for semantic similarity and rate the answer on the following scale: (0, \"poor\"), (1, \"fair\"), (2, \"excellent\").",
    evaluation: "LLM judge scoring (GPT-4o)",
    datasetUrl: "https://huggingface.co/datasets/mideind/icelandic_wiki_qa"
  }
]);
const BenchmarkCard = ({ benchmark }) => {
return (
<Paper
elevation={0}
sx={{
p: 3,
border: "1px solid",
borderColor: "grey.200",
backgroundColor: "transparent",
borderRadius: 2,
mb: 3,
}}
>
<Typography variant="h6" sx={{ mb: 2, color: "primary.main" }}>
{benchmark.name}
</Typography>
<Typography variant="body1" sx={{ mb: 2, lineHeight: 1.6 }}>
{benchmark.description}
</Typography>
<Box sx={{ mb: 2 }}>
<Typography variant="body2" color="text.secondary" sx={{ mb: 1 }}>
<strong>Evaluation:</strong> {benchmark.evaluation}
</Typography>
</Box>
<Box sx={{ display: "flex", flexDirection: "column", gap: 1 }}>
<Link
href={benchmark.datasetUrl}
target="_blank"
rel="noopener noreferrer"
sx={{ fontSize: "0.875rem" }}
>
View dataset →
</Link>
{benchmark.paperUrl && (
<Link
href={benchmark.paperUrl}
target="_blank"
rel="noopener noreferrer"
sx={{ fontSize: "0.875rem" }}
>
{benchmark.paperTitle} →
</Link>
)}
</Box>
</Paper>
);
};
function AboutPage() {
return (
<Box sx={{ width: "100%", maxWidth: 1200, margin: "0 auto", py: 4, px: 0 }}>
<PageHeader
title="About the Icelandic LLM Leaderboard"
subtitle="Evaluating language models on Icelandic language tasks"
/>
<Box sx={{ mb: 6 }}>
<Typography variant="h5" sx={{ mb: 3 }}>
References
</Typography>
<Typography variant="body1" sx={{ mb: 4, lineHeight: 1.6 }}>
This leaderboard is an adapted version of the archived{" "}
<Link
href="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard"
target="_blank"
rel="noopener noreferrer"
>
Open LLM Leaderboard
</Link>
, and the underlying evaluation framework is powered by the{" "}
<Link
href="https://github.com/EleutherAI/lm-evaluation-harness"
target="_blank"
rel="noopener noreferrer"
>
LM Evaluation Harness
</Link>
.
</Typography>
</Box>
<Divider sx={{ my: 4 }} />
<Box sx={{ mb: 6 }}>
<Typography variant="h5" sx={{ mb: 3 }}>
New Submissions
</Typography>
<Typography variant="body1">
Do you want your model to be included on the leaderboard? Open a discussion on this repository
with the details of your model and we will get back to you.
</Typography>
</Box>
<Divider sx={{ my: 4 }} />
<Box sx={{ mb: 6 }}>
<Typography variant="h5" sx={{ mb: 3 }}>
Benchmark Tasks
</Typography>
<Typography variant="body1" sx={{ mb: 4, lineHeight: 1.6 }}>
The Icelandic LLM leaderboard evaluates models on several tasks. All of them are set up as generation tasks,
where the model's output is compared to the expected output. This means that models that have not been
instruction fine-tuned might perform poorly on these tasks.
</Typography>
<Typography variant="body1" sx={{ mb: 4, fontWeight: 600 }}>
The following tasks are evaluated:
</Typography>
<Box>
{benchmarks.map((benchmark, index) => (
<BenchmarkCard key={index} benchmark={benchmark} />
))}
</Box>
</Box>
</Box>
);
}
export default AboutPage;