import React from "react";
import {
Box,
Typography,
Paper,
Link,
Alert,
Divider,
} from "@mui/material";
import PageHeader from "../../components/shared/PageHeader";
/**
 * @typedef {Object} Benchmark
 * @property {string} name - Display name of the benchmark task.
 * @property {string} description - Human-readable summary of the task.
 * @property {string} evaluation - Short description of the scoring method.
 * @property {string} datasetUrl - Link to the dataset on Hugging Face.
 * @property {string} [paperUrl] - Optional link to an associated paper.
 * @property {string} [paperTitle] - Optional label for the paper link.
 */

/**
 * Benchmark tasks shown on the About page, in display order.
 * Frozen so this shared module-level constant cannot be mutated at runtime.
 * @type {ReadonlyArray<Benchmark>}
 */
const benchmarks = Object.freeze([
  {
    name: "WinoGrande-IS",
    description: "The Icelandic WinoGrande task is a human-translated and localized version of the ~1000 test set examples in the WinoGrande task in English. Each example consists of a sentence with a blank, and two answer choices for the blank. The task is to choose the correct answer choice using coreference resolution. The benchmark is designed to test the model's ability to use knowledge and common sense reasoning in Icelandic. For this benchmark, we use 3-shot evaluation.",
    evaluation: "3-shot, exact match",
    datasetUrl: "https://huggingface.co/datasets/mideind/icelandic-winogrande",
    paperUrl: "https://aclanthology.org/2022.lrec-1.464.pdf",
    paperTitle: "IceBERT paper"
  },
  {
    name: "GED",
    description: "This is a benchmark for binary sentence-level Icelandic grammatical error detection, adapted from the Icelandic Error Corpus (IEC) and contains 200 examples. Each example consists of a sentence that may contain one or more grammatical errors, and the task is to predict whether the sentence contains an error.",
    evaluation: "Exact match",
    datasetUrl: "https://huggingface.co/datasets/mideind/icelandic-sentences-gec"
  },
  {
    name: "Inflection",
    description: "The inflection benchmark tests models' ability to generate inflected forms of 300 Icelandic adjective-noun pairs for all four cases, singular and plural.",
    evaluation: "1-shot, exact match",
    datasetUrl: "https://huggingface.co/datasets/mideind/icelandic-inflection-all-flat"
  },
  {
    name: "Belebele (IS)",
    description: "This is the Icelandic subset (900 examples) of the Belebele benchmark, a multiple-choice reading comprehension task. The task is to answer questions about a given passage.",
    evaluation: "Exact match",
    datasetUrl: "https://huggingface.co/datasets/facebook/belebele"
  },
  {
    name: "ARC-Challenge-IS",
    description: "A machine-translated version of the ARC-Challenge multiple-choice question-answering dataset. For this benchmark, we use the test set which contains 1.23k examples.",
    evaluation: "Exact match",
    datasetUrl: "https://huggingface.co/datasets/mideind/icelandic-arc-challenge"
  },
  {
    name: "WikiQA-IS",
    description: "The Icelandic WikiQA dataset is a collection of 1.9k question-answer pairs from the Icelandic Wikipedia, meant to evaluate models' knowledge of Icelandic culture and history. They were collected by making GPT-4o generate questions and answers given Icelandic Wikipedia articles as context. All examples were then manually verified and corrected where necessary. For evaluation, we prompt GPT-4o to compare the generated answer to the original answer for semantic similarity and rate the answer on the following scale: (0, \"poor\"), (1, \"fair\"), (2, \"excellent\").",
    evaluation: "LLM judge scoring (GPT-4o)",
    datasetUrl: "https://huggingface.co/datasets/mideind/icelandic_wiki_qa"
  }
]);
const BenchmarkCard = ({ benchmark }) => {
return (
<Paper
elevation={0}
sx={{
p: 3,
border: "1px solid",
borderColor: "grey.200",
backgroundColor: "transparent",
borderRadius: 2,
mb: 3,
}}
>
<Typography variant="h6" sx={{ mb: 2, color: "primary.main" }}>
{benchmark.name}
</Typography>
<Typography variant="body1" sx={{ mb: 2, lineHeight: 1.6 }}>
{benchmark.description}
</Typography>
<Box sx={{ mb: 2 }}>
<Typography variant="body2" color="text.secondary" sx={{ mb: 1 }}>
<strong>Evaluation:</strong> {benchmark.evaluation}
</Typography>
</Box>
<Box sx={{ display: "flex", flexDirection: "column", gap: 1 }}>
<Link
href={benchmark.datasetUrl}
target="_blank"
rel="noopener noreferrer"
sx={{ fontSize: "0.875rem" }}
>
View dataset →
</Link>
{benchmark.paperUrl && (
<Link
href={benchmark.paperUrl}
target="_blank"
rel="noopener noreferrer"
sx={{ fontSize: "0.875rem" }}
>
{benchmark.paperTitle} →
</Link>
)}
</Box>
</Paper>
);
};
function AboutPage() {
return (
<Box sx={{ width: "100%", maxWidth: 1200, margin: "0 auto", py: 4, px: 0 }}>
<PageHeader
title="About the Icelandic LLM Leaderboard"
subtitle="Evaluating language models on Icelandic language tasks"
/>
<Box sx={{ mb: 6 }}>
<Typography variant="h5" sx={{ mb: 3 }}>
References
</Typography>
<Typography variant="body1" sx={{ mb: 4, lineHeight: 1.6 }}>
This leaderboard is an adapted version of the archived{" "}
<Link
href="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard"
target="_blank"
rel="noopener noreferrer"
>
Open LLM Leaderboard
</Link>
, and the underlying evaluation framework is powered by the{" "}
<Link
href="https://github.com/EleutherAI/lm-evaluation-harness"
target="_blank"
rel="noopener noreferrer"
>
LM Evaluation Harness
</Link>
.
</Typography>
</Box>
<Divider sx={{ my: 4 }} />
<Box sx={{ mb: 6 }}>
<Typography variant="h5" sx={{ mb: 3 }}>
New Submissions
</Typography>
<Typography variant="body1">
Do you want your model to be included on the leaderboard? Open a discussion on this repository
with the details of your model and we will get back to you.
</Typography>
</Box>
<Divider sx={{ my: 4 }} />
<Box sx={{ mb: 6 }}>
<Typography variant="h5" sx={{ mb: 3 }}>
Benchmark Tasks
</Typography>
<Typography variant="body1" sx={{ mb: 4, lineHeight: 1.6 }}>
The Icelandic LLM leaderboard evaluates models on several tasks. All of them are set up as generation tasks,
where the model's output is compared to the expected output. This means that models that have not been
instruction fine-tuned might perform poorly on these tasks.
</Typography>
<Typography variant="body1" sx={{ mb: 4, fontWeight: 600 }}>
The following tasks are evaluated:
</Typography>
<Box>
{benchmarks.map((benchmark, index) => (
<BenchmarkCard key={index} benchmark={benchmark} />
))}
</Box>
</Box>
</Box>
);
}
export default AboutPage;