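"""Streamlit app for evaluating Darija language-identification (LID) models
on the Darija-LID benchmark.

Supports three input modes: a locally uploaded fastText model, a fastText
model hosted on the HuggingFace Hub, or a CSV file of precomputed
predictions. Results can be submitted to a simple JSON-file leaderboard.
"""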
import os
import json
import streamlit as st
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import fasttext
from huggingface_hub import hf_hub_download

# Constants
LEADERBOARD_FILE = 'leaderboard.json'
TEST_SET = 'atlasia/Darija-LID-benchmark'
CACHE_DIR = os.path.join(os.path.dirname(__file__), 'cache')

HF_TOKEN = os.getenv('HF_TOKEN')  # read token for the private benchmark repo

def load_leaderboard():
    if os.path.exists(LEADERBOARD_FILE):
        with open(LEADERBOARD_FILE, 'r') as f:
            return json.load(f)
    return []

def save_leaderboard(leaderboard):
    with open(LEADERBOARD_FILE, 'w') as f:
        json.dump(leaderboard, f, indent=2)

def load_test_data() -> list[str]:
    """Download the benchmark file from the Hub and return its lines.

    Each line follows the fastText training format: '__label__<label> <text>'.
    """
    # Create cache directory if it doesn't exist
    os.makedirs(CACHE_DIR, exist_ok=True)

    path = hf_hub_download(
        repo_id='atlasia/Darija-LID-private',
        filename='benchmark.txt',
        cache_dir=CACHE_DIR,
        token=HF_TOKEN,
        repo_type='dataset')

    with open(path, "r") as f:
        lines = f.readlines()

    return [line.rstrip('\n') for line in lines]
    
    
def evaluate_predictions(y_true: list[str], y_pred: list[str]) -> dict:
    """Compute accuracy and weighted-average precision, recall, and F1."""
    accuracy = accuracy_score(y_true, y_pred)
    # Note: sklearn ignores pos_label whenever average != 'binary', so it is
    # not passed here; metrics are weighted over both classes ('ary', 'other').
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted")
    return {
        'accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1': float(f1)
    }

def predict_with_fasttext(model, texts: list[str]) -> list[str]:
    """Predict a label for each text and strip the '__label__' prefix."""
    # model.predict on a list returns (labels, probabilities), where labels
    # holds one tuple of predicted labels per input text.
    labels, _ = model.predict(texts)
    return [x[0].split('__label__')[1] for x in labels]

def load_hf_fasttext_model(model_id):
    """Download 'model.bin' from a HuggingFace model repo and load it."""
    model_path = hf_hub_download(repo_id=model_id, filename="model.bin", cache_dir=CACHE_DIR)
    model = fasttext.load_model(model_path)
    # fastText loads the model fully into memory, so the cached file can be removed.
    os.remove(model_path)
    return model

def load_local_fasttext_model(model_path):
    model = fasttext.load_model(model_path)
    return model
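
# Quick sanity check (hypothetical usage; the sample input and expected
# output are illustrative only):
#   model = load_local_fasttext_model('model.bin')
#   predict_with_fasttext(model, ['some darija sentence'])  # -> ['ary']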

def load_predictions(uploaded_file):
    """Read an uploaded CSV of predictions and validate its labels."""
    predictions_df = pd.read_csv(uploaded_file)
    assert 'prediction' in predictions_df.columns, "Predictions file must contain a 'prediction' column"
    y_pred = list(predictions_df['prediction'].values)
    # Subset check: a file predicting only one of the two classes is still valid.
    assert set(y_pred) <= {'ary', 'other'}, "Predictions must contain only 'ary' or 'other'"
    return y_pred
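
# Expected CSV layout (hypothetical example):
#   prediction
#   ary
#   other
#   ...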


def main():
    st.title("Darija-LID Model Evaluation")
    st.write("Upload your model or provide a HuggingFace model ID to evaluate it on the Darija-LID test set atlasia/Darija-LID-benchmark.")
    st.write("Currently supports FastText models only. If you're using a different model, you can upload your predictions.")

    # Load test data and split each fastText-format line into text and label
    test_data = load_test_data()
    texts = [' '.join(x.split()[1:]) for x in test_data]
    labels = [x.split('__label__')[1].split()[0] for x in test_data]

    # Model input section
    st.header("Model Input")
    model_type = st.radio("Select model type:", ["Local FastText Model", "HuggingFace FastText Model", "Predictions File"])

    if model_type == "Local FastText Model":
        uploaded_file = st.file_uploader("Upload FastText model (.bin)", type=['bin'])
        if uploaded_file:
            with open("temp_model.bin", "wb") as f:
                f.write(uploaded_file.getvalue())
            model = load_local_fasttext_model("temp_model.bin")
            y_pred = predict_with_fasttext(model, texts)
            os.remove("temp_model.bin")

    elif model_type == "HuggingFace FastText Model":
        model_id = st.text_input("Enter HuggingFace model ID:")
        if model_id:
            model = load_hf_fasttext_model(model_id)
            y_pred = predict_with_fasttext(model, texts)

    else:  
        uploaded_file = st.file_uploader("Upload predictions file (CSV with 'prediction' column containing either 'ary' or 'other')", type=['csv'])
        if uploaded_file:
            y_pred = load_predictions(uploaded_file)
            assert len(y_pred) == len(labels), "Predictions and labels must have the same length. Make sure the predictions are for the test set."


    # Evaluation section: runs once any of the branches above has set y_pred
    if 'y_pred' in locals():
        st.header("Evaluation Results")
        results = evaluate_predictions(labels, y_pred)
        
        # Display metrics
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Accuracy", f"{results['accuracy']:.4f}")
        with col2:
            st.metric("Precision", f"{results['precision']:.4f}")
        with col3:
            st.metric("Recall", f"{results['recall']:.4f}")
        with col4:
            st.metric("F1 Score", f"{results['f1']:.4f}")

        # Leaderboard submission
        st.header("Submit to Leaderboard")
        submitter_name = st.text_input("Your Name:")
        if st.button("Submit to Leaderboard"):
            if submitter_name:
                leaderboard = load_leaderboard()
                entry = {
                    'name': submitter_name,
                    'model_type': model_type,
                    # Use the exact radio-button label when checking the model type.
                    'model_id': model_id if model_type == "HuggingFace FastText Model" else "uploaded_file",
                    **results
                }
                leaderboard.append(entry)
                save_leaderboard(leaderboard)
                st.success("Successfully submitted to leaderboard!")
            else:
                st.error("Please enter your name to submit to the leaderboard.")

    # Display leaderboard
    st.header("Leaderboard")
    leaderboard = load_leaderboard()
    if leaderboard:
        df = pd.DataFrame(leaderboard)
        df = df.sort_values('f1', ascending=False)
        st.dataframe(df)
    else:
        st.write("No submissions yet.")

if __name__ == "__main__":
    main()
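
# Local usage (assuming this file is saved as app.py):
#   HF_TOKEN=<your-read-token> streamlit run app.py
# HF_TOKEN must have read access to the private dataset 'atlasia/Darija-LID-private'.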