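"""Streamlit app for the Darija-LID leaderboard.

Evaluates language-identification models (fastText binaries or prediction
CSVs) on the atlasia/Darija-LID-benchmark test set and records results in a
local JSON leaderboard.
"""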
import os
import json

import streamlit as st
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import fasttext
from huggingface_hub import hf_hub_download

# Constants
LEADERBOARD_FILE = 'leaderboard.json'
TEST_SET = 'atlasia/Darija-LID-benchmark'
CACHE_DIR = os.path.join(os.path.dirname(__file__), 'cache')
HF_TOKEN = os.getenv('HF_TOKEN')  # required to download the private benchmark

def load_leaderboard():
    """Load leaderboard entries from disk, returning an empty list if none exist."""
    if os.path.exists(LEADERBOARD_FILE):
        with open(LEADERBOARD_FILE, 'r') as f:
            return json.load(f)
    return []

def save_leaderboard(leaderboard):
    """Persist leaderboard entries to disk as pretty-printed JSON."""
    with open(LEADERBOARD_FILE, 'w') as f:
        json.dump(leaderboard, f, indent=2)
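
# Each leaderboard entry is a flat dict, so the stored file might look like
# this (illustrative values only):
# [
#   {"name": "alice", "model_type": "HuggingFace FastText Model",
#    "model_id": "user/model", "accuracy": 0.97, "precision": 0.97,
#    "recall": 0.97, "f1": 0.97}
# ]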

def load_test_data() -> list[str]:
    """Download the private benchmark file and return its lines as samples."""
    # Create cache directory if it doesn't exist
    os.makedirs(CACHE_DIR, exist_ok=True)
    path = hf_hub_download(
        repo_id='atlasia/Darija-LID-private',
        filename='benchmark.txt',
        cache_dir=CACHE_DIR,
        token=HF_TOKEN,
        repo_type='dataset',
    )
    with open(path, 'r') as f:
        samples = [line.rstrip('\n') for line in f]
    return samples
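
# Each benchmark line is assumed to follow the fastText supervised format,
# label first, then the text (hypothetical example):
#   __label__ary <some Darija sentence>
#   __label__other <some non-Darija sentence>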

def evaluate_predictions(y_true: list[str], y_pred: list[str]) -> dict:
    """Compute accuracy and support-weighted precision/recall/F1."""
    accuracy = accuracy_score(y_true, y_pred)
    # scikit-learn ignores (and warns about) pos_label whenever
    # average != 'binary', so it is omitted here.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    return {
        'accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1': float(f1),
    }
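
# Minimal usage sketch with dummy labels (illustrative only):
#   evaluate_predictions(['ary', 'other'], ['ary', 'ary'])
#   -> {'accuracy': 0.5, 'precision': ..., 'recall': ..., 'f1': ...}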

def predict_with_fasttext(model, texts: list[str]) -> list[str]:
    """Run batch prediction and strip the fastText '__label__' prefix."""
    labels, _probs = model.predict(texts)
    return [x[0].split('__label__')[1] for x in labels]
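
# Note: for a list of inputs, fastText's predict() returns a (labels, probs)
# pair, where labels holds one sequence of '__label__...' strings per text,
# so labels[i][0] is the top prediction for texts[i].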

def load_hf_fasttext_model(model_id):
    """Download a fastText model from the Hugging Face Hub and load it."""
    model_path = hf_hub_download(repo_id=model_id, filename="model.bin", cache_dir=CACHE_DIR)
    model = fasttext.load_model(model_path)
    # Remove the downloaded file once the model is in memory to save disk space.
    os.remove(model_path)
    return model

def load_local_fasttext_model(model_path):
    """Load a fastText model from a local .bin file."""
    return fasttext.load_model(model_path)

def load_predictions(uploaded_file):
    """Read an uploaded CSV of predictions and validate its contents."""
    predictions_df = pd.read_csv(uploaded_file)
    assert 'prediction' in predictions_df.columns, "Predictions file must contain a 'prediction' column"
    y_pred = list(predictions_df['prediction'].values)
    # Subset check: predictions need not contain both classes, only valid ones.
    assert set(y_pred) <= {'ary', 'other'}, "Predictions must contain only 'ary' or 'other'"
    return y_pred
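
# Expected CSV layout, one row per benchmark sample in test-set order
# (hypothetical example):
#   prediction
#   ary
#   other
#   ary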

def main():
    st.title("Darija-LID Model Evaluation")
    st.write(
        "Upload your model or provide a Hugging Face model ID to evaluate it "
        f"on the Darija-LID test set ({TEST_SET})."
    )
    st.write(
        "Currently supports fastText models only. If you're using a different "
        "model, you can upload your predictions instead."
    )

    # Load the test data and split each line into its text and gold label.
    test_data = load_test_data()
    texts = [' '.join(x.split()[1:]) for x in test_data]
    labels = [x.split('__label__')[1].split()[0] for x in test_data]

    # Model input section
    st.header("Model Input")
    model_type = st.radio(
        "Select model type:",
        ["Local FastText Model", "HuggingFace FastText Model", "Predictions File"],
    )
    model_id = None  # only set when a Hugging Face model is selected

    if model_type == "Local FastText Model":
        uploaded_file = st.file_uploader("Upload FastText model (.bin)", type=['bin'])
        if uploaded_file:
            # fastText loads from a path, so persist the upload to a temp file first.
            with open("temp_model.bin", "wb") as f:
                f.write(uploaded_file.getvalue())
            model = load_local_fasttext_model("temp_model.bin")
            y_pred = predict_with_fasttext(model, texts)
            os.remove("temp_model.bin")
    elif model_type == "HuggingFace FastText Model":
        model_id = st.text_input("Enter HuggingFace model ID:")
        if model_id:
            model = load_hf_fasttext_model(model_id)
            y_pred = predict_with_fasttext(model, texts)
    else:
        uploaded_file = st.file_uploader(
            "Upload predictions file (CSV with 'prediction' column containing either 'ary' or 'other')",
            type=['csv'],
        )
        if uploaded_file:
            y_pred = load_predictions(uploaded_file)
            assert len(y_pred) == len(labels), (
                "Predictions and labels must have the same length. "
                "Make sure the predictions are for the test set."
            )
    # Evaluation section (y_pred only exists once a model or predictions
    # file has been provided above).
    if 'y_pred' in locals():
        st.header("Evaluation Results")
        results = evaluate_predictions(labels, y_pred)

        # Display metrics
        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Accuracy", f"{results['accuracy']:.4f}")
        with col2:
            st.metric("Precision", f"{results['precision']:.4f}")
        with col3:
            st.metric("Recall", f"{results['recall']:.4f}")
        with col4:
            st.metric("F1 Score", f"{results['f1']:.4f}")
        # Leaderboard submission
        st.header("Submit to Leaderboard")
        submitter_name = st.text_input("Your Name:")
        if st.button("Submit to Leaderboard"):
            if submitter_name:
                leaderboard = load_leaderboard()
                entry = {
                    'name': submitter_name,
                    'model_type': model_type,
                    'model_id': model_id if model_type == "HuggingFace FastText Model" else "uploaded_file",
                    **results,
                }
                leaderboard.append(entry)
                save_leaderboard(leaderboard)
                st.success("Successfully submitted to leaderboard!")
            else:
                st.error("Please enter your name to submit to the leaderboard.")
    # Display leaderboard
    st.header("Leaderboard")
    leaderboard = load_leaderboard()
    if leaderboard:
        df = pd.DataFrame(leaderboard).sort_values('f1', ascending=False)
        st.dataframe(df)
    else:
        st.write("No submissions yet.")

if __name__ == "__main__":
    main()