mohalisad committed
Commit 5f3c12c · verified · 1 Parent(s): 42e0116

Update leaderboard_data.jsonl

Files changed (1):
  1. leaderboard_data.jsonl +2 -2
leaderboard_data.jsonl CHANGED
@@ -12,8 +12,8 @@
  {"Model": "gemma-3-1b-it", "#Params (B)": 0.99, "Precision": "BF16", "model_name_for_query": "google/gemma-3-1b-it", "GeneralKnowledge": 26.02, "GSM8K": 4.3, "DC-Homograph": 49.07, "MC-Homograph": 51.15, "PiQA": 57.66, "Proverb-Quiz": 28.92, "VerbEval": 27.67, "Winogrande": 50.58, "Arc-Challenge": 36.43, "Arc-Easy": 46.1, "Feqh": 28.0, "Hallucination (Truthfulness)": 54.94, "P-Hellaswag": 63.92, "Law": 20.33, "AUT Multiple Choice": 29.1, "Parsi Literature": 24.97, "BoolQA": 63.9, "Reading Comprehension": 31.98, "PartExpert": 27.22, "MMLU Pro": 13.7, "Iranian Social Norms": 51.22, "Model sha": "dcc83ea841ab6100d6b47a070329e1ba4cf78752", "Hub License": "gemma"}
  {"Model": "gemma-3-27b-it", "#Params (B)": 27.43, "Precision": "BF16", "model_name_for_query": "google/gemma-3-27b-it", "GeneralKnowledge": 73.72, "GSM8K": 28.3, "DC-Homograph": 63.89, "MC-Homograph": 92.4, "PiQA": 87.29, "Proverb-Quiz": 78.92, "VerbEval": 66.02, "Winogrande": 78.12, "Arc-Challenge": 88.35, "Arc-Easy": 94.22, "Feqh": 24.57, "Hallucination (Truthfulness)": 60.15, "P-Hellaswag": 83.39, "Law": 36.33, "AUT Multiple Choice": 55.2, "Parsi Literature": 40.93, "BoolQA": 91.4, "Reading Comprehension": 58.01, "PartExpert": 49.32, "MMLU Pro": 36.6, "Iranian Social Norms": 70.49, "Model sha": "005ad3404e59d6023443cb575daa05336842228a", "Hub License": "gemma"}
  {"Model": "gemma-3-4b-it", "#Params (B)": 4.3, "Precision": "BF16", "model_name_for_query": "google/gemma-3-4b-it", "GeneralKnowledge": 45.92, "GSM8K": 9.6, "DC-Homograph": 42.59, "MC-Homograph": 72.58, "PiQA": 72.77, "Proverb-Quiz": 53.78, "VerbEval": 45.3, "Winogrande": 55.09, "Arc-Challenge": 63.46, "Arc-Easy": 79.57, "Feqh": 21.14, "Hallucination (Truthfulness)": 46.04, "P-Hellaswag": 73.84, "Law": 27.67, "AUT Multiple Choice": 42.5, "Parsi Literature": 30.24, "BoolQA": 78.6, "Reading Comprehension": 47.28, "PartExpert": 34.7, "MMLU Pro": 22.8, "Iranian Social Norms": 65.55, "Model sha": "093f9f388b31de276ce2de164bdc2081324b9767", "Hub License": "gemma"}
- {"Model": "google__gemini-2.0-flash-001", "#Params (B)": "unknown", "Precision": "unknown", "model_name_for_query": null, "GeneralKnowledge": 87.76, "GSM8K": 53.7, "DC-Homograph": 79.63, "MC-Homograph": 91.71, "PiQA": 90.59, "Proverb-Quiz": 95.14, "VerbEval": 85.15, "Winogrande": 78.74, "Arc-Challenge": 91.35, "Arc-Easy": 97.22, "Feqh": 53.14, "Hallucination (Truthfulness)": 68.87, "P-Hellaswag": 82.95, "Law": 45.67, "AUT Multiple Choice": 60.9, "Parsi Literature": 44.02, "BoolQA": 91.3, "Reading Comprehension": 67.92, "PartExpert": 59.5, "MMLU Pro": 47.8, "Iranian Social Norms": 77.68, "Model sha": "unknown", "Hub License": "unknown"}
- {"Model": "google__gemini-2.0-flash-lite-001", "#Params (B)": "unknown", "Precision": "unknown", "model_name_for_query": null, "GeneralKnowledge": 84.18, "GSM8K": 39.7, "DC-Homograph": 60.19, "MC-Homograph": 87.79, "PiQA": 85.29, "Proverb-Quiz": 91.35, "VerbEval": 81.39, "Winogrande": 75.64, "Arc-Challenge": 89.64, "Arc-Easy": 93.48, "Feqh": 41.71, "Hallucination (Truthfulness)": 67.32, "P-Hellaswag": 83.54, "Law": 43.0, "AUT Multiple Choice": 58.5, "Parsi Literature": 43.89, "BoolQA": 92.6, "Reading Comprehension": 65.92, "PartExpert": 54.15, "MMLU Pro": 41.2, "Iranian Social Norms": 70.49, "Model sha": "unknown", "Hub License": "unknown"}
+ {"Model": "gemini-2.0-flash-001", "#Params (B)": "unknown", "Precision": "unknown", "model_name_for_query": null, "GeneralKnowledge": 87.76, "GSM8K": 53.7, "DC-Homograph": 79.63, "MC-Homograph": 91.71, "PiQA": 90.59, "Proverb-Quiz": 95.14, "VerbEval": 85.15, "Winogrande": 78.74, "Arc-Challenge": 91.35, "Arc-Easy": 97.22, "Feqh": 53.14, "Hallucination (Truthfulness)": 68.87, "P-Hellaswag": 82.95, "Law": 45.67, "AUT Multiple Choice": 60.9, "Parsi Literature": 44.02, "BoolQA": 91.3, "Reading Comprehension": 67.92, "PartExpert": 59.5, "MMLU Pro": 47.8, "Iranian Social Norms": 77.68, "Model sha": "unknown", "Hub License": "unknown"}
+ {"Model": "gemini-2.0-flash-lite-001", "#Params (B)": "unknown", "Precision": "unknown", "model_name_for_query": null, "GeneralKnowledge": 84.18, "GSM8K": 39.7, "DC-Homograph": 60.19, "MC-Homograph": 87.79, "PiQA": 85.29, "Proverb-Quiz": 91.35, "VerbEval": 81.39, "Winogrande": 75.64, "Arc-Challenge": 89.64, "Arc-Easy": 93.48, "Feqh": 41.71, "Hallucination (Truthfulness)": 67.32, "P-Hellaswag": 83.54, "Law": 43.0, "AUT Multiple Choice": 58.5, "Parsi Literature": 43.89, "BoolQA": 92.6, "Reading Comprehension": 65.92, "PartExpert": 54.15, "MMLU Pro": 41.2, "Iranian Social Norms": 70.49, "Model sha": "unknown", "Hub License": "unknown"}
  {"Model": "gpt-4.1-2025-04-14", "#Params (B)": "unknown", "Precision": "unknown", "model_name_for_query": null, "GeneralKnowledge": 90.82, "GSM8K": 25.3, "DC-Homograph": 89.81, "MC-Homograph": 95.39, "PiQA": 95.9, "Proverb-Quiz": 95.14, "VerbEval": 83.04, "Winogrande": 85.92, "Arc-Challenge": 95.3, "Arc-Easy": 96.68, "Feqh": 52.0, "Hallucination (Truthfulness)": 77.43, "P-Hellaswag": 85.67, "Law": 53.67, "AUT Multiple Choice": 66.6, "Parsi Literature": 45.82, "BoolQA": 94.7, "Reading Comprehension": 44.82, "PartExpert": 59.92, "MMLU Pro": 50.5, "Iranian Social Norms": 77.56, "Model sha": "unknown", "Hub License": "unknown"}
  {"Model": "gpt-4.1-mini-2025-04-14", "#Params (B)": "unknown", "Precision": "unknown", "model_name_for_query": null, "GeneralKnowledge": 79.34, "GSM8K": 60.3, "DC-Homograph": 66.67, "MC-Homograph": 94.24, "PiQA": 92.69, "Proverb-Quiz": 82.97, "VerbEval": 77.99, "Winogrande": 80.07, "Arc-Challenge": 91.88, "Arc-Easy": 96.15, "Feqh": 37.71, "Hallucination (Truthfulness)": 66.55, "P-Hellaswag": 84.57, "Law": 44.33, "AUT Multiple Choice": 53.5, "Parsi Literature": 41.18, "BoolQA": 93.7, "Reading Comprehension": 51.85, "PartExpert": 54.37, "MMLU Pro": 47.8, "Iranian Social Norms": 73.35, "Model sha": "unknown", "Hub License": "unknown"}
  {"Model": "gpt-4.1-nano-2025-04-14", "#Params (B)": "unknown", "Precision": "unknown", "model_name_for_query": null, "GeneralKnowledge": 68.11, "GSM8K": 58.4, "DC-Homograph": 49.07, "MC-Homograph": 78.11, "PiQA": 84.58, "Proverb-Quiz": 67.84, "VerbEval": 66.21, "Winogrande": 60.32, "Arc-Challenge": 81.41, "Arc-Easy": 91.55, "Feqh": 32.0, "Hallucination (Truthfulness)": 51.24, "P-Hellaswag": 77.96, "Law": 32.67, "AUT Multiple Choice": 46.1, "Parsi Literature": 36.42, "BoolQA": 81.7, "Reading Comprehension": 50.66, "PartExpert": 42.49, "MMLU Pro": 29.9, "Iranian Social Norms": 74.76, "Model sha": "unknown", "Hub License": "unknown"}