Commit e2152af (unverified) · committed by Narsil (HF Staff) · 1 Parent(s): 378cb5f
Files changed (6)
  1. README.md +60 -5
  2. app.py +570 -0
  3. flake.lock +27 -0
  4. flake.nix +41 -0
  5. get_popular_eval_datasets.py +100 -0
  6. requirements.txt +5 -0
README.md CHANGED
@@ -1,12 +1,67 @@
  ---
- title: Eval Playground
- emoji: 🌖
- colorFrom: pink
+ title: Evaluation Dataset Quiz
+ emoji: 🧠
+ colorFrom: blue
  colorTo: green
  sdk: gradio
- sdk_version: 5.38.0
+ sdk_version: 4.19.2
  app_file: app.py
  pinned: false
+ license: mit
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # HuggingFace Evaluation Dataset Quiz
+
+ Test your knowledge with questions from popular evaluation datasets!
+
+ ## Features
+
+ - 🎯 Interactive quiz interface built with Gradio
+ - 📊 8 popular evaluation datasets including:
+   - GSM8K (Grade School Math)
+   - MMLU (Massive Multitask Language Understanding)
+   - AI2 ARC (Science Questions)
+   - HellaSwag (Commonsense NLI)
+   - WinoGrande (Winograd Schema)
+   - BoolQ (Boolean Questions)
+   - SQuAD (Reading Comprehension)
+   - PIQA (Physical Reasoning)
+ - 🎲 Random question selection
+ - ✅ Immediate feedback on answers
+ - 📈 Score tracking
+ - 🔄 Support for multiple question formats:
+   - Multiple choice
+   - True/False
+   - Text input for QA tasks
+
+ ## How to Use
+
+ 1. **Select a Dataset**: Choose from the available evaluation datasets
+ 2. **Choose Number of Questions**: Select how many questions you want (5-20)
+ 3. **Start Quiz**: Click "Start Quiz" to begin
+ 4. **Answer Questions**: Select or type your answer and click "Submit Answer"
+ 5. **Get Feedback**: See if you got it right and learn the correct answer
+ 6. **Continue**: Click "Next Question" to proceed
+ 7. **View Score**: See your final score at the end
+
+ ## Local Development
+
+ ```bash
+ # Clone the repository
+ git clone <your-repo-url>
+ cd eval_quiz_app
+
+ # Install dependencies
+ pip install -r requirements.txt
+
+ # Run the app
+ python app.py
+ ```
+
+ ## Deployment
+
+ This app is designed to run on HuggingFace Spaces. Simply push to your Space repository and it will deploy automatically.
+
+ ## Contributing
+
+ Feel free to add more datasets or improve the quiz functionality!
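
For a concrete sense of what the quiz pulls from these datasets, here is a minimal sketch (not part of the committed files) that samples one GSM8K item with the same `datasets` calls the app uses; the field names and the `####` answer marker come from the config table in `app.py` below:

```python
# Sketch only: sample one GSM8K question the way the quiz app does.
import random
from datasets import load_dataset

ds = load_dataset("openai/gsm8k", "main", split="train")
example = ds[random.randrange(len(ds))]
print("Q:", example["question"])
# GSM8K puts the final numeric answer after "####"; app.py relies on the same marker.
print("A:", example["answer"].split("####")[-1].strip())
```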
app.py ADDED
@@ -0,0 +1,570 @@
+ import gradio as gr
+ from datasets import load_dataset, get_dataset_config_names
+ import random
+ from typing import List, Tuple
+ import logging
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+ )
+
+ # Popular evaluation datasets with their configurations
+ EVAL_DATASETS = {
+     "openai/gsm8k": {
+         "name": "GSM8K - Grade School Math",
+         "type": "qa",
+         "config": "main",
+         "question_field": "question",
+         "answer_field": "answer",
+         "split": "train",
+     },
+     "cais/mmlu": {
+         "name": "MMLU - Massive Multitask Language Understanding",
+         "type": "multiple_choice",
+         "config": "all",
+         "question_field": "question",
+         "choices_field": "choices",
+         "answer_field": "answer",
+         "split": "test",
+     },
+     "allenai/ai2_arc": {
+         "name": "AI2 ARC - Science Questions",
+         "type": "multiple_choice",
+         "config": "ARC-Challenge",
+         "question_field": "question",
+         "choices_field": "choices",
+         "answer_field": "answerKey",
+         "split": "train",
+     },
+     "Rowan/hellaswag": {
+         "name": "HellaSwag - Commonsense NLI",
+         "type": "multiple_choice",
+         "question_field": "ctx",
+         "choices_field": "endings",
+         "answer_field": "label",
+         "split": "train",
+     },
+     "allenai/winogrande": {
+         "name": "WinoGrande - Winograd Schema",
+         "type": "binary_choice",
+         "config": "winogrande_xl",
+         "question_field": "sentence",
+         "option1_field": "option1",
+         "option2_field": "option2",
+         "answer_field": "answer",
+         "split": "train",
+     },
+     "google/boolq": {
+         "name": "BoolQ - Boolean Questions",
+         "type": "true_false",
+         "question_field": "question",
+         "context_field": "passage",
+         "answer_field": "answer",
+         "split": "train",
+     },
+     "rajpurkar/squad": {
+         "name": "SQuAD - Reading Comprehension",
+         "type": "extractive_qa",
+         "question_field": "question",
+         "context_field": "context",
+         "answer_field": "answers",
+         "split": "train",
+     },
+     "allenai/piqa": {
+         "name": "PIQA - Physical Reasoning",
+         "type": "binary_choice",
+         "question_field": "goal",
+         "option1_field": "sol1",
+         "option2_field": "sol2",
+         "answer_field": "label",
+         "split": "train",
+     },
+ }
+
+
+ class QuizApp:
+     def __init__(self):
+         self.current_dataset = None
+         self.current_dataset_name = None
+         self.questions = []
+         self.current_question_idx = 0
+         self.score = 0
+         self.total_questions = 0
+
+     def load_dataset_questions(self, dataset_name: str, num_questions: int = 10):
+         """Load random questions from the selected dataset"""
+         try:
+             config = EVAL_DATASETS[dataset_name]
+
+             # Try to load dataset with config if specified
+             try:
+                 if "config" in config:
+                     dataset = load_dataset(
+                         dataset_name, config["config"], split=config["split"]
+                     )
+                 else:
+                     dataset = load_dataset(dataset_name, split=config["split"])
+             except ValueError as e:
+                 # If config is missing, try to get available configs
+                 if "Config name is missing" in str(e):
+                     configs = get_dataset_config_names(dataset_name)
+                     # Use first config or "all" if available
+                     if "all" in configs:
+                         selected_config = "all"
+                     else:
+                         selected_config = configs[0]
+                     print(
+                         f"Auto-selected config '{selected_config}' for {dataset_name}"
+                     )
+                     dataset = load_dataset(
+                         dataset_name, selected_config, split=config["split"]
+                     )
+                 else:
+                     raise e
+
+             # Sample random questions
+             total_examples = len(dataset)
+             num_questions = min(num_questions, total_examples)
+             indices = random.sample(range(total_examples), num_questions)
+
+             self.questions = []
+             for idx in indices:
+                 example = dataset[idx]
+                 self.questions.append(example)
+
+             self.current_dataset = config
+             self.current_dataset_name = dataset_name
+             self.current_question_idx = 0
+             self.score = 0
+             self.total_questions = len(self.questions)
+
+             return True, f"Loaded {num_questions} questions from {config['name']}"
+
+         except Exception as e:
+             return False, f"Error loading dataset: {str(e)}"
+
+     def get_current_question(self) -> Tuple[str, List[str], str]:
+         """Get the current question formatted for display"""
+         if not self.questions or self.current_question_idx >= len(self.questions):
+             return "", [], ""
+
+         question_data = self.questions[self.current_question_idx]
+         config = self.current_dataset
+
+         logging.info(f"\n{'=' * 60}")
+         logging.info(f"Dataset: {self.current_dataset_name}")
+         logging.info(f"Question {self.current_question_idx + 1}/{self.total_questions}")
+         logging.info(f"Raw question data: {repr(question_data)}")
+         logging.info(f"{'=' * 60}\n")
+
+         # Format question based on dataset type
+         question_type = config["type"]
+
+         if question_type == "multiple_choice":
+             question = question_data[config["question_field"]]
+             choices = question_data[config["choices_field"]]
+             if config["answer_field"] in question_data:
+                 answer = question_data[config["answer_field"]]
+             else:
+                 answer = ""
+
+             # Format choices with letters
+             formatted_choices = [
+                 f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)
+             ]
+             return question, formatted_choices, question_type
+
+         elif question_type == "true_false":
+             question = question_data[config["question_field"]]
+             if "context_field" in config:
+                 context = question_data[config["context_field"]]
+                 question = f"Context: {context}\n\nQuestion: {question}"
+             return question, ["True", "False"], question_type
+
+         elif question_type == "binary_choice":
+             question = question_data[config["question_field"]]
+             option1 = question_data[config["option1_field"]]
+             option2 = question_data[config["option2_field"]]
+             return question, [f"A. {option1}", f"B. {option2}"], question_type
+
+         elif question_type == "qa" or question_type == "extractive_qa":
+             question = question_data[config["question_field"]]
+             if "context_field" in config and config["context_field"] in question_data:
+                 context = question_data[config["context_field"]]
+                 question = f"Context: {context[:500]}...\n\nQuestion: {question}"
+             return question, [], question_type
+
+         return "", [], ""
+
+     def format_answer(self, answer: str, dataset_name: str) -> str:
+         """Format answer based on dataset type for better readability"""
+         if dataset_name == "openai/gsm8k":
+             # GSM8K has specific formatting with equations and final answer
+             # Replace <<...>> with proper math formatting
+             import re
+
+             # Convert <<equation>> to LaTeX (\1 keeps the captured equation)
+             answer = re.sub(r"<<([^>]+)>>", r"$\1$", answer)
+             # Format the final answer line
+             answer = answer.replace("####", "\n\n**Final Answer:**")
+             # Ensure proper line breaks
+             answer = answer.replace(". ", ".\n")
+             return answer
+         elif dataset_name == "cais/mmlu":
+             # MMLU answers are usually single letters or short phrases
+             return answer
+         elif dataset_name == "rajpurkar/squad":
+             # SQuAD answers might need context
+             return answer
+         else:
+             # Default formatting for other datasets
+             return answer
+
+     def check_answer(self, user_answer: str) -> Tuple[bool, str]:
+         """Check if the user's answer is correct"""
+         if not self.questions or self.current_question_idx >= len(self.questions):
+             return False, "No question available"
+
+         question_data = self.questions[self.current_question_idx]
+         config = self.current_dataset
+         question_type = config["type"]
+
+         if question_type == "multiple_choice":
+             correct_answer_idx = question_data[config["answer_field"]]
+             # Handle both numeric and letter answers
+             if isinstance(correct_answer_idx, int):
+                 correct_letter = chr(65 + correct_answer_idx)
+             else:
+                 correct_letter = str(correct_answer_idx)
+
+             user_letter = user_answer.strip().upper()[0] if user_answer else ""
+             is_correct = user_letter == correct_letter
+
+             if is_correct:
+                 return True, "✅ **Correct!**"
+             else:
+                 choices = question_data[config["choices_field"]]
+                 correct_choice = (
+                     choices[correct_answer_idx]
+                     if isinstance(correct_answer_idx, int)
+                     else correct_answer_idx
+                 )
+                 logging.info(f"Raw answer (multiple choice): {repr(correct_choice)}")
+                 formatted_answer = self.format_answer(
+                     correct_choice, self.current_dataset_name
+                 )
+                 return (
+                     False,
+                     f"❌ **Incorrect**\n\nThe correct answer was **{correct_letter}**:\n\n{formatted_answer}",
+                 )
+
+         elif question_type == "true_false":
+             correct_answer = question_data[config["answer_field"]]
+             user_bool = user_answer.lower().strip() == "true"
+             is_correct = user_bool == correct_answer
+
+             if is_correct:
+                 return True, "✅ **Correct!**"
+             else:
+                 return (
+                     False,
+                     f"❌ **Incorrect**\n\nThe correct answer was **{correct_answer}**",
+                 )
+
+         elif question_type == "binary_choice":
+             correct_answer_idx = question_data[config["answer_field"]]
+             user_idx = 0 if user_answer.strip().upper().startswith("A") else 1
+             is_correct = user_idx == correct_answer_idx
+
+             if is_correct:
+                 return True, "✅ **Correct!**"
+             else:
+                 correct_letter = "A" if correct_answer_idx == 0 else "B"
+                 option_field = (
+                     config["option1_field"]
+                     if correct_answer_idx == 0
+                     else config["option2_field"]
+                 )
+                 correct_option = question_data[option_field]
+                 logging.info(f"Raw answer (binary choice): {repr(correct_option)}")
+                 formatted_answer = self.format_answer(
+                     correct_option, self.current_dataset_name
+                 )
+                 return (
+                     False,
+                     f"❌ **Incorrect**\n\nThe correct answer was **{correct_letter}**:\n\n{formatted_answer}",
+                 )
+
+         elif question_type in ["qa", "extractive_qa"]:
+             # For QA, we'll do a simple check - in real app, you'd want more sophisticated matching
+             correct_answer = question_data[config["answer_field"]]
+             if isinstance(correct_answer, dict) and "text" in correct_answer:
+                 correct_answer = (
+                     correct_answer["text"][0] if correct_answer["text"] else ""
+                 )
+             elif isinstance(correct_answer, list) and len(correct_answer) > 0:
+                 correct_answer = (
+                     correct_answer[0]["text"]
+                     if isinstance(correct_answer[0], dict)
+                     else str(correct_answer[0])
+                 )
+             else:
+                 correct_answer = str(correct_answer)
+
+             # Extract final answer for GSM8K and similar datasets
+             import re
+
+             # For GSM8K, extract the final answer after ####
+             if "####" in correct_answer:
+                 final_answer_match = re.search(r"####\s*(.+)", correct_answer)
+                 if final_answer_match:
+                     final_answer = final_answer_match.group(1).strip()
+                 else:
+                     final_answer = correct_answer
+             else:
+                 final_answer = correct_answer
+
+             # Extract numbers from both answers for comparison
+             correct_numbers = re.findall(r"-?\d+\.?\d*", final_answer)
+             user_numbers = re.findall(r"-?\d+\.?\d*", user_answer)
+
+             # Check if answers match
+             is_correct = False
+
+             # If both have numbers, compare the numbers
+             if correct_numbers and user_numbers:
+                 # Convert to float for comparison to handle decimals
+                 try:
+                     correct_num = float(
+                         correct_numbers[-1]
+                     )  # Take the last number as final answer
+                     user_num = float(user_numbers[-1])  # Take the last number from user
+                     is_correct = (
+                         abs(correct_num - user_num) < 0.0001
+                     )  # Small tolerance for float comparison
+                 except ValueError:
+                     # Fall back to string comparison
+                     is_correct = correct_numbers[-1] == user_numbers[-1]
+             else:
+                 # Fall back to substring matching for non-numeric answers
+                 is_correct = (
+                     user_answer.lower().strip() in correct_answer.lower()
+                     or correct_answer.lower() in user_answer.lower().strip()
+                 )
+
+             if is_correct:
+                 return True, "✅ **Correct!**"
+             else:
+                 logging.info(f"Raw answer (QA): {repr(correct_answer)}")
+                 logging.info(f"Extracted final answer: {repr(final_answer)}")
+                 logging.info(
+                     f"Correct numbers: {correct_numbers}, User numbers: {user_numbers}"
+                 )
+                 formatted_answer = self.format_answer(
+                     correct_answer, self.current_dataset_name
+                 )
+                 return (
+                     False,
+                     f"❌ **Incorrect**\n\n**The correct answer was:**\n\n{formatted_answer}",
+                 )
+
+         return False, "Unknown question type"
+
+
+ # Create global quiz app instance
+ quiz_app = QuizApp()
+
+
+ def create_dataset_display():
+     """Create the dataset listing display"""
+     dataset_info = []
+     for dataset_id, config in EVAL_DATASETS.items():
+         dataset_info.append(
+             f"**{config['name']}**\n- Dataset: {dataset_id}\n- Type: {config['type']}"
+         )
+
+     return "\n\n".join(dataset_info)
+
+
+ def start_quiz(dataset_choice: str, num_questions: int):
+     """Start a new quiz with the selected dataset"""
+     # Extract dataset ID from the choice
+     dataset_id = None
+     for did, config in EVAL_DATASETS.items():
+         if config["name"] in dataset_choice:
+             dataset_id = did
+             break
+
+     if not dataset_id:
+         return (
+             "Please select a dataset",
+             "",
+             "",
+             gr.update(visible=False),
+             gr.update(visible=False),
+             "0/0",
+         )
+
+     success, message = quiz_app.load_dataset_questions(dataset_id, num_questions)
+
+     if success:
+         question, choices, q_type = quiz_app.get_current_question()
+
+         if q_type in ["multiple_choice", "true_false", "binary_choice"]:
+             return (
+                 message,
+                 question,
+                 gr.update(choices=choices, visible=True, value=None),
+                 gr.update(visible=False),
+                 gr.update(visible=True),
+                 f"Question 1/{quiz_app.total_questions}",
+             )
+         else:
+             return (
+                 message,
+                 question,
+                 gr.update(visible=False),
+                 gr.update(visible=True, value=""),
+                 gr.update(visible=True),
+                 f"Question 1/{quiz_app.total_questions}",
+             )
+     else:
+         return (
+             message,
+             "",
+             gr.update(visible=False),
+             gr.update(visible=False),
+             gr.update(visible=False),
+             "0/0",
+         )
+
+
+ def submit_answer(answer_choice, answer_text):
+     """Submit answer and show feedback"""
+     # Determine which answer to use
+     if answer_choice:
+         answer = answer_choice
+     else:
+         answer = answer_text
+
+     is_correct, feedback = quiz_app.check_answer(answer)
+
+     if is_correct:
+         quiz_app.score += 1
+
+     return gr.update(value=feedback, visible=True), gr.update(visible=True)
+
+
+ def next_question():
+     """Move to the next question"""
+     quiz_app.current_question_idx += 1
+
+     if quiz_app.current_question_idx >= quiz_app.total_questions:
+         # Quiz complete
+         final_score = f"## 🎉 Quiz Complete!\n\n**Your score:** {quiz_app.score}/{quiz_app.total_questions} ({quiz_app.score / quiz_app.total_questions * 100:.1f}%)"
+         return (
+             gr.update(value=final_score, visible=True),
+             "",
+             gr.update(visible=False),
+             gr.update(visible=False),
+             gr.update(visible=False),
+             gr.update(visible=False),
+             "Quiz Complete",
+         )
+
+     question, choices, q_type = quiz_app.get_current_question()
+
+     if q_type in ["multiple_choice", "true_false", "binary_choice"]:
+         return (
+             gr.update(value="", visible=False),  # Clear feedback
+             question,
+             gr.update(choices=choices, visible=True, value=None),
+             gr.update(visible=False),
+             gr.update(visible=True),
+             gr.update(visible=False),
+             f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}",
+         )
+     else:
+         return (
+             gr.update(value="", visible=False),  # Clear feedback
+             question,
+             gr.update(visible=False),
+             gr.update(visible=True, value=""),
+             gr.update(visible=True),
+             gr.update(visible=False),
+             f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}",
+         )
+
+
+ # Create Gradio interface
+ with gr.Blocks(title="HuggingFace Evaluation Dataset Quiz") as demo:
+     gr.Markdown("# 🤗 Evaluation Dataset Quiz")
+     gr.Markdown(
+         "Test yourself with questions from popular HuggingFace evaluation datasets!"
+     )
+
+     with gr.Tabs():
+         with gr.Tab("Dataset Selection"):
+             with gr.Row():
+                 dataset_dropdown = gr.Dropdown(
+                     choices=[config["name"] for config in EVAL_DATASETS.values()],
+                     label="Select Dataset",
+                     value=list(EVAL_DATASETS.values())[0]["name"],
+                 )
+                 num_questions_slider = gr.Slider(
+                     minimum=5, maximum=20, value=10, step=1, label="Number of Questions"
+                 )
+
+             start_button = gr.Button("Start Quiz", variant="primary")
+             status_message = gr.Textbox(label="Status", interactive=False)
+
+         with gr.Tab("Quiz"):
+             progress_text = gr.Textbox(label="Progress", value="0/0", interactive=False)
+             question_display = gr.Textbox(label="Question", lines=5, interactive=False)
+
+             # Answer inputs (one will be visible at a time)
+             answer_radio = gr.Radio(label="Select your answer", visible=False)
+             answer_textbox = gr.Textbox(label="Type your answer", visible=False)
+
+             submit_button = gr.Button("Submit Answer", variant="primary", visible=False)
+
+             feedback_display = gr.Markdown(label="Feedback", visible=True)
+             next_button = gr.Button("Next Question", visible=False)
+
+     # Connect events
+     start_button.click(
+         start_quiz,
+         inputs=[dataset_dropdown, num_questions_slider],
+         outputs=[
+             status_message,
+             question_display,
+             answer_radio,
+             answer_textbox,
+             submit_button,
+             progress_text,
+         ],
+     )
+
+     submit_button.click(
+         submit_answer,
+         inputs=[answer_radio, answer_textbox],
+         outputs=[feedback_display, next_button],
+     )
+
+     next_button.click(
+         next_question,
+         outputs=[
+             feedback_display,
+             question_display,
+             answer_radio,
+             answer_textbox,
+             submit_button,
+             next_button,
+             progress_text,
+         ],
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
+
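A small usage sketch of the `QuizApp` class above, driven from a plain Python shell rather than the Gradio UI. This is not part of the commit; it assumes the file is importable as `app` and that the BoolQ split can be downloaded:

```python
# Hypothetical console walk-through of QuizApp (not part of the committed code).
from app import quiz_app

ok, msg = quiz_app.load_dataset_questions("google/boolq", num_questions=3)
print(msg)

question, choices, q_type = quiz_app.get_current_question()
print(q_type, choices)  # e.g. "true_false" and ["True", "False"]

is_correct, feedback = quiz_app.check_answer("True")
print(is_correct, feedback)
```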
flake.lock ADDED
@@ -0,0 +1,27 @@
+ {
+   "nodes": {
+     "nixpkgs": {
+       "locked": {
+         "lastModified": 1730531603,
+         "narHash": "sha256-Dqg6si5CqIzm87sp57j5nTaeBbWhHFaVyG7V6L8k3lY=",
+         "owner": "NixOS",
+         "repo": "nixpkgs",
+         "rev": "7ffd9ae656aec493492b44d0ddfb28e79a1ea25d",
+         "type": "github"
+       },
+       "original": {
+         "owner": "NixOS",
+         "ref": "nixos-unstable",
+         "repo": "nixpkgs",
+         "type": "github"
+       }
+     },
+     "root": {
+       "inputs": {
+         "nixpkgs": "nixpkgs"
+       }
+     }
+   },
+   "root": "root",
+   "version": 7
+ }
flake.nix ADDED
@@ -0,0 +1,41 @@
+ {
+   inputs = {
+     nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
+   };
+
+   outputs =
+     { nixpkgs, ... }:
+     let
+       forAllSystems = nixpkgs.lib.genAttrs [
+         "aarch64-linux"
+         "x86_64-linux"
+         "aarch64-darwin"
+       ];
+     in
+     {
+       devShells = forAllSystems (
+         system:
+         let
+           pkgs = nixpkgs.legacyPackages.${system};
+         in
+         {
+           default = pkgs.mkShell {
+             buildInputs = with pkgs; [
+               rustup
+               python3Packages.python
+               python3Packages.venvShellHook
+             ];
+             venvDir = "./.venv";
+             postVenvCreation = ''
+               unset SOURCE_DATE_EPOCH
+             '';
+             postShellHook = ''
+               unset SOURCE_DATE_EPOCH
+             '';
+             LD_LIBRARY_PATH = "$LD_LIBRARY_PATH:${pkgs.stdenv.cc.cc.lib}/lib:${pkgs.zlib}/lib:/run/opengl-driver/lib";
+           };
+
+         }
+       );
+     };
+ }
get_popular_eval_datasets.py ADDED
@@ -0,0 +1,100 @@
+ #!/usr/bin/env python3
+ """
+ Script to fetch the 10 most used evaluation datasets from Hugging Face.
+ """
+
+ import requests
+ from typing import List, Dict
+
+ def get_popular_eval_datasets(limit: int = 10) -> List[Dict]:
+     """
+     Fetch popular evaluation datasets from Hugging Face Hub API.
+
+     Args:
+         limit: Number of datasets to return
+
+     Returns:
+         List of dataset information dictionaries
+     """
+     # Common evaluation dataset tags and keywords
+     eval_keywords = [
+         "evaluation", "benchmark", "eval", "test-set", "validation",
+         "leaderboard", "assessment", "metric"
+     ]
+
+     # Search for datasets with evaluation-related tags
+     base_url = "https://huggingface.co/api/datasets"
+     params = {
+         "sort": "downloads",  # Sort by most downloaded
+         "direction": "-1",  # Descending order
+         "limit": 100,  # Get more to filter
+         "full": "true"
+     }
+
+     response = requests.get(base_url, params=params)
+     response.raise_for_status()
+
+     datasets = response.json()
+
+     # Filter for evaluation datasets
+     eval_datasets = []
+     for dataset in datasets:
+         # Check if dataset has evaluation-related tags or is commonly used for eval
+         tags = dataset.get("tags", [])
+         dataset_id = dataset.get("id", "").lower()
+
+         # Check for eval keywords in tags or dataset name
+         is_eval = any(
+             any(keyword in str(tag).lower() for keyword in eval_keywords)
+             for tag in tags
+         ) or any(keyword in dataset_id for keyword in eval_keywords)
+
+         # Also include well-known evaluation datasets
+         known_eval_datasets = [
+             "glue", "superglue", "squad", "xnli", "hellaswag", "winogrande",
+             "arc", "mmlu", "gsm8k", "humaneval", "mbpp", "truthfulqa",
+             "bigbench", "c4", "piqa", "siqa", "boolq", "copa", "multirc",
+             "record", "rte", "wic", "wsc", "cb", "axb", "axg", "swag",
+             "race", "qnli", "wnli", "sst", "cola", "stsb", "mrpc", "qqp"
+         ]
+
+         if any(known in dataset_id for known in known_eval_datasets):
+             is_eval = True
+
+         if is_eval:
+             eval_datasets.append({
+                 "name": dataset.get("id", ""),
+                 "downloads": dataset.get("downloads", 0),
+                 "likes": dataset.get("likes", 0),
+                 "tags": [tag for tag in tags if isinstance(tag, str)][:5],  # First 5 tags
+                 "description": dataset.get("description", "")[:200]  # First 200 chars
+             })
+
+     # Sort by downloads and return top N
+     eval_datasets.sort(key=lambda x: x["downloads"], reverse=True)
+     return eval_datasets[:limit]
+
+ def main():
+     """Main function to fetch and display popular evaluation datasets."""
+     print("Fetching the 10 most used evaluation datasets from Hugging Face...\n")
+
+     try:
+         datasets = get_popular_eval_datasets(10)
+
+         for i, dataset in enumerate(datasets, 1):
+             print(f"{i}. {dataset['name']}")
+             print(f"   Downloads: {dataset['downloads']:,}")
+             print(f"   Likes: {dataset['likes']}")
+             if dataset['tags']:
+                 print(f"   Tags: {', '.join(dataset['tags'])}")
+             if dataset['description']:
+                 print(f"   Description: {dataset['description']}...")
+             print()
+
+     except requests.exceptions.RequestException as e:
+         print(f"Error fetching data from Hugging Face: {e}")
+     except Exception as e:
+         print(f"An error occurred: {e}")
+
+ if __name__ == "__main__":
+     main()
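For comparison, here is a minimal sketch (not part of the commit) of the same most-downloaded query using the `huggingface_hub` client already listed in requirements.txt instead of raw HTTP; the exact metadata fields returned depend on the installed version:

```python
# Sketch only: query the Hub via huggingface_hub instead of requests.
from huggingface_hub import list_datasets

for info in list_datasets(sort="downloads", direction=-1, limit=10):
    # `downloads` may be missing unless full metadata is returned for the entry.
    print(info.id, getattr(info, "downloads", None))
```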
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio
+ datasets
+ transformers
+ requests
+ huggingface-hub