File size: 1,437 Bytes
641f594
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4fdbee0
0043673
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import gradio as gr
from datasets import load_dataset

# Dataset identifier and split handed to `load_dataset` below.
DATASET_NAME = "sumuks/fineweb-10BT-annotated"
SPLIT = "train"

# Column names used when reading rows (ID_COLUMN is not referenced in the
# code visible here).
SCORE_COLUMN = "score"
TEXT_COLUMN = "text"
ID_COLUMN = "id"

# Load the dataset a single time at import. A failure is not fatal: the app
# keeps running with `dataset` left as None and the message in `load_error`.
dataset = None
load_error = None
try:
    dataset = load_dataset(DATASET_NAME, split=SPLIT)
except Exception as exc:
    load_error = str(exc)


def get_examples_by_score(score: int, n_examples: int = 5):
    """Return up to ``n_examples`` texts whose score column equals ``score``.

    Args:
        score: Value compared for equality against each row's
            ``SCORE_COLUMN`` entry.
        n_examples: Maximum number of texts to return. Values below one
            produce no matches (and therefore the placeholder message).

    Returns:
        A list of strings: the ``TEXT_COLUMN`` values of the first matching
        rows in dataset order, or a single placeholder message when the
        dataset failed to load or no row matched.
    """
    from itertools import islice

    if dataset is None:
        return [f"Dataset could not be loaded: {load_error}"]

    # Scan lazily and stop at the n-th match. Filtering the whole dataset
    # first would evaluate the predicate on every row and materialise a
    # filtered copy just to read a handful of rows from its start.
    matching_rows = (row for row in dataset if row.get(SCORE_COLUMN) == score)
    limit = max(n_examples, 0)  # islice rejects negative stop values
    examples = [row.get(TEXT_COLUMN, "") for row in islice(matching_rows, limit)]

    if not examples:
        examples.append("No examples found for this score")
    return examples


def build_tabs():
    """Create the "About" tab plus one tab per score from 0 to 5.

    Intended to be called inside an active ``gr.Blocks`` context, as the
    module-level code does. Each score tab shows up to two example texts
    obtained from ``get_examples_by_score``.

    Returns:
        The list of created ``gr.Tab`` objects: "About" first, then the
        score tabs in ascending score order.
    """
    tabs = []

    with gr.Tab("About") as about_tab:
        gr.Markdown(
            f"# Dataset Inspector\nUsing dataset `{DATASET_NAME}`\n"
        )
        if load_error:
            gr.Markdown(f"**Dataset failed to load:** {load_error}")
    # Record every tab so the returned list reflects what was built; the
    # list used to be returned without ever being filled.
    tabs.append(about_tab)

    for score in range(6):
        with gr.Tab(f"Score {score}") as score_tab:
            examples = get_examples_by_score(score, 2)
            for position, example in enumerate(examples, start=1):
                gr.Markdown(f"### Example {position}\n{example}")
        tabs.append(score_tab)

    return tabs


# Build the interface at import time so `demo` exists as a module attribute.
with gr.Blocks(theme="default") as demo:
    build_tabs()

# Start the server only when executed as a script, so importing this module
# (for tooling or reuse of `demo`) does not launch a blocking web server.
if __name__ == "__main__":
    demo.launch()