|
import gradio as gr |
|
from datasets import load_dataset |
|
|
|
# Identifier and split handed to `load_dataset` when the module is imported.
DATASET_NAME = "sumuks/fineweb-10BT-annotated"
SPLIT = "train"

# Column names looked up on each dataset row.
SCORE_COLUMN = "score"  # compared against the requested score
TEXT_COLUMN = "text"  # value shown as the example body
ID_COLUMN = "id"  # not referenced by the code visible in this file
|
|
|
|
|
# Load the dataset once at import time. A failure is deliberately not fatal:
# `dataset` stays None and the message is kept in `load_error` so the UI can
# display it instead of crashing.
dataset = None
load_error = None
try:
    dataset = load_dataset(DATASET_NAME, split=SPLIT)
except Exception as exc:
    load_error = str(exc)
|
|
|
|
|
def get_examples_by_score(score: int, n_examples: int = 5) -> list:
    """Return up to ``n_examples`` texts whose score column equals ``score``.

    Rows are taken in dataset order. The scan stops as soon as enough
    matching rows have been collected, rather than filtering the entire
    dataset only to keep its first few matches.

    Args:
        score: Value compared with ``==`` against each row's
            ``SCORE_COLUMN`` entry.
        n_examples: Maximum number of texts to return. Zero or a negative
            value selects no rows.

    Returns:
        A list of ``TEXT_COLUMN`` values (``""`` for rows lacking that
        column). If the dataset failed to load, a one-item list holding
        the load error message; if nothing matched, a one-item list
        holding a "no examples" placeholder.
    """
    if dataset is None:
        return [f"Dataset could not be loaded: {load_error}"]

    examples = []
    if n_examples > 0:
        # Iterating the dataset yields one mapping per row; bail out early
        # once enough matches are found so the remaining rows are not read.
        for row in dataset:
            if row.get(SCORE_COLUMN) != score:
                continue
            examples.append(row.get(TEXT_COLUMN, ""))
            if len(examples) >= n_examples:
                break

    if not examples:
        examples.append("No examples found for this score")
    return examples
|
|
|
|
|
def build_tabs():
    """Create the "About" tab plus one tab for each score from 0 to 5.

    The "About" tab names the dataset and, when loading failed, shows the
    load error. Each score tab renders up to two examples obtained from
    ``get_examples_by_score``. Components are created as a side effect, so
    this is meant to be called while a ``gr.Blocks`` context is open.

    Returns:
        The created ``gr.Tab`` objects in creation order ("About" first,
        then "Score 0" through "Score 5").
    """
    tabs = []

    with gr.Tab("About") as about_tab:
        gr.Markdown(
            f"# Dataset Inspector\nUsing dataset `{DATASET_NAME}`\n"
        )
        if load_error:
            gr.Markdown(f"**Dataset failed to load:** {load_error}")
    # Collect every tab so the returned list actually reflects what was built.
    tabs.append(about_tab)

    for score in range(6):
        with gr.Tab(f"Score {score}") as score_tab:
            examples = get_examples_by_score(score, 2)
            for i, example in enumerate(examples):
                gr.Markdown(f"### Example {i+1}\n{example}")
        tabs.append(score_tab)

    return tabs
|
|
|
|
|
# Assemble the UI inside a Blocks container, then start the app.
demo = gr.Blocks(theme="default")
with demo:
    build_tabs()

demo.launch()