|
import gradio as gr |
|
from datasets import load_dataset |
|
import random |
|
|
|
|
|
# Display label -> Hugging Face Hub repository id.
# The labels are offered as dropdown choices in the UI; the repository id is
# what gets passed to `load_dataset`.
DATASETS = {
    "Main Dataset": "sumuks/fineweb-10BT-annotated",
    "Ablation Dataset": "sumuks/fineweb-10BT-annotated-ablation-1",
}

# Split requested on every dataset load.
SPLIT = "train"
|
|
|
|
|
# Names of the dataset columns read by the display functions.
SCORE_COLUMN = "score"  # value the per-score tabs filter on
TEXT_COLUMN = "text"  # document body shown under "Text"
ID_COLUMN = "id"  # document identifier, also used to track seen examples
SUMMARY_COLUMN = "summary"  # optional annotation detail
JUSTIFICATION_COLUMN = "justification"  # optional annotation detail
THINKING_COLUMN = "thinking"  # optional annotation detail
MODEL_COLUMN = "annotation_model"  # shown together with the date
DATE_COLUMN = "annotation_date"  # shown only when a model value is present
|
|
|
|
|
# Module-level state shared by the UI callbacks.
current_dataset = None  # the loaded dataset; None until a load succeeds
dataset_name = None  # repository id of the most recent load attempt
seen_ids = set()  # document IDs already returned by random sampling
|
|
|
def load_selected_dataset(selected_dataset):
    """Load the dataset chosen in the UI and reset the sampling history.

    Updates the module-level ``current_dataset``, ``dataset_name`` and
    ``seen_ids``.

    Args:
        selected_dataset: A key of ``DATASETS`` (the dropdown label).

    Returns:
        A status string for display: a success message with the number of
        loaded examples, or a failure message describing what went wrong.
    """
    global current_dataset, dataset_name, seen_ids

    # An unknown label used to surface as a bare KeyError; report it like
    # any other load failure and leave the current state untouched.
    if selected_dataset not in DATASETS:
        return f"❌ Unknown dataset: {selected_dataset!r}"

    dataset_name = DATASETS[selected_dataset]
    # The "seen" history belongs to the previous dataset, so start over.
    seen_ids = set()

    try:
        current_dataset = load_dataset(dataset_name, split=SPLIT)
    except Exception as e:
        # UI boundary: any failure is reported to the user rather than
        # raised, and the stale dataset is dropped.
        current_dataset = None
        return f"❌ Failed to load {dataset_name}: {e}"

    return f"✅ Loaded {len(current_dataset)} examples from {dataset_name}"
|
|
|
def get_examples_by_score(score: int, n_examples: int = 5, show_details: bool = False):
    """Return a Markdown listing of random examples that have a given score.

    Args:
        score: Value a row's score column must equal to be eligible.
        n_examples: Maximum number of examples to show. Coerced with
            ``int()`` because a slider-backed value is not guaranteed to be
            an ``int``, and ``random.sample`` rejects anything else.
        show_details: When true, each example also lists its non-empty
            summary, justification, thinking and model/date annotations.

    Returns:
        The formatted examples joined into one Markdown string, or a short
        notice when no dataset is loaded or no row has the requested score.
    """
    if current_dataset is None:
        return "Please select and load a dataset first."

    # Without a score column nothing can match; keep the "no match" notice
    # rather than letting the column lookup below fail.
    if SCORE_COLUMN not in current_dataset.column_names:
        return "No examples found for this score."

    # Ask `filter` for just the score column: the predicate does not need
    # the rest of the row.
    subset = current_dataset.filter(
        lambda value: value == score,
        input_columns=[SCORE_COLUMN],
    )
    total_available = len(subset)
    if total_available == 0:
        return "No examples found for this score."

    # Never request more rows than exist, and never a negative count.
    n = max(0, min(total_available, int(n_examples)))

    # (label, column) pairs for the optional annotation sections, in the
    # order they are displayed.
    detail_fields = (
        ("Summary", SUMMARY_COLUMN),
        ("Justification", JUSTIFICATION_COLUMN),
        ("Thinking Process", THINKING_COLUMN),
    )

    examples_text = []
    for idx in random.sample(range(total_available), n):
        item = subset[idx]
        example_id = item.get(ID_COLUMN, "Unknown")
        text = item.get(TEXT_COLUMN, "")

        parts = [f"**Document ID:** {example_id}\n\n"]
        if show_details:
            for label, column in detail_fields:
                value = item.get(column, "")
                if value:
                    parts.append(f"**{label}:** {value}\n\n")
            model = item.get(MODEL_COLUMN, "")
            if model:
                # The date is only shown alongside a model name.
                date = item.get(DATE_COLUMN, "")
                parts.append(f"**Model:** {model} | **Date:** {date}\n\n")
        parts.append(f"**Text:**\n{text}\n\n---\n")

        examples_text.append("".join(parts))

    return "\n".join(examples_text)
|
|
|
def get_random_unseen_example(show_details: bool = False):
    """Return one random example whose ID has not been shown yet, as Markdown.

    The chosen ID is recorded in the module-level ``seen_ids``. Once every
    ID has been shown, the history is cleared and sampling starts over.

    Args:
        show_details: When true, also list the example's non-empty summary,
            justification, thinking and model/date annotations.

    Returns:
        The formatted example, or a short notice when no dataset is loaded
        or the dataset has no rows.
    """
    if current_dataset is None:
        return "Please select and load a dataset first."

    # Read the ID column once and sample a row position directly, so no
    # second pass is needed to map the chosen ID back to its row.
    all_ids = list(current_dataset[ID_COLUMN])
    if not all_ids:
        return "No examples available in dataset."

    unseen_rows = [
        row for row, doc_id in enumerate(all_ids) if doc_id not in seen_ids
    ]
    if not unseen_rows:
        # Everything has been shown: forget the history and start again.
        seen_ids.clear()
        unseen_rows = range(len(all_ids))

    item_idx = random.choice(unseen_rows)
    random_id = all_ids[item_idx]
    seen_ids.add(random_id)
    item = current_dataset[item_idx]

    text = item.get(TEXT_COLUMN, "")
    score = item.get(SCORE_COLUMN, "N/A")

    parts = [f"**Document ID:** {random_id} | **Score:** {score}\n\n"]
    if show_details:
        # (label, column) pairs for the optional sections, in display order.
        detail_fields = (
            ("Summary", SUMMARY_COLUMN),
            ("Justification", JUSTIFICATION_COLUMN),
            ("Thinking Process", THINKING_COLUMN),
        )
        for label, column in detail_fields:
            value = item.get(column, "")
            if value:
                parts.append(f"**{label}:** {value}\n\n")
        model = item.get(MODEL_COLUMN, "")
        if model:
            # The date is only shown alongside a model name.
            date = item.get(DATE_COLUMN, "")
            parts.append(f"**Model:** {model} | **Date:** {date}\n\n")
    parts.append(f"**Text:**\n{text}")

    return "".join(parts)
|
|
|
def build_interface():
    """Build the Gradio Blocks UI for inspecting the annotated datasets.

    Layout: a dataset dropdown with a load button, a status line, a global
    "show annotation details" checkbox, one tab for random sampling and one
    tab per score value 0-5.

    Returns:
        The configured ``gr.Blocks`` app; the caller is expected to launch it.
    """
    # NOTE(review): the emoji in the title and tab labels were restored from
    # mis-decoded text; confirm the exact glyphs against the upstream file.
    default_dataset = "Main Dataset"  # preselected and loaded on page load

    with gr.Blocks(theme="default", title="Dataset Inspector") as demo:
        gr.Markdown("# 🔍 Expert Content Classification Dataset Inspector")

        with gr.Row():
            with gr.Column(scale=2):
                dataset_dropdown = gr.Dropdown(
                    choices=list(DATASETS.keys()),
                    label="Select Dataset",
                    value=default_dataset,
                )
            with gr.Column(scale=1):
                load_btn = gr.Button("Load Dataset", variant="primary")

        status_display = gr.Markdown("")

        with gr.Row():
            show_details_global = gr.Checkbox(
                label="Show annotation details (summary, justification, thinking)",
                value=False,
            )

        with gr.Tabs():
            with gr.Tab("🎲 Random Sampling"):
                gr.Markdown("Sample random examples you haven't seen before")
                with gr.Row():
                    sample_btn = gr.Button(
                        "Get Random Example", variant="secondary", size="lg"
                    )
                random_output = gr.Markdown("")

            # One browsing tab per score value 0..5.
            for score in range(6):
                with gr.Tab(f"⭐ Score {score}"):
                    gr.Markdown(f"Browse examples with quality score {score}")
                    with gr.Row():
                        n_examples = gr.Slider(
                            minimum=1,
                            maximum=20,
                            value=3,
                            step=1,
                            label="Number of examples",
                        )
                        show_btn = gr.Button(
                            f"Show Score {score} Examples", variant="secondary"
                        )

                    score_output = gr.Markdown("")

                    # `s=score` binds this tab's score at definition time;
                    # a plain closure would see only the loop's final value.
                    # int() guards against a non-integer slider value.
                    show_btn.click(
                        fn=lambda n, details, s=score: get_examples_by_score(
                            s, int(n), details
                        ),
                        inputs=[n_examples, show_details_global],
                        outputs=score_output,
                    )

        load_btn.click(
            fn=load_selected_dataset,
            inputs=dataset_dropdown,
            outputs=status_display,
        )

        sample_btn.click(
            fn=get_random_unseen_example,
            inputs=show_details_global,
            outputs=random_output,
        )

        # Load the default dataset as soon as the page opens.
        demo.load(
            fn=lambda: load_selected_dataset(default_dataset),
            outputs=status_display,
        )

    return demo
|
|
|
if __name__ == "__main__":
    # Build and serve the UI only when run as a script, not on import.
    demo = build_interface()
    demo.launch()