# Source: Hugging Face Space file `app.py`, author sumuks (HF Staff).
# Last commit: "Update app.py" (5455df8, verified).
import gradio as gr
from datasets import load_dataset
import random
# Available datasets: maps the label shown in the UI dropdown to the
# dataset identifier that is passed to `load_dataset`.
DATASETS = {
    "Main Dataset": "sumuks/fineweb-10BT-annotated",
    "Ablation Dataset": "sumuks/fineweb-10BT-annotated-ablation-1"
}
# Split requested from `load_dataset` for every dataset.
SPLIT = "train"
# Column names (from build.py)
SCORE_COLUMN = "score"  # value the per-score tabs filter on
TEXT_COLUMN = "text"  # document body shown under "Text"
ID_COLUMN = "id"  # identifier used to track already-seen examples
SUMMARY_COLUMN = "summary"
JUSTIFICATION_COLUMN = "justification"
THINKING_COLUMN = "thinking"
MODEL_COLUMN = "annotation_model"
DATE_COLUMN = "annotation_date"
# Global state (module-level, rebound by load_selected_dataset)
current_dataset = None  # the loaded dataset, or None when nothing is loaded
dataset_name = None  # identifier of the most recently requested dataset
seen_ids = set()  # IDs already returned by get_random_unseen_example
def load_selected_dataset(selected_dataset):
    """Load the dataset chosen in the UI and return a status message.

    Rebinds the module-level ``current_dataset``, ``dataset_name`` and
    ``seen_ids``. The set of seen IDs is reset on every load attempt so
    random sampling starts fresh for the newly selected dataset.

    Args:
        selected_dataset: A key of ``DATASETS`` (the dropdown label).

    Returns:
        A Markdown string: a success line with the number of loaded
        examples, or a failure line describing what went wrong.
    """
    global current_dataset, dataset_name, seen_ids

    if selected_dataset not in DATASETS:
        # E.g. the dropdown was cleared: report it instead of raising
        # KeyError, and leave the currently loaded dataset untouched.
        return f"❌ Unknown dataset selection: {selected_dataset!r}"

    dataset_name = DATASETS[selected_dataset]
    seen_ids = set()  # Reset seen examples when switching datasets

    try:
        current_dataset = load_dataset(dataset_name, split=SPLIT)
    except Exception as e:
        # UI boundary: any load failure (network, auth, missing repo, ...)
        # is surfaced as text rather than crashing the event handler.
        current_dataset = None
        return f"❌ Failed to load {dataset_name}: {e}"

    return f"✅ Loaded {len(current_dataset)} examples from {dataset_name}"
def get_examples_by_score(score: int, n_examples: int = 5, show_details: bool = False):
    """Return Markdown for a random sample of examples with a given score.

    Args:
        score: Value that ``SCORE_COLUMN`` must equal for a row to qualify.
        n_examples: Maximum number of examples to show. Values coming from
            a UI slider may be floats; they are truncated to ``int`` and
            clamped to the range ``0..len(matching rows)``.
        show_details: When true, non-empty summary, justification, thinking
            and model/date annotations are included for each example.

    Returns:
        A Markdown string with one section per sampled example (separated
        by horizontal rules), or a short message when no dataset is loaded
        or no row has the requested score.
    """
    if current_dataset is None:
        return "Please select and load a dataset first."

    subset = current_dataset.filter(lambda row: row.get(SCORE_COLUMN) == score)
    available = len(subset)
    if available == 0:
        return "No examples found for this score."

    # random.sample requires a non-negative int no larger than the
    # population, so coerce and clamp whatever the slider delivered.
    sample_size = max(0, min(available, int(n_examples)))

    rendered = []
    # Randomly sample indices instead of taking the first n
    for idx in random.sample(range(available), sample_size):
        item = subset[idx]
        example_id = item.get(ID_COLUMN, "Unknown")
        text = item.get(TEXT_COLUMN, "")

        parts = [f"**Document ID:** {example_id}\n\n"]
        if show_details:
            summary = item.get(SUMMARY_COLUMN, "")
            justification = item.get(JUSTIFICATION_COLUMN, "")
            thinking = item.get(THINKING_COLUMN, "")
            model = item.get(MODEL_COLUMN, "")
            date = item.get(DATE_COLUMN, "")
            # Each annotation is shown only when it is non-empty.
            if summary:
                parts.append(f"**Summary:** {summary}\n\n")
            if justification:
                parts.append(f"**Justification:** {justification}\n\n")
            if thinking:
                parts.append(f"**Thinking Process:** {thinking}\n\n")
            if model:
                parts.append(f"**Model:** {model} | **Date:** {date}\n\n")
        parts.append(f"**Text:**\n{text}\n\n---\n")

        rendered.append("".join(parts))

    return "\n".join(rendered)
def get_random_unseen_example(show_details: bool = False):
    """Return Markdown for one random example that has not been shown yet.

    IDs of returned examples are recorded in the module-level ``seen_ids``
    set. Once every ID has been seen, the set is cleared and sampling
    starts over.

    Args:
        show_details: When true, non-empty summary, justification, thinking
            and model/date annotations are included in the output.

    Returns:
        A Markdown string describing the chosen example, or a short message
        when no dataset is loaded or the dataset has no rows.
    """
    if current_dataset is None:
        return "Please select and load a dataset first."

    # Read the ID column once and work with row positions. This avoids a
    # second column read plus a linear `.index()` lookup, and lets rows
    # that share an ID still be selected.
    ids = list(current_dataset[ID_COLUMN])
    unseen_rows = [row for row, doc_id in enumerate(ids) if doc_id not in seen_ids]

    if not unseen_rows:
        # Reset if we've seen everything
        seen_ids.clear()
        unseen_rows = range(len(ids))
    if not unseen_rows:
        return "No examples available in dataset."

    # Pick a random unseen row and remember its ID
    item_idx = random.choice(unseen_rows)
    random_id = ids[item_idx]
    seen_ids.add(random_id)
    item = current_dataset[item_idx]

    text = item.get(TEXT_COLUMN, "")
    score = item.get(SCORE_COLUMN, "N/A")

    parts = [f"**Document ID:** {random_id} | **Score:** {score}\n\n"]
    if show_details:
        summary = item.get(SUMMARY_COLUMN, "")
        justification = item.get(JUSTIFICATION_COLUMN, "")
        thinking = item.get(THINKING_COLUMN, "")
        model = item.get(MODEL_COLUMN, "")
        date = item.get(DATE_COLUMN, "")
        # Each annotation is shown only when it is non-empty.
        if summary:
            parts.append(f"**Summary:** {summary}\n\n")
        if justification:
            parts.append(f"**Justification:** {justification}\n\n")
        if thinking:
            parts.append(f"**Thinking Process:** {thinking}\n\n")
        if model:
            parts.append(f"**Model:** {model} | **Date:** {date}\n\n")
    parts.append(f"**Text:**\n{text}")

    return "".join(parts)
def build_interface():
    """Build the Gradio Blocks UI for inspecting the annotated datasets.

    The interface has a dataset selector with a load button, a status
    line, a global "show annotation details" checkbox, one tab for random
    unseen sampling, and one tab per quality score (0 through 5).

    Returns:
        The configured ``gr.Blocks`` app (not yet launched).
    """
    # Single source for the dropdown default and the startup load.
    default_dataset = "Main Dataset"

    with gr.Blocks(theme="default", title="Dataset Inspector") as demo:
        gr.Markdown("# 📊 Expert Content Classification Dataset Inspector")

        with gr.Row():
            with gr.Column(scale=2):
                dataset_dropdown = gr.Dropdown(
                    choices=list(DATASETS.keys()),
                    label="Select Dataset",
                    value=default_dataset,
                )
            with gr.Column(scale=1):
                load_btn = gr.Button("Load Dataset", variant="primary")

        status_display = gr.Markdown("")

        with gr.Row():
            show_details_global = gr.Checkbox(
                label="Show annotation details (summary, justification, thinking)",
                value=False,
            )

        with gr.Tabs():
            # Random sampling tab
            with gr.Tab("🎲 Random Sampling"):
                gr.Markdown("Sample random examples you haven't seen before")
                with gr.Row():
                    sample_btn = gr.Button(
                        "Get Random Example", variant="secondary", size="lg"
                    )
                random_output = gr.Markdown("")

            # Score-based browsing tabs
            for score in range(6):
                with gr.Tab(f"⭐ Score {score}"):
                    gr.Markdown(f"Browse examples with quality score {score}")
                    with gr.Row():
                        n_examples = gr.Slider(
                            minimum=1,
                            maximum=20,
                            value=3,
                            step=1,
                            label="Number of examples",
                        )
                        show_btn = gr.Button(
                            f"Show Score {score} Examples", variant="secondary"
                        )
                    score_output = gr.Markdown("")

                    # Set up the click handler for this score. `s=score`
                    # binds the current loop value; a plain closure would
                    # see only the final value of `score`.
                    show_btn.click(
                        fn=lambda n, details, s=score: get_examples_by_score(s, n, details),
                        inputs=[n_examples, show_details_global],
                        outputs=score_output,
                    )

        # Event handlers
        load_btn.click(
            fn=load_selected_dataset,
            inputs=dataset_dropdown,
            outputs=status_display,
        )
        sample_btn.click(
            fn=get_random_unseen_example,
            inputs=show_details_global,
            outputs=random_output,
        )

        # Load default dataset on startup
        demo.load(
            fn=lambda: load_selected_dataset(default_dataset),
            outputs=status_display,
        )

    return demo
# Build and launch the app only when this file is executed as a script,
# so importing the module does not start a server.
if __name__ == "__main__":
    demo = build_interface()
    demo.launch()