"""Gradio app for inspecting an annotated dataset by score or at random."""

import random

import gradio as gr
from datasets import load_dataset

# Available datasets: dropdown label -> Hugging Face Hub repository ID.
DATASETS = {
    "Main Dataset": "sumuks/fineweb-10BT-annotated",
    "Ablation Dataset": "sumuks/fineweb-10BT-annotated-ablation-1",
}
DEFAULT_DATASET = "Main Dataset"
SPLIT = "train"

# Column names (from build.py)
SCORE_COLUMN = "score"
TEXT_COLUMN = "text"
ID_COLUMN = "id"
SUMMARY_COLUMN = "summary"
JUSTIFICATION_COLUMN = "justification"
THINKING_COLUMN = "thinking"
MODEL_COLUMN = "annotation_model"
DATE_COLUMN = "annotation_date"

# Global state. These are module-level, so every caller in this process
# shares the same loaded dataset and the same "already seen" bookkeeping.
current_dataset = None
dataset_name = None
seen_ids = set()  # document IDs already returned by random sampling
_seen_rows = set()  # row indices already returned by random sampling


def load_selected_dataset(selected_dataset):
    """Load the dataset chosen in the dropdown and reset the sampling history.

    Args:
        selected_dataset: A key of ``DATASETS`` (the dropdown label).

    Returns:
        A Markdown status string describing success or failure. Load errors
        are reported in the string rather than raised, because the result is
        rendered directly in the UI.
    """
    global current_dataset, dataset_name, seen_ids, _seen_rows

    repo_id = DATASETS.get(selected_dataset)
    if repo_id is None:
        # Unknown label: leave the currently loaded dataset untouched.
        return f"❌ Unknown dataset selection: {selected_dataset!r}"

    dataset_name = repo_id
    # Reset seen examples when switching datasets
    seen_ids = set()
    _seen_rows = set()

    try:
        current_dataset = load_dataset(dataset_name, split=SPLIT)
    except Exception as e:  # UI boundary: surface any load failure as text.
        current_dataset = None
        return f"❌ Failed to load {dataset_name}: {e}"
    return f"✅ Loaded {len(current_dataset)} examples from {dataset_name}"


def _format_details(item, show_details):
    """Return the optional annotation section (summary, justification, ...).

    Empty or missing fields are skipped; the result is ``""`` when
    ``show_details`` is false.
    """
    if not show_details:
        return ""

    labelled_columns = (
        ("Summary", SUMMARY_COLUMN),
        ("Justification", JUSTIFICATION_COLUMN),
        ("Thinking Process", THINKING_COLUMN),
    )
    parts = []
    for label, column in labelled_columns:
        value = item.get(column, "")
        if value:
            parts.append(f"**{label}:** {value}\n\n")

    model = item.get(MODEL_COLUMN, "")
    if model:
        date = item.get(DATE_COLUMN, "")
        parts.append(f"**Model:** {model} | **Date:** {date}\n\n")
    return "".join(parts)


def _pick_unseen_row(total_rows):
    """Return a random row index in ``range(total_rows)`` not shown before.

    Once every row has been shown, the history is cleared and sampling
    starts over. ``total_rows`` must be positive.
    """
    if len(_seen_rows) >= total_rows:
        # Reset if we've seen everything
        _seen_rows.clear()
        seen_ids.clear()

    if 2 * len(_seen_rows) < total_rows:
        # Fewer than half the rows are used up, so a random draw is unseen
        # with probability > 1/2 and this loop ends after ~2 tries at most
        # on average, without enumerating the dataset.
        while True:
            candidate = random.randrange(total_rows)
            if candidate not in _seen_rows:
                return candidate

    # Mostly exhausted: enumerate what is left. Non-empty because
    # len(_seen_rows) < total_rows at this point.
    remaining = [i for i in range(total_rows) if i not in _seen_rows]
    return random.choice(remaining)


def get_examples_by_score(score: int, n_examples: int = 5, show_details: bool = False):
    """Return Markdown for up to ``n_examples`` random rows with ``score``.

    Args:
        score: Value the score column must equal.
        n_examples: Maximum number of examples to show. Coerced to a
            non-negative int because it arrives from a UI slider.
        show_details: Whether to include the annotation fields.

    Returns:
        The formatted examples joined by newlines, or a short message when
        no dataset is loaded or no row matches.
    """
    if current_dataset is None:
        return "Please select and load a dataset first."

    subset = current_dataset.filter(lambda x: x.get(SCORE_COLUMN) == score)
    total_available = len(subset)
    if total_available == 0:
        return "No examples found for this score."

    n = max(0, min(total_available, int(n_examples)))

    # Randomly sample indices instead of taking the first n
    examples_text = []
    for idx in random.sample(range(total_available), n):
        item = subset[idx]
        example_id = item.get(ID_COLUMN, "Unknown")
        text = item.get(TEXT_COLUMN, "")
        examples_text.append(
            f"**Document ID:** {example_id}\n\n"
            + _format_details(item, show_details)
            + f"**Text:**\n{text}\n\n---\n"
        )
    return "\n".join(examples_text)


def get_random_unseen_example(show_details: bool = False):
    """Return Markdown for one random row that has not been shown yet.

    Rows are tracked by index, so each click costs one row lookup instead
    of a scan of the whole ID column, and rows sharing an ID can each be
    shown. When all rows have been shown the history resets.

    Args:
        show_details: Whether to include the annotation fields.

    Returns:
        The formatted example, or a short message when no dataset is loaded
        or the dataset is empty.
    """
    if current_dataset is None:
        return "Please select and load a dataset first."

    total_rows = len(current_dataset)
    if total_rows == 0:
        return "No examples available in dataset."

    row_idx = _pick_unseen_row(total_rows)
    _seen_rows.add(row_idx)
    item = current_dataset[row_idx]

    # Extract data
    random_id = item.get(ID_COLUMN, "Unknown")
    seen_ids.add(random_id)
    text = item.get(TEXT_COLUMN, "")
    score = item.get(SCORE_COLUMN, "N/A")

    # Build display
    return (
        f"**Document ID:** {random_id} | **Score:** {score}\n\n"
        + _format_details(item, show_details)
        + f"**Text:**\n{text}"
    )


def build_interface():
    """Build and return the Gradio ``Blocks`` app (not yet launched).

    The UI has a dataset selector with a load button, a global checkbox
    toggling annotation details, one tab for random sampling, and one tab
    per score value 0-5.
    """
    with gr.Blocks(theme="default", title="Dataset Inspector") as demo:
        gr.Markdown("# 📊 Expert Content Classification Dataset Inspector")

        with gr.Row():
            with gr.Column(scale=2):
                dataset_dropdown = gr.Dropdown(
                    choices=list(DATASETS.keys()),
                    label="Select Dataset",
                    value=DEFAULT_DATASET,
                )
            with gr.Column(scale=1):
                load_btn = gr.Button("Load Dataset", variant="primary")

        status_display = gr.Markdown("")

        with gr.Row():
            show_details_global = gr.Checkbox(
                label="Show annotation details (summary, justification, thinking)",
                value=False,
            )

        with gr.Tabs():
            # Random sampling tab
            with gr.Tab("🎲 Random Sampling"):
                gr.Markdown("Sample random examples you haven't seen before")
                with gr.Row():
                    sample_btn = gr.Button(
                        "Get Random Example", variant="secondary", size="lg"
                    )
                random_output = gr.Markdown("")

            # Score-based browsing tabs
            for score in range(6):
                with gr.Tab(f"⭐ Score {score}"):
                    gr.Markdown(f"Browse examples with quality score {score}")
                    with gr.Row():
                        n_examples = gr.Slider(
                            minimum=1,
                            maximum=20,
                            value=3,
                            step=1,
                            label="Number of examples",
                        )
                        show_btn = gr.Button(
                            f"Show Score {score} Examples", variant="secondary"
                        )
                    score_output = gr.Markdown("")

                    # Set up the click handler for this score. `s=score`
                    # binds the current loop value; a plain closure would
                    # see only the final value of `score`.
                    show_btn.click(
                        fn=lambda n, details, s=score: get_examples_by_score(
                            s, n, details
                        ),
                        inputs=[n_examples, show_details_global],
                        outputs=score_output,
                    )

        # Event handlers
        load_btn.click(
            fn=load_selected_dataset,
            inputs=dataset_dropdown,
            outputs=status_display,
        )
        sample_btn.click(
            fn=get_random_unseen_example,
            inputs=show_details_global,
            outputs=random_output,
        )

        # Load default dataset on startup
        demo.load(
            fn=lambda: load_selected_dataset(DEFAULT_DATASET),
            outputs=status_display,
        )

    return demo


if __name__ == "__main__":
    demo = build_interface()
    demo.launch()