# NOTE: the lines below are residue from a web file viewer, kept as comments
# so the module parses; they are not part of the program.
# File size: 7,181 Bytes
# Blame gutter: commits 641f594 and 5455df8 alternating, then 0043673, 5455df8
# Line-number gutter: 1-212
import gradio as gr
from datasets import load_dataset
import random
# Datasets offered in the UI: dropdown label -> identifier handed to
# `load_dataset`.
DATASETS = {
    "Main Dataset": "sumuks/fineweb-10BT-annotated",
    "Ablation Dataset": "sumuks/fineweb-10BT-annotated-ablation-1",
}

# Split requested when a dataset is loaded.
SPLIT = "train"

# Column names (from build.py)
SCORE_COLUMN = "score"
TEXT_COLUMN = "text"
ID_COLUMN = "id"
SUMMARY_COLUMN = "summary"
JUSTIFICATION_COLUMN = "justification"
THINKING_COLUMN = "thinking"
MODEL_COLUMN = "annotation_model"
DATE_COLUMN = "annotation_date"

# Module-level state shared by the UI callbacks.
# Nothing is loaded at import time, so both the dataset object and its name
# start out as None.
current_dataset = dataset_name = None
# IDs of examples already shown by the random sampler.
seen_ids = set()
def load_selected_dataset(selected_dataset):
    """Load the dataset chosen in the UI into the module-level state.

    On a valid selection this rebinds the globals ``dataset_name``,
    ``seen_ids`` (reset to an empty set) and ``current_dataset`` (the loaded
    dataset, or ``None`` if loading failed).

    Args:
        selected_dataset: A key of ``DATASETS`` (the dropdown label).

    Returns:
        A status string for display: a success message with the number of
        loaded examples, or a failure message with the error text.
    """
    global current_dataset, dataset_name, seen_ids

    # An unexpected selection (e.g. a cleared dropdown) is reported instead of
    # surfacing a raw KeyError; the currently loaded state is left untouched.
    if selected_dataset not in DATASETS:
        return f"\N{CROSS MARK} Unknown dataset selection: {selected_dataset!r}"

    dataset_name = DATASETS[selected_dataset]
    seen_ids = set()  # Reset seen examples when switching datasets

    # NOTE(review): the status glyphs were mis-encoded in the source text and
    # have been restored as a check mark / cross mark -- confirm.
    try:
        current_dataset = load_dataset(dataset_name, split=SPLIT)
    except Exception as e:
        # Broad catch on purpose: this is the UI boundary, and any loading
        # failure should become a status message rather than a crash.
        current_dataset = None
        return f"\N{CROSS MARK} Failed to load {dataset_name}: {e}"
    return (
        f"\N{WHITE HEAVY CHECK MARK} Loaded {len(current_dataset)} "
        f"examples from {dataset_name}"
    )
def get_examples_by_score(score: int, n_examples: int = 5, show_details: bool = False):
    """Return Markdown for a random sample of examples with a given score.

    Args:
        score: Value the score column must equal for a row to be eligible.
        n_examples: Maximum number of examples to show. It is coerced to
            ``int`` because the value may arrive as a float from the UI
            slider; values below 1 yield an empty string.
        show_details: When true, the non-empty annotation fields (summary,
            justification, thinking, model/date) are shown before the text.

    Returns:
        A Markdown string with one section per sampled example, or a plain
        message when no dataset is loaded or no row has the requested score.
    """
    if current_dataset is None:
        return "Please select and load a dataset first."

    subset = current_dataset.filter(lambda x: x.get(SCORE_COLUMN) == score)
    total_available = len(subset)
    if total_available == 0:
        return "No examples found for this score."

    # random.sample() rejects non-integer and negative counts, so normalise
    # the requested amount into the range [0, total_available].
    n = max(0, min(total_available, int(n_examples)))

    # (label, column) pairs rendered, in this order, when details are shown.
    detail_fields = (
        ("Summary", SUMMARY_COLUMN),
        ("Justification", JUSTIFICATION_COLUMN),
        ("Thinking Process", THINKING_COLUMN),
    )

    examples_text = []
    # Randomly sample indices instead of taking the first n
    for idx in random.sample(range(total_available), n):
        item = subset[idx]
        parts = [f"**Document ID:** {item.get(ID_COLUMN, 'Unknown')}\n\n"]
        if show_details:
            for label, column in detail_fields:
                value = item.get(column, "")
                if value:
                    parts.append(f"**{label}:** {value}\n\n")
            model = item.get(MODEL_COLUMN, "")
            if model:
                # The date is only shown alongside a non-empty model name.
                date = item.get(DATE_COLUMN, "")
                parts.append(f"**Model:** {model} | **Date:** {date}\n\n")
        parts.append(f"**Text:**\n{item.get(TEXT_COLUMN, '')}\n\n---\n")
        examples_text.append("".join(parts))
    return "\n".join(examples_text)
def get_random_unseen_example(show_details: bool = False):
    """Return Markdown for one random example not shown before.

    The ID of the chosen example is added to the module-level ``seen_ids``
    set. Once every ID has been seen, the set is cleared and sampling starts
    over from the full dataset.

    Args:
        show_details: When true, the non-empty annotation fields (summary,
            justification, thinking, model/date) are shown before the text.

    Returns:
        A Markdown string describing the example, or a plain message when no
        dataset is loaded or the dataset has no rows.
    """
    if current_dataset is None:
        return "Please select and load a dataset first."

    # Read the ID column once and collect the row positions whose ID has not
    # been shown yet; this avoids a second column read and a linear
    # ``.index()`` search to map the chosen ID back to its row.
    all_ids = current_dataset[ID_COLUMN]
    unseen_rows = [
        row for row, example_id in enumerate(all_ids) if example_id not in seen_ids
    ]
    if not unseen_rows:
        # Reset if we've seen everything
        seen_ids.clear()
        unseen_rows = list(range(len(all_ids)))
    if not unseen_rows:
        return "No examples available in dataset."

    # Pick a random unseen row and remember its ID
    item_idx = random.choice(unseen_rows)
    random_id = all_ids[item_idx]
    seen_ids.add(random_id)
    item = current_dataset[item_idx]

    # Build display
    score = item.get(SCORE_COLUMN, "N/A")
    parts = [f"**Document ID:** {random_id} | **Score:** {score}\n\n"]
    if show_details:
        detail_fields = (
            ("Summary", SUMMARY_COLUMN),
            ("Justification", JUSTIFICATION_COLUMN),
            ("Thinking Process", THINKING_COLUMN),
        )
        for label, column in detail_fields:
            value = item.get(column, "")
            if value:
                parts.append(f"**{label}:** {value}\n\n")
        model = item.get(MODEL_COLUMN, "")
        if model:
            # The date is only shown alongside a non-empty model name.
            date = item.get(DATE_COLUMN, "")
            parts.append(f"**Model:** {model} | **Date:** {date}\n\n")
    parts.append(f"**Text:**\n{item.get(TEXT_COLUMN, '')}")
    return "".join(parts)
def build_interface():
    """Assemble the Gradio Blocks UI and wire up its callbacks.

    The page has a dataset selector with a load button, a status line, a
    global "show details" checkbox, one tab for random sampling and one tab
    per score value 0-5. The default dataset is also loaded through the
    ``demo.load`` event.

    Returns:
        The ``gr.Blocks`` instance; the caller is responsible for launching it.
    """
    # Single source for the dropdown's initial value and the startup load.
    default_dataset = "Main Dataset"

    # NOTE(review): the original indentation was lost, so the nesting of the
    # layout containers below is a reconstruction -- confirm the layout.
    # NOTE(review): the emoji in the title and tab labels were mis-encoded in
    # the source text; the die and star follow from the garbled bytes, the
    # magnifying glass in the title is a best guess -- confirm.
    with gr.Blocks(theme="default", title="Dataset Inspector") as demo:
        gr.Markdown(
            "# \N{LEFT-POINTING MAGNIFYING GLASS} "
            "Expert Content Classification Dataset Inspector"
        )

        with gr.Row():
            with gr.Column(scale=2):
                dataset_dropdown = gr.Dropdown(
                    choices=list(DATASETS.keys()),
                    label="Select Dataset",
                    value=default_dataset,
                )
            with gr.Column(scale=1):
                load_btn = gr.Button("Load Dataset", variant="primary")

        status_display = gr.Markdown("")

        with gr.Row():
            show_details_global = gr.Checkbox(
                label="Show annotation details (summary, justification, thinking)",
                value=False,
            )

        with gr.Tabs():
            # Random sampling tab
            with gr.Tab("\N{GAME DIE} Random Sampling"):
                gr.Markdown("Sample random examples you haven't seen before")
                with gr.Row():
                    sample_btn = gr.Button(
                        "Get Random Example", variant="secondary", size="lg"
                    )
                random_output = gr.Markdown("")

            # Score-based browsing tabs
            for score in range(6):
                with gr.Tab(f"\N{WHITE MEDIUM STAR} Score {score}"):
                    gr.Markdown(f"Browse examples with quality score {score}")
                    with gr.Row():
                        n_examples = gr.Slider(
                            minimum=1,
                            maximum=20,
                            value=3,
                            step=1,
                            label="Number of examples",
                        )
                        show_btn = gr.Button(
                            f"Show Score {score} Examples", variant="secondary"
                        )
                    score_output = gr.Markdown("")

                    # Set up the click handler for this score. ``s=score``
                    # binds the current loop value; a plain closure would see
                    # only the final value of ``score``.
                    show_btn.click(
                        fn=lambda n, details, s=score: get_examples_by_score(s, n, details),
                        inputs=[n_examples, show_details_global],
                        outputs=score_output,
                    )

        # Event handlers
        load_btn.click(
            fn=load_selected_dataset,
            inputs=dataset_dropdown,
            outputs=status_display,
        )
        sample_btn.click(
            fn=get_random_unseen_example,
            inputs=show_details_global,
            outputs=random_output,
        )

        # Load default dataset on startup
        demo.load(
            fn=lambda: load_selected_dataset(default_dataset),
            outputs=status_display,
        )

    return demo
if __name__ == "__main__":
    # Build the interface and launch it only when executed as a script.
    demo = build_interface()
    demo.launch()