Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from datasets import load_dataset
|
3 |
+
from huggingface_hub import HfApi
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
# Initialize HF API
|
7 |
+
api = HfApi()
|
8 |
+
|
9 |
+
def get_branches(repo_id="lvwerra/fineweb-ultra"):
    """Return the names of every branch except ``main`` in a dataset repo.

    Args:
        repo_id: Hub identifier of the dataset repository to inspect.

    Returns:
        Branch names in descending lexicographic order, or an empty list
        when the lookup fails (the error is printed, not raised).
    """
    try:
        # ``repo_info(...).siblings`` enumerates the repository's files, not
        # its git refs; branches are exposed by the refs listing instead.
        refs = api.list_repo_refs(repo_id, repo_type="dataset")
        branches = [ref.name for ref in refs.branches if ref.name != "main"]
    except Exception as e:
        print(f"Error fetching branches: {e}")
        return []

    # Reverse lexicographic order; presumably branch names are timestamps, in
    # which case this puts the newest first -- confirm the naming scheme.
    branches.sort(reverse=True)
    return branches
|
20 |
+
|
21 |
+
def load_branch_data(repo_id, branch_name):
    """Fetch the ``train`` split of ``repo_id`` at revision ``branch_name``.

    Returns the loaded dataset. If loading raises, the error is printed and
    ``None`` is returned instead.
    """
    try:
        return load_dataset(repo_id, revision=branch_name, split="train")
    except Exception as err:
        print(f"Error loading branch {branch_name}: {err}")
    return None
|
29 |
+
|
30 |
+
def update_branch_dropdown():
    """Build a refreshed branch dropdown from the currently listed branches.

    The first listed branch is pre-selected; when no branch is returned the
    dropdown is emptied and relabelled.
    """
    available = get_branches()
    if not available:
        return gr.Dropdown(choices=[], value=None, label="No branches found")
    return gr.Dropdown(choices=available, value=available[0], label="Select Branch")
|
37 |
+
|
38 |
+
def load_dataset_for_branch(branch_name):
    """Load the dataset for ``branch_name`` and prepare the first sample.

    Args:
        branch_name: Branch (revision) to load; a falsy value means nothing
            is selected.

    Returns:
        A 4-tuple ``(dataset, slider, original_text, rephrased_text)``:
        the value for the dataset state, a ``gr.Slider`` update describing
        the valid index range, and the two texts of sample 0. When nothing
        can be shown the dataset is ``None`` and the slider is pinned to 0.
    """
    if not branch_name:
        return None, gr.Slider(maximum=0, value=0), "", ""

    dataset = load_branch_data("lvwerra/fineweb-ultra", branch_name)
    if dataset is None:
        return None, gr.Slider(maximum=0, value=0), "Error loading dataset", "Error loading dataset"

    if len(dataset) == 0:
        # An empty split has no sample 0 to show and would give the slider a
        # maximum of -1. Report it as "no data" and leave the state empty so
        # handlers that index the dataset take their no-dataset path.
        return None, gr.Slider(maximum=0, value=0), "No data available", "No data available"

    max_samples = len(dataset) - 1

    # Show the first sample; rows may carry the source text under either
    # "original_text" or "text".
    sample = dataset[0]
    original_text = sample.get("original_text", sample.get("text", "No original text found"))
    rephrased_text = sample.get("rephrased_text", "No rephrased text found")

    slider = gr.Slider(
        maximum=max_samples,
        value=0,
        step=1,
        label=f"Sample Index (0-{max_samples})",
    )
    return dataset, slider, original_text, rephrased_text
|
55 |
+
|
56 |
+
def update_sample(dataset, sample_idx):
    """Return the original and rephrased text of the selected sample.

    Args:
        dataset: Indexable collection of dict-like rows, or ``None`` when no
            dataset is loaded.
        sample_idx: Position chosen on the slider; may be an ``int``, a
            ``float`` or ``None``.

    Returns:
        ``(original_text, rephrased_text)``. Both are the placeholder
        ``"No data available"`` when there is no dataset or the index does
        not address an existing row.
    """
    no_data = ("No data available", "No data available")
    if dataset is None or sample_idx is None:
        return no_data

    # Convert before range-checking so the value that is validated is the
    # value that is used, and reject negatives rather than letting them wrap
    # around to the end of the dataset.
    index = int(sample_idx)
    if not 0 <= index < len(dataset):
        return no_data

    sample = dataset[index]
    # Rows may carry the source text under either "original_text" or "text".
    original_text = sample.get("original_text", sample.get("text", "No original text found"))
    rephrased_text = sample.get("rephrased_text", "No rephrased text found")

    return original_text, rephrased_text
|
66 |
+
|
67 |
+
def format_text_for_display(text, title):
    """Render ``text`` beneath a level-two Markdown heading made from ``title``."""
    heading = "## {}".format(title)
    return "{}\n\n{}".format(heading, text)
|
70 |
+
|
71 |
+
# Create Gradio interface
|
72 |
+
with gr.Blocks(title="Dataset Branch Viewer", theme=gr.themes.Soft()) as demo:
    # List the branches once while the UI is built. The result seeds both the
    # dropdown and the page-load handler, so they agree on the default branch
    # and the repository is queried a single time.
    initial_branches = get_branches()

    gr.Markdown("# Dataset Branch Viewer")
    gr.Markdown("Compare original and rephrased text samples from different dataset branches")

    # Holds the dataset loaded for the selected branch (None until one loads).
    dataset_state = gr.State(value=None)

    with gr.Row():
        with gr.Column(scale=1):
            refresh_btn = gr.Button("🔄 Refresh Branches", variant="secondary")
            branch_dropdown = gr.Dropdown(
                choices=initial_branches,
                value=initial_branches[0] if initial_branches else None,
                label="Select Branch",
                info="Choose a timestamp branch to view"
            )

            sample_slider = gr.Slider(
                minimum=0,
                maximum=0,
                value=0,
                step=1,
                label="Sample Index",
                info="Navigate through samples"
            )

    with gr.Row():
        gr.Markdown("### Sample Info")
        sample_info = gr.Markdown("Select a branch to start")

    with gr.Row():
        with gr.Column():
            original_display = gr.Markdown(
                "## Original Text\n\nSelect a branch and sample to view content",
                label="Original Text"
            )

        with gr.Column():
            rephrased_display = gr.Markdown(
                "## Rephrased Text\n\nSelect a branch and sample to view content",
                label="Rephrased Text"
            )

    def update_sample_info(dataset, sample_idx):
        """Return a Markdown summary line for the current slider position.

        Yields a placeholder instead of raising when no dataset is loaded or
        the index does not address an existing row.
        """
        if dataset is None:
            return "No dataset loaded"

        total_samples = len(dataset)
        # The slider can momentarily hold an index from a previously loaded,
        # longer dataset; validate before indexing.
        if sample_idx is None or not 0 <= int(sample_idx) < total_samples:
            return "No data available"

        current_sample = int(sample_idx)
        sample = dataset[current_sample]
        sample_id = sample.get("id", "Unknown")

        return f"**Sample {current_sample + 1} of {total_samples}** | ID: `{sample_id}`"

    # Event handlers
    refresh_btn.click(
        fn=update_branch_dropdown,
        outputs=[branch_dropdown]
    )

    branch_dropdown.change(
        fn=load_dataset_for_branch,
        inputs=[branch_dropdown],
        outputs=[dataset_state, sample_slider, original_display, rephrased_display]
    )

    sample_slider.change(
        fn=update_sample,
        inputs=[dataset_state, sample_slider],
        outputs=[original_display, rephrased_display]
    )

    # Update sample info when slider changes
    sample_slider.change(
        fn=update_sample_info,
        inputs=[dataset_state, sample_slider],
        outputs=[sample_info]
    )

    # Load initial data if branches exist
    if initial_branches:
        demo.load(
            fn=load_dataset_for_branch,
            inputs=[gr.State(initial_branches[0])],
            outputs=[dataset_state, sample_slider, original_display, rephrased_display]
        )

# Launch the app
if __name__ == "__main__":
    demo.launch()
|