lvwerra HF Staff committed on
Commit
aae972d
·
verified ·
1 Parent(s): 365f883

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -0
app.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from datasets import load_dataset
3
+ from huggingface_hub import HfApi
4
+ import pandas as pd
5
+
6
# Initialize HF API
# Single module-level Hub client; get_branches() issues its requests through it.
api = HfApi()
8
+
9
def get_branches(repo_id="lvwerra/fineweb-ultra"):
    """Return the names of all non-``main`` branches of a dataset repository.

    Args:
        repo_id: Hub dataset repository to inspect.

    Returns:
        Branch names in descending lexicographic order, or an empty list if
        the Hub request fails (the error is printed, not raised).
    """
    try:
        # Branches are git refs. ``repo_info(...).siblings`` enumerates the
        # repository's files, so it cannot be used to discover branches.
        refs = api.list_repo_refs(repo_id, repo_type="dataset")
    except Exception as e:
        # Best effort: callers treat an empty list as "no branches found".
        print(f"Error fetching branches: {e}")
        return []

    # Descending order puts the newest first, assuming branches are named by
    # timestamp as the UI text suggests.
    return sorted(
        (ref.name for ref in refs.branches if ref.name != "main"),
        reverse=True,
    )
20
+
21
def load_branch_data(repo_id, branch_name):
    """Load the ``train`` split of a dataset at a specific branch.

    Args:
        repo_id: Hub dataset repository id.
        branch_name: Branch name, passed to ``load_dataset`` as ``revision``.

    Returns:
        The loaded dataset, or ``None`` if loading fails (the error is
        printed, not raised).
    """
    try:
        return load_dataset(repo_id, revision=branch_name, split="train")
    except Exception as e:
        # Best effort: callers check for None and show an error message.
        print(f"Error loading branch {branch_name}: {e}")
        return None
29
+
30
def update_branch_dropdown():
    """Build a refreshed branch dropdown from the current branch list.

    Returns:
        A ``gr.Dropdown`` update: the first branch preselected when any
        branches exist, otherwise an empty dropdown labelled
        "No branches found".
    """
    branches = get_branches()
    if not branches:
        return gr.Dropdown(choices=[], value=None, label="No branches found")
    return gr.Dropdown(choices=branches, value=branches[0], label="Select Branch")
37
+
38
def load_dataset_for_branch(branch_name):
    """Load the dataset for the selected branch and show its first sample.

    Args:
        branch_name: Branch chosen in the dropdown; may be empty or ``None``.

    Returns:
        A 4-tuple ``(dataset, slider_update, original_text, rephrased_text)``.
        ``dataset`` is ``None`` when no branch is selected or loading failed.
    """
    if not branch_name:
        return None, gr.Slider(maximum=0, value=0), "", ""

    dataset = load_branch_data("lvwerra/fineweb-ultra", branch_name)
    if dataset is None:
        return None, gr.Slider(maximum=0, value=0), "Error loading dataset", "Error loading dataset"

    if len(dataset) == 0:
        # An empty split has no first sample: dataset[0] would raise
        # IndexError and the slider maximum would become -1.
        return dataset, gr.Slider(maximum=0, value=0), "No data available", "No data available"

    max_samples = len(dataset) - 1

    # Load first sample
    sample = dataset[0]
    # Fall back to the plain "text" column when "original_text" is absent.
    original_text = sample.get("original_text", sample.get("text", "No original text found"))
    rephrased_text = sample.get("rephrased_text", "No rephrased text found")

    slider = gr.Slider(
        maximum=max_samples,
        value=0,
        step=1,
        label=f"Sample Index (0-{max_samples})",
    )
    return dataset, slider, original_text, rephrased_text
55
+
56
def update_sample(dataset, sample_idx):
    """Return the original and rephrased text for the sample at ``sample_idx``.

    Args:
        dataset: Indexable collection of dict-like rows, or ``None`` when
            nothing is loaded.
        sample_idx: Slider value; may arrive as a float or ``None``.

    Returns:
        ``(original_text, rephrased_text)``. Both are "No data available"
        when there is no dataset or the index is missing or out of range.
    """
    no_data = ("No data available", "No data available")
    if dataset is None or sample_idx is None:
        return no_data

    idx = int(sample_idx)
    # Reject negative indices as well: they would silently wrap around to
    # the end of the dataset instead of signalling a bad position.
    if not 0 <= idx < len(dataset):
        return no_data

    sample = dataset[idx]
    # Fall back to the plain "text" column when "original_text" is absent.
    original_text = sample.get("original_text", sample.get("text", "No original text found"))
    rephrased_text = sample.get("rephrased_text", "No rephrased text found")

    return original_text, rephrased_text
66
+
67
def format_text_for_display(text, title):
    """Return ``text`` as Markdown placed under a level-2 ``title`` heading."""
    return f"## {title}\n\n{text}"
70
+
71
# Create Gradio interface
# Fetch the branch list once up front: every get_branches() call is a Hub
# request, and the result is needed for the dropdown choices, its default
# value, and the initial load below.
initial_branches = get_branches()

# NOTE(review): the layout nesting below is reconstructed; confirm it matches
# the intended row/column arrangement.
with gr.Blocks(title="Dataset Branch Viewer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Dataset Branch Viewer")
    gr.Markdown("Compare original and rephrased text samples from different dataset branches")

    # Store dataset in state
    dataset_state = gr.State(value=None)

    with gr.Row():
        with gr.Column(scale=1):
            refresh_btn = gr.Button("🔄 Refresh Branches", variant="secondary")
            branch_dropdown = gr.Dropdown(
                choices=initial_branches,
                value=initial_branches[0] if initial_branches else None,
                label="Select Branch",
                info="Choose a timestamp branch to view",
            )

            sample_slider = gr.Slider(
                minimum=0,
                maximum=0,
                value=0,
                step=1,
                label="Sample Index",
                info="Navigate through samples",
            )

    with gr.Row():
        gr.Markdown("### Sample Info")
        sample_info = gr.Markdown("Select a branch to start")

    with gr.Row():
        with gr.Column():
            original_display = gr.Markdown(
                "## Original Text\n\nSelect a branch and sample to view content",
                label="Original Text",
            )

        with gr.Column():
            rephrased_display = gr.Markdown(
                "## Rephrased Text\n\nSelect a branch and sample to view content",
                label="Rephrased Text",
            )

    def update_sample_info(dataset, sample_idx):
        """Return the "Sample i of N | ID" status line for the current sample."""
        if dataset is None:
            return "No dataset loaded"

        total_samples = len(dataset)
        current_sample = int(sample_idx) if sample_idx is not None else -1
        # Guard the lookup: an empty dataset or a stale slider value would
        # otherwise raise IndexError inside the event handler.
        if not 0 <= current_sample < total_samples:
            return "No data available"

        sample = dataset[current_sample]
        sample_id = sample.get("id", "Unknown")

        return f"**Sample {current_sample + 1} of {total_samples}** | ID: `{sample_id}`"

    # Event handlers
    refresh_btn.click(
        fn=update_branch_dropdown,
        outputs=[branch_dropdown],
    )

    # Loading a branch resets the slider to 0. If it was already 0 the
    # slider's change event may not fire, so refresh the info line explicitly
    # once the dataset is in state.
    branch_dropdown.change(
        fn=load_dataset_for_branch,
        inputs=[branch_dropdown],
        outputs=[dataset_state, sample_slider, original_display, rephrased_display],
    ).then(
        fn=update_sample_info,
        inputs=[dataset_state, sample_slider],
        outputs=[sample_info],
    )

    sample_slider.change(
        fn=update_sample,
        inputs=[dataset_state, sample_slider],
        outputs=[original_display, rephrased_display],
    )

    # Update sample info when slider changes
    sample_slider.change(
        fn=update_sample_info,
        inputs=[dataset_state, sample_slider],
        outputs=[sample_info],
    )

    # Load initial data if branches exist
    if initial_branches:
        # The dropdown's default value is initial_branches[0], so it can feed
        # the initial load directly.
        demo.load(
            fn=load_dataset_for_branch,
            inputs=[branch_dropdown],
            outputs=[dataset_state, sample_slider, original_display, rephrased_display],
        ).then(
            fn=update_sample_info,
            inputs=[dataset_state, sample_slider],
            outputs=[sample_info],
        )
159
+
160
# Launch the app
# Only start the server when executed as a script, so importing this module
# (e.g. to reuse `demo`) does not launch it.
if __name__ == "__main__":
    demo.launch()