Space status: Runtime error
Update app.py
app.py (CHANGED):
```diff
@@ -6,7 +6,8 @@ from itertools import islice
 from functools import partial
 from multiprocessing.pool import ThreadPool
 from queue import Queue, Empty
-from typing import Callable, Iterable, Iterator, Optional, TypeVar
+from typing import Callable, Iterable, Iterator, Optional, TypeVar, List, Dict
+import datetime
 
 import gradio as gr
 import pandas as pd
```
```diff
@@ -18,12 +19,18 @@ model_id = "microsoft/Phi-3-mini-4k-instruct"
 client = InferenceClient(model_id)
 save_dataset_hf_token = os.environ.get("SAVE_DATASET_HF_TOKEN")
 
+AUTORUN_INTERVAL = 2  # Seconds between dataset generations
+MAX_AUTORUN_DATASETS = 1000  # Safety limit for infinite mode
 MAX_TOTAL_NB_ITEMS = 100  # almost infinite, don't judge me (actually it's because gradio needs a fixed number of components)
 MAX_NB_ITEMS_PER_GENERATION_CALL = 10
 NUM_ROWS = 100
 NUM_VARIANTS = 10
 NAMESPACE = "infinite-dataset-hub"
 URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"
+# Add these after existing state variables
+autorun_active = gr.State(False)
+accumulated_datasets = gr.State(pd.DataFrame())
+current_processing = gr.State(set())
 
 GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
     "A Machine Learning Practioner is looking for a dataset that matches '{search_query}'. "
```
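Review note: `autorun_active`, `accumulated_datasets`, and `current_processing` are `gr.State` components created at module level, but the `with gr.Blocks(css=css) as demo:` context only opens further down the file. Gradio components, state included, are normally instantiated inside a Blocks context so they get attached to the app; as written these three are likely orphaned, which fits the Runtime error status shown at the top. A minimal sketch of the safer placement, keeping the diff's variable names:

```python
# Sketch: declare the autorun state inside the Blocks context, next to the
# existing generated_texts_state, instead of at module level.
with gr.Blocks(css=css) as demo:
    generated_texts_state = gr.State((landing_page_datasets_generated_text,))
    autorun_active = gr.State(False)                 # AutoRun on/off flag
    accumulated_datasets = gr.State(pd.DataFrame())  # rows gathered so far
    current_processing = gr.State(set())             # in-flight work markers
```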
```diff
@@ -89,7 +96,23 @@ The dataset was generated using the [Infinite Dataset Hub]({url}) and {model_id}
 - **More Datasets**: https://huggingface.co/datasets?other=infinite-dataset-hub
 """
 
+
+
 css = """
+.autorun-section {
+    border: 1px solid var(--border-color-primary);
+    border-radius: 8px;
+    padding: 1rem;
+    margin-top: 1rem;
+}
+.compile-options {
+    margin-top: 1rem;
+}
+.download-prompt {
+    color: var(--color-accent);
+    font-weight: bold;
+    margin-top: 1rem;
+}
 a {
     color: var(--body-text-color);
 }
```
```diff
@@ -167,9 +190,10 @@ a {
 .settings button span {
     color: var(--body-text-color-subdued);
 }
-"""
 
 
+"""
+
 with gr.Blocks(css=css) as demo:
     generated_texts_state = gr.State((landing_page_datasets_generated_text,))
     with gr.Column() as search_page:
```
```diff
@@ -226,6 +250,32 @@ with gr.Blocks(css=css) as demo:
         dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
         back_button = gr.Button("< Back", size="sm")
 
+        with gr.Column(elem_classes="autorun-section") as autorun_section:
+            with gr.Row():
+                autorun_toggle = gr.Checkbox(label="AutoRun Mode", interactive=True)
+                autorun_status = gr.Markdown("**Status:** Inactive", elem_classes="status")
+
+            with gr.Row():
+                compile_mode = gr.Radio(
+                    ["Combine All", "Keep Separate"],
+                    label="Compilation Mode",
+                    value="Combine All"
+                )
+                processing_options = gr.CheckboxGroup(
+                    ["Clean Data", "Chunk Data", "Summarize Data"],
+                    label="Processing Options"
+                )
+
+            accumulated_display = gr.DataFrame(
+                label="Accumulated Data",
+                interactive=False,
+                wrap=True,
+                max_rows=50
+            )
+
+            with gr.Row():
+                download_btn = gr.DownloadButton("Download Dataset", visible=False)
+                stop_btn = gr.Button("Stop & Save", variant="stop", visible=False)
     ###################################
     #
     # Utils
```
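Review note: `max_rows` is a Gradio 3.x `gr.DataFrame` argument that was dropped in 4.x, so that line is version-sensitive. More importantly, nothing in this commit ever writes to `accumulated_display`; the accumulated rows only live in the `accumulated_datasets` state. Any handler that grows the state should also refresh the table, roughly like this hypothetical helper (assuming both components are in scope):

```python
def append_and_show(state_df: pd.DataFrame, new_rows: pd.DataFrame):
    # Return the combined frame twice: once for the State, once for the
    # visible gr.DataFrame, so the table tracks the accumulator.
    combined = pd.concat([state_df, new_rows], ignore_index=True)
    return combined, combined

# wired with: outputs=[accumulated_datasets, accumulated_display]
```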
```diff
@@ -267,7 +317,56 @@ with gr.Blocks(css=css) as demo:
                 continue
             break
 
+    def generate_single_dataset(search_query: str) -> pd.DataFrame:
+        """Generate one complete dataset from search query to parsed DataFrame"""
+        # Generate dataset names
+        dataset_lines = []
+        for line in gen_datasets_line_by_line(search_query):
+            dataset_lines.append(line)
+            if len(dataset_lines) >= MAX_NB_ITEMS_PER_GENERATION_CALL:
+                break
+
+        # Process first valid dataset
+        for line in dataset_lines:
+            if line.strip() and line.strip().split(".", 1)[0].isnumeric():
+                try:
+                    dataset_name, tags = line.strip().split(".", 1)[1].strip(" )").split(" (", 1)
+                    break
+                except ValueError:
+                    continue
+
+        # Generate dataset content
+        content = ""
+        for token in gen_dataset_content(search_query, dataset_name, tags):
+            content += token
+
+        # Parse to DataFrame
+        _, preview_df = parse_preview_df(content)
+        return preview_df
+
+    def process_dataset(df: pd.DataFrame, options: List[str]) -> pd.DataFrame:
+        """Apply processing options to dataset"""
+        # Clean
+        if 'Clean Data' in options:
+            df = df.dropna().drop_duplicates()
+
+        # Chunk
+        if 'Chunk Data' in options:
+            if len(df) > 10:
+                df = df.sample(frac=0.5)  # Simple chunking example
+
+        # Summarize
+        if 'Summarize Data' in options:
+            summary = pd.DataFrame({
+                'columns': df.columns,
+                'dtypes': df.dtypes.values,
+                'non_null_count': df.count().values
+            })
+            return summary
+
+        return df
 
+
     def gen_datasets_line_by_line(search_query: str, generated_texts: tuple[str] = ()) -> Iterator[str]:
         search_query = search_query or ""
         search_query = search_query[:1000] if search_query.strip() else ""
```
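Review note: `generate_single_dataset` has a sharp edge. If no generated line parses (the second `for` loop falls through without hitting `break`), `dataset_name` and `tags` are never bound and the `gen_dataset_content` call raises `NameError`. A defensive variant with the same parsing logic, failing soft instead (a sketch, not the committed code):

```python
def generate_single_dataset(search_query: str) -> pd.DataFrame:
    """Like the committed version, but returns an empty frame when no line parses."""
    dataset_name, tags = None, None
    for line in islice(gen_datasets_line_by_line(search_query), MAX_NB_ITEMS_PER_GENERATION_CALL):
        line = line.strip()
        if line and line.split(".", 1)[0].isnumeric():
            try:
                dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
                break
            except ValueError:
                continue
    if dataset_name is None:
        return pd.DataFrame()  # nothing parseable this round; caller can skip it
    content = "".join(gen_dataset_content(search_query, dataset_name, tags))
    _, preview_df = parse_preview_df(content)
    return preview_df
```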
```diff
@@ -506,6 +605,46 @@ with gr.Blocks(css=css) as demo:
             }
             current_item_idx += 1
 
+    def toggle_autorun(active: bool, current_df: pd.DataFrame) -> dict:
+        """Toggle autorun state and UI elements"""
+        new_state = not active
+        updates = {
+            autorun_toggle: gr.Checkbox(value=new_state),
+            autorun_status: gr.Markdown(f"**Status:** {'Active' if new_state else 'Inactive'}"),
+            stop_btn: gr.Button(visible=new_state),
+            download_btn: gr.DownloadButton(visible=not new_state),
+            accumulated_datasets: current_df  # Maintain current state
+        }
+        if new_state:  # Reset when starting new run
+            updates[accumulated_datasets] = pd.DataFrame()
+        return updates
+
+    def autorun_iteration(
+        search_query: str,
+        current_df: pd.DataFrame,
+        compile_mode: str,
+        process_opts: List[str]
+    ) -> pd.DataFrame:
+        """Single iteration of autorun dataset generation"""
+        try:
+            new_data = generate_single_dataset(search_query)
+            processed = process_dataset(new_data, process_opts)
+
+            if compile_mode == "Combine All" and not current_df.empty:
+                combined = pd.concat([current_df, processed])
+                return combined
+            return processed
+        except Exception as e:
+            print(f"Error in autorun iteration: {e}")
+            return current_df
+
+    def create_download_file(current_df: pd.DataFrame) -> dict:
+        """Prepare dataset for download"""
+        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+        filename = f"autorun-dataset-{timestamp}.csv"
+        current_df.to_csv(filename, index=False)
+        return gr.DownloadButton(label=f"Download {filename}", value=filename)
+
     def _show_dataset(search_query, dataset_name, tags):
         yield {
             search_page: gr.Column(visible=False),
```
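Review note: two small things in this hunk. `create_download_file` is annotated `-> dict` but actually returns a `gr.DownloadButton`. And `pd.concat([current_df, processed])` preserves each chunk's own 0-based index, so the accumulated frame collects duplicate index values; `ignore_index=True` keeps a clean running index:

```python
import pandas as pd

a = pd.DataFrame({"x": [1, 2]})
b = pd.DataFrame({"x": [3]})
print(pd.concat([a, b]).index.tolist())                     # [0, 1, 0]
print(pd.concat([a, b], ignore_index=True).index.tolist())  # [0, 1, 2]
```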
```diff
@@ -535,6 +674,29 @@ with gr.Blocks(css=css) as demo:
     }
     """
 
+    # Add these event bindings
+    autorun_toggle.change(
+        toggle_autorun,
+        inputs=[autorun_active, accumulated_datasets],
+        outputs=[autorun_toggle, autorun_status, stop_btn, download_btn, accumulated_datasets]
+    )
+
+    stop_btn.click(
+        fn=lambda: [
+            gr.Checkbox(value=False),
+            gr.Markdown("**Status:** Inactive"),
+            gr.Button(visible=False),
+            gr.DownloadButton(visible=True)
+        ],
+        outputs=[autorun_toggle, autorun_status, stop_btn, download_btn]
+    )
+
+    download_btn.click(
+        create_download_file,
+        inputs=accumulated_datasets,
+        outputs=download_btn
+    )
+
     def show_dataset_from_button(search_query, *buttons_values, i):
         dataset_name, tags = buttons_values[2 * i : 2 * i + 2]
         yield from _show_dataset(search_query, dataset_name, tags)
```
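Review note: the `autorun_toggle.change` binding has two subtle problems. The handler computes `new_state = not active` from the `autorun_active` State, but that State is absent from `outputs`, so it is never written back and goes stale after the first toggle (the `stop_btn.click` handler has the same gap: it unchecks the box but never resets the State). The handler also re-emits `gr.Checkbox(value=new_state)` into the very checkbox whose `.change` fired, which risks re-triggering the event. A plainer wiring treats the checkbox value as the source of truth and mirrors it into the State; a sketch, assuming the same component names and Gradio 4.x-style updates (returning fresh components, as the diff itself does):

```python
def on_toggle(checked: bool, current_df: pd.DataFrame):
    # Mirror the checkbox into the State rather than rewriting the checkbox.
    status = gr.Markdown(f"**Status:** {'Active' if checked else 'Inactive'}")
    df = pd.DataFrame() if checked else current_df  # reset when a run starts
    return checked, status, gr.Button(visible=checked), gr.DownloadButton(visible=not checked), df

autorun_toggle.change(
    on_toggle,
    inputs=[autorun_toggle, accumulated_datasets],
    outputs=[autorun_active, autorun_status, stop_btn, download_btn, accumulated_datasets],
)
```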
```diff
@@ -642,5 +804,23 @@ with gr.Blocks(css=css) as demo:
         yield {search_page: gr.Column(visible=True)}
 
 
-demo.launch()
+    def run_autorun():
+        while True:
+            if demo.autorun_active:
+                yield [
+                    autorun_iteration(
+                        demo.search_bar.value,
+                        demo.accumulated_datasets.value,
+                        demo.compile_mode.value,
+                        demo.processing_options.value
+                    ),
+                    gr.DataFrame(visible=True)
+                ]
+                time.sleep(AUTORUN_INTERVAL)
+            else:
+                yield [
+                    demo.accumulated_datasets.value,
+                    gr.DataFrame(visible=False)
+                ]
 
+demo.queue(concurrency_count=5).launch()
```
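Review note: `run_autorun` in this final hunk is never bound to any event, so as committed it is dead code, and if it were called, the `demo.autorun_active`, `demo.search_bar`, `demo.compile_mode`, and `demo.processing_options` accesses would raise `AttributeError`, since a `gr.Blocks` instance does not expose components as attributes by their Python variable names. The usual Gradio shape for a polling loop is a plain function re-run on an interval. A minimal sketch, assuming `search_bar` is the app's existing search textbox and using the `every=` option of `demo.load` (the Gradio 3/4 idiom; newer releases prefer `gr.Timer`):

```python
# (placed inside the existing `with gr.Blocks(css=css) as demo:` block)
def autorun_tick(active: bool, query: str, current_df: pd.DataFrame,
                 mode: str, opts: List[str]):
    # One poll: only grow the accumulator while AutoRun is on and under the cap.
    if not active or len(current_df) >= MAX_AUTORUN_DATASETS:
        return current_df, current_df
    updated = autorun_iteration(query, current_df, mode, opts)
    return updated, updated  # keep the State and the visible table in sync

demo.load(
    autorun_tick,
    inputs=[autorun_active, search_bar, accumulated_datasets, compile_mode, processing_options],
    outputs=[accumulated_datasets, accumulated_display],
    every=AUTORUN_INTERVAL,  # re-run every 2 seconds while a client is connected
)
```

This shape also removes the `while True` / `time.sleep` loop, which would otherwise pin a queue worker for the life of the session, and gives the otherwise-unused `MAX_AUTORUN_DATASETS` constant a job.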
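Review note: the launch line is version-sensitive. `demo.queue(concurrency_count=5)` is the Gradio 3.x signature; on Gradio 4 the parameter was renamed, so the keyword raises a `TypeError` at startup, which alone would reproduce the Runtime error status. A version-tolerant sketch:

```python
import gradio as gr

# Pick whichever queue argument the installed Gradio accepts.
if int(gr.__version__.split(".")[0]) >= 4:
    demo.queue(default_concurrency_limit=5).launch()
else:
    demo.queue(concurrency_count=5).launch()
```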