acecalisto3 committed on
Commit
dd12997
·
verified ·
1 Parent(s): 66507ca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +183 -3
app.py CHANGED
@@ -6,7 +6,8 @@ from itertools import islice
6
  from functools import partial
7
  from multiprocessing.pool import ThreadPool
8
  from queue import Queue, Empty
9
- from typing import Callable, Iterable, Iterator, Optional, TypeVar
 
10
 
11
  import gradio as gr
12
  import pandas as pd
@@ -18,12 +19,18 @@ model_id = "microsoft/Phi-3-mini-4k-instruct"
18
  client = InferenceClient(model_id)
19
  save_dataset_hf_token = os.environ.get("SAVE_DATASET_HF_TOKEN")
20
 
 
 
21
  MAX_TOTAL_NB_ITEMS = 100 # almost infinite, don't judge me (actually it's because gradio needs a fixed number of components)
22
  MAX_NB_ITEMS_PER_GENERATION_CALL = 10
23
  NUM_ROWS = 100
24
  NUM_VARIANTS = 10
25
  NAMESPACE = "infinite-dataset-hub"
26
  URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"
 
 
 
 
27
 
28
  GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
29
  "A Machine Learning Practioner is looking for a dataset that matches '{search_query}'. "
@@ -89,7 +96,23 @@ The dataset was generated using the [Infinite Dataset Hub]({url}) and {model_id}
89
  - **More Datasets**: https://huggingface.co/datasets?other=infinite-dataset-hub
90
  """
91
 
 
 
92
  css = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  a {
94
  color: var(--body-text-color);
95
  }
@@ -167,9 +190,10 @@ a {
167
  .settings button span {
168
  color: var(--body-text-color-subdued);
169
  }
170
- """
171
 
172
 
 
 
173
  with gr.Blocks(css=css) as demo:
174
  generated_texts_state = gr.State((landing_page_datasets_generated_text,))
175
  with gr.Column() as search_page:
@@ -226,6 +250,32 @@ with gr.Blocks(css=css) as demo:
226
  dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
227
  back_button = gr.Button("< Back", size="sm")
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  ###################################
230
  #
231
  # Utils
@@ -267,7 +317,56 @@ with gr.Blocks(css=css) as demo:
267
  continue
268
  break
269
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
 
271
  def gen_datasets_line_by_line(search_query: str, generated_texts: tuple[str] = ()) -> Iterator[str]:
272
  search_query = search_query or ""
273
  search_query = search_query[:1000] if search_query.strip() else ""
@@ -506,6 +605,46 @@ with gr.Blocks(css=css) as demo:
506
  }
507
  current_item_idx += 1
508
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
509
  def _show_dataset(search_query, dataset_name, tags):
510
  yield {
511
  search_page: gr.Column(visible=False),
@@ -535,6 +674,29 @@ with gr.Blocks(css=css) as demo:
535
  }
536
  """
537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
538
  def show_dataset_from_button(search_query, *buttons_values, i):
539
  dataset_name, tags = buttons_values[2 * i : 2 * i + 2]
540
  yield from _show_dataset(search_query, dataset_name, tags)
@@ -642,5 +804,23 @@ with gr.Blocks(css=css) as demo:
642
  yield {search_page: gr.Column(visible=True)}
643
 
644
 
645
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
646
 
 
 
6
import datetime
import tempfile
import time
from functools import partial
from multiprocessing.pool import ThreadPool
from queue import Queue, Empty
from typing import Callable, Iterable, Iterator, Optional, TypeVar, List, Dict

import gradio as gr
import pandas as pd
 
19
  client = InferenceClient(model_id)
20
  save_dataset_hf_token = os.environ.get("SAVE_DATASET_HF_TOKEN")
21
 
22
AUTORUN_INTERVAL = 2  # Seconds between dataset generations
MAX_AUTORUN_DATASETS = 1000  # Safety limit for infinite mode
MAX_TOTAL_NB_ITEMS = 100 # almost infinite, don't judge me (actually it's because gradio needs a fixed number of components)
MAX_NB_ITEMS_PER_GENERATION_CALL = 10
NUM_ROWS = 100
NUM_VARIANTS = 10
NAMESPACE = "infinite-dataset-hub"
URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"
# State holders for AutoRun mode.
# NOTE(review): these gr.State components are created at module level, outside
# the `with gr.Blocks()` context used later in the file — Gradio expects
# components to be instantiated inside a Blocks context; confirm these are
# actually registered with the app.
autorun_active = gr.State(False)
accumulated_datasets = gr.State(pd.DataFrame())
current_processing = gr.State(set())  # presumably tracks in-flight generations; not used in the visible code — verify
34
 
35
  GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
36
  "A Machine Learning Practioner is looking for a dataset that matches '{search_query}'. "
 
96
  - **More Datasets**: https://huggingface.co/datasets?other=infinite-dataset-hub
97
  """
98
 
99
+
100
+
101
  css = """
102
+ .autorun-section {
103
+ border: 1px solid var(--border-color-primary);
104
+ border-radius: 8px;
105
+ padding: 1rem;
106
+ margin-top: 1rem;
107
+ }
108
+ .compile-options {
109
+ margin-top: 1rem;
110
+ }
111
+ .download-prompt {
112
+ color: var(--color-accent);
113
+ font-weight: bold;
114
+ margin-top: 1rem;
115
+ }
116
  a {
117
  color: var(--body-text-color);
118
  }
 
190
  .settings button span {
191
  color: var(--body-text-color-subdued);
192
  }
 
193
 
194
 
195
+ """
196
+
197
  with gr.Blocks(css=css) as demo:
198
  generated_texts_state = gr.State((landing_page_datasets_generated_text,))
199
  with gr.Column() as search_page:
 
250
  dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
251
  back_button = gr.Button("< Back", size="sm")
252
 
253
# AutoRun control panel: toggle + status line, compilation/processing options,
# a live preview of the accumulated data, and download/stop actions.
with gr.Column(elem_classes="autorun-section") as autorun_section:
    with gr.Row():
        autorun_toggle = gr.Checkbox(label="AutoRun Mode", interactive=True)
        autorun_status = gr.Markdown("**Status:** Inactive", elem_classes="status")

    with gr.Row():
        # "Combine All" concatenates every generated batch into one frame;
        # "Keep Separate" keeps only the latest batch (see autorun_iteration).
        compile_mode = gr.Radio(
            ["Combine All", "Keep Separate"],
            label="Compilation Mode",
            value="Combine All"
        )
        processing_options = gr.CheckboxGroup(
            ["Clean Data", "Chunk Data", "Summarize Data"],
            label="Processing Options"
        )

    # NOTE(review): `max_rows` was removed from gr.DataFrame in Gradio 4
    # (row_count/height replaced it) — confirm the Gradio version pinned here.
    accumulated_display = gr.DataFrame(
        label="Accumulated Data",
        interactive=False,
        wrap=True,
        max_rows=50
    )

    with gr.Row():
        # Both start hidden; toggle_autorun / stop_btn.click swap visibility.
        download_btn = gr.DownloadButton("Download Dataset", visible=False)
        stop_btn = gr.Button("Stop & Save", variant="stop", visible=False)
279
  ###################################
280
  #
281
  # Utils
 
317
  continue
318
  break
319
 
320
def generate_single_dataset(search_query: str) -> pd.DataFrame:
    """Generate one complete dataset from a search query.

    Asks the model for a list of dataset ideas, picks the first line that
    parses as ``"<idx>. <name> (<tags>)"``, streams that dataset's content,
    and returns the parsed preview DataFrame.

    Returns an empty DataFrame when no generated line could be parsed
    (the original fell through with `dataset_name`/`tags` unbound and
    raised NameError in that case).
    """
    # Collect candidate lines, capped at one generation call's worth.
    dataset_lines = list(
        islice(gen_datasets_line_by_line(search_query), MAX_NB_ITEMS_PER_GENERATION_CALL)
    )

    # Pick the first line shaped like "1. Name (tag1, tag2)".
    dataset_name = tags = None
    for line in dataset_lines:
        stripped = line.strip()
        if stripped and stripped.split(".", 1)[0].isnumeric():
            try:
                dataset_name, tags = stripped.split(".", 1)[1].strip(" )").split(" (", 1)
                break
            except ValueError:
                # Line was numbered but lacked the "name (tags)" shape.
                continue
    if dataset_name is None:
        # Bug fix: avoid NameError when nothing parsed — yield no data instead.
        return pd.DataFrame()

    # Stream the dataset content tokens; join once instead of quadratic `+=`.
    content = "".join(gen_dataset_content(search_query, dataset_name, tags))

    # parse_preview_df returns a (prefix, DataFrame) pair; keep the frame.
    _, preview_df = parse_preview_df(content)
    return preview_df
346
+
347
def process_dataset(df: pd.DataFrame, options: List[str]) -> pd.DataFrame:
    """Apply the selected processing options to *df* and return the result.

    Options (applied in this order):
      - "Clean Data": drop rows containing NaN, then drop duplicate rows.
      - "Chunk Data": keep a random half of the rows when there are more
        than 10 (simple down-sampling stand-in for real chunking).
      - "Summarize Data": replace the data with a per-column summary
        (column name, dtype, non-null count) and return it immediately.
    """
    if 'Clean Data' in options:
        df = df.dropna().drop_duplicates()

    if 'Chunk Data' in options and len(df) > 10:
        # Bug fix: seed the sample so repeated runs over the same data are
        # reproducible (the original unseeded sample was nondeterministic).
        df = df.sample(frac=0.5, random_state=0)  # Simple chunking example

    if 'Summarize Data' in options:
        return pd.DataFrame({
            'columns': df.columns,
            'dtypes': df.dtypes.values,
            'non_null_count': df.count().values,
        })

    return df
368
 
369
+
370
  def gen_datasets_line_by_line(search_query: str, generated_texts: tuple[str] = ()) -> Iterator[str]:
371
  search_query = search_query or ""
372
  search_query = search_query[:1000] if search_query.strip() else ""
 
605
  }
606
  current_item_idx += 1
607
 
608
def toggle_autorun(active: bool, current_df: pd.DataFrame) -> dict:
    """Toggle autorun state and flip the related UI elements.

    Returns a component->update dict: checkbox value, status markdown,
    stop/download button visibility, and the accumulated-data State.
    """
    # NOTE(review): `active` is read from the `autorun_active` gr.State, but
    # no event output ever writes that State back, so this negates the same
    # stale value on every call — confirm the wiring updates the State.
    new_state = not active
    updates = {
        autorun_toggle: gr.Checkbox(value=new_state),
        autorun_status: gr.Markdown(f"**Status:** {'Active' if new_state else 'Inactive'}"),
        stop_btn: gr.Button(visible=new_state),  # stop only shown while running
        download_btn: gr.DownloadButton(visible=not new_state),  # download only when idle
        accumulated_datasets: current_df  # Maintain current state
    }
    if new_state:  # Reset when starting new run
        updates[accumulated_datasets] = pd.DataFrame()
    return updates
621
+
622
def autorun_iteration(
    search_query: str,
    current_df: pd.DataFrame,
    compile_mode: str,
    process_opts: List[str]
) -> pd.DataFrame:
    """Run one autorun step: generate a dataset, post-process it, merge.

    In "Combine All" mode the new batch is appended to *current_df*;
    otherwise only the latest batch is kept. On any failure the current
    accumulation is returned unchanged, so a transient generation error
    never wipes data collected so far.
    """
    try:
        new_data = generate_single_dataset(search_query)
        processed = process_dataset(new_data, process_opts)

        if compile_mode == "Combine All" and not current_df.empty:
            # Bug fix: reset the index so concatenated batches don't carry
            # overlapping row labels (original kept duplicate indices).
            return pd.concat([current_df, processed], ignore_index=True)
        return processed
    except Exception as e:
        # Best-effort loop: log and keep what we have accumulated.
        print(f"Error in autorun iteration: {e}")
        return current_df
640
+
641
def create_download_file(current_df: pd.DataFrame) -> dict:
    """Write the accumulated dataset to a timestamped CSV and repoint the
    download button at it.

    Returns a gr.DownloadButton update whose ``value`` is the CSV path.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    # Bug fix: write into the system temp dir instead of the app's working
    # directory, which may be read-only on Spaces and pollutes the repo.
    filename = os.path.join(tempfile.gettempdir(), f"autorun-dataset-{timestamp}.csv")
    current_df.to_csv(filename, index=False)
    # Bug fix: the original label was the placeholder f-string
    # "Download (unknown)" with no placeholder; show the actual row count.
    return gr.DownloadButton(label=f"Download ({len(current_df)} rows)", value=filename)
647
+
648
  def _show_dataset(search_query, dataset_name, tags):
649
  yield {
650
  search_page: gr.Column(visible=False),
 
674
  }
675
  """
676
 
677
# Event bindings for the AutoRun controls.
# NOTE(review): toggle_autorun reads `autorun_active` but no output here ever
# writes that State back, so it stays at its initial value — verify the
# intended wiring.
autorun_toggle.change(
    toggle_autorun,
    inputs=[autorun_active, accumulated_datasets],
    outputs=[autorun_toggle, autorun_status, stop_btn, download_btn, accumulated_datasets]
)

# Stop & Save: unconditionally reset the controls to the idle layout
# (checkbox off, status Inactive, stop hidden, download shown).
stop_btn.click(
    fn=lambda: [
        gr.Checkbox(value=False),
        gr.Markdown("**Status:** Inactive"),
        gr.Button(visible=False),
        gr.DownloadButton(visible=True)
    ],
    outputs=[autorun_toggle, autorun_status, stop_btn, download_btn]
)

# Write the accumulated DataFrame to CSV and point the button at the file.
download_btn.click(
    create_download_file,
    inputs=accumulated_datasets,
    outputs=download_btn
)
699
+
700
  def show_dataset_from_button(search_query, *buttons_values, i):
701
  dataset_name, tags = buttons_values[2 * i : 2 * i + 2]
702
  yield from _show_dataset(search_query, dataset_name, tags)
 
804
  yield {search_page: gr.Column(visible=True)}
805
 
806
 
807
def run_autorun():
    """Continuously yield autorun updates while AutoRun mode is active.

    NOTE(review): this generator is never bound to any Gradio event in the
    visible code, so it appears to be dead code — confirm the intended
    wiring (a `gr.Timer` / `every=` event would be the conventional way).
    NOTE(review): gr.Blocks does not expose components as attributes, so
    `demo.autorun_active`, `demo.search_bar`, `demo.accumulated_datasets`,
    `demo.compile_mode` and `demo.processing_options` would raise
    AttributeError if this ever ran — verify before enabling.
    """
    while True:
        if demo.autorun_active:
            yield [
                autorun_iteration(
                    demo.search_bar.value,
                    demo.accumulated_datasets.value,
                    demo.compile_mode.value,
                    demo.processing_options.value
                ),
                gr.DataFrame(visible=True)
            ]
            # `time` is imported in the module's import block.
            time.sleep(AUTORUN_INTERVAL)
        else:
            yield [
                demo.accumulated_datasets.value,
                gr.DataFrame(visible=False)
            ]
825
 
826
# `concurrency_count` was removed in Gradio 4 (replaced by
# `default_concurrency_limit`); fall back so the app launches on either
# major version.
try:
    demo.queue(concurrency_count=5).launch()
except TypeError:
    demo.queue(default_concurrency_limit=5).launch()