Space status: Runtime error
Update app.py
app.py (CHANGED):
```diff
@@ -6,7 +6,8 @@ from itertools import islice
 from functools import partial
 from multiprocessing.pool import ThreadPool
 from queue import Queue, Empty
-from typing import Callable, Iterable, Iterator, Optional, TypeVar
+from typing import Callable, Iterable, Iterator, Optional, TypeVar, List, Dict
+import datetime
 
 import gradio as gr
 import pandas as pd
```
```diff
@@ -18,12 +19,18 @@ model_id = "microsoft/Phi-3-mini-4k-instruct"
 client = InferenceClient(model_id)
 save_dataset_hf_token = os.environ.get("SAVE_DATASET_HF_TOKEN")
 
+AUTORUN_INTERVAL = 2  # Seconds between dataset generations
+MAX_AUTORUN_DATASETS = 1000  # Safety limit for infinite mode
 MAX_TOTAL_NB_ITEMS = 100  # almost infinite, don't judge me (actually it's because gradio needs a fixed number of components)
 MAX_NB_ITEMS_PER_GENERATION_CALL = 10
 NUM_ROWS = 100
 NUM_VARIANTS = 10
 NAMESPACE = "infinite-dataset-hub"
 URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"
+# Add these after existing state variables
+autorun_active = gr.State(False)
+accumulated_datasets = gr.State(pd.DataFrame())
+current_processing = gr.State(set())
 
 GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
     "A Machine Learning Practioner is looking for a dataset that matches '{search_query}'. "
```
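Review note: `autorun_active`, `accumulated_datasets`, and `current_processing` are `gr.State` components created at module level, but the `with gr.Blocks(css=css) as demo:` context only opens further down the file. Gradio components, state included, are normally instantiated inside a Blocks context so they get attached to the app; as written these three are likely orphaned, which fits the Runtime error status shown at the top. A minimal sketch of the safer placement, keeping the diff's variable names:

```python
# Sketch: declare the autorun state inside the Blocks context, next to the
# existing generated_texts_state, instead of at module level.
with gr.Blocks(css=css) as demo:
    generated_texts_state = gr.State((landing_page_datasets_generated_text,))
    autorun_active = gr.State(False)                 # AutoRun on/off flag
    accumulated_datasets = gr.State(pd.DataFrame())  # rows gathered so far
    current_processing = gr.State(set())             # in-flight work markers
```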
```diff
@@ -89,7 +96,23 @@ The dataset was generated using the [Infinite Dataset Hub]({url}) and {model_id}
 - **More Datasets**: https://huggingface.co/datasets?other=infinite-dataset-hub
 """
 
+
+
 css = """
+.autorun-section {
+    border: 1px solid var(--border-color-primary);
+    border-radius: 8px;
+    padding: 1rem;
+    margin-top: 1rem;
+}
+.compile-options {
+    margin-top: 1rem;
+}
+.download-prompt {
+    color: var(--color-accent);
+    font-weight: bold;
+    margin-top: 1rem;
+}
 a {
     color: var(--body-text-color);
 }
```
```diff
@@ -167,9 +190,10 @@ a {
 .settings button span {
     color: var(--body-text-color-subdued);
 }
-"""
 
 
+"""
+
 with gr.Blocks(css=css) as demo:
     generated_texts_state = gr.State((landing_page_datasets_generated_text,))
     with gr.Column() as search_page:
```
```diff
@@ -226,6 +250,32 @@ with gr.Blocks(css=css) as demo:
         dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
         back_button = gr.Button("< Back", size="sm")
 
+        with gr.Column(elem_classes="autorun-section") as autorun_section:
+            with gr.Row():
+                autorun_toggle = gr.Checkbox(label="AutoRun Mode", interactive=True)
+                autorun_status = gr.Markdown("**Status:** Inactive", elem_classes="status")
+
+            with gr.Row():
+                compile_mode = gr.Radio(
+                    ["Combine All", "Keep Separate"],
+                    label="Compilation Mode",
+                    value="Combine All"
+                )
+                processing_options = gr.CheckboxGroup(
+                    ["Clean Data", "Chunk Data", "Summarize Data"],
+                    label="Processing Options"
+                )
+
+            accumulated_display = gr.DataFrame(
+                label="Accumulated Data",
+                interactive=False,
+                wrap=True,
+                max_rows=50
+            )
+
+            with gr.Row():
+                download_btn = gr.DownloadButton("Download Dataset", visible=False)
+                stop_btn = gr.Button("Stop & Save", variant="stop", visible=False)
     ###################################
     #
     # Utils
```
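Review note: `max_rows` is a Gradio 3.x `gr.DataFrame` argument that was dropped in 4.x, so that line is version-sensitive. More importantly, nothing in this commit ever writes to `accumulated_display`; the accumulated rows only live in the `accumulated_datasets` state. Any handler that grows the state should also refresh the table, roughly like this hypothetical helper (assuming both components are in scope):

```python
def append_and_show(state_df: pd.DataFrame, new_rows: pd.DataFrame):
    # Return the combined frame twice: once for the State, once for the
    # visible gr.DataFrame, so the table tracks the accumulator.
    combined = pd.concat([state_df, new_rows], ignore_index=True)
    return combined, combined

# wired with: outputs=[accumulated_datasets, accumulated_display]
```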
```diff
@@ -267,7 +317,56 @@ with gr.Blocks(css=css) as demo:
                 continue
             break
 
+    def generate_single_dataset(search_query: str) -> pd.DataFrame:
+        """Generate one complete dataset from search query to parsed DataFrame"""
+        # Generate dataset names
+        dataset_lines = []
+        for line in gen_datasets_line_by_line(search_query):
+            dataset_lines.append(line)
+            if len(dataset_lines) >= MAX_NB_ITEMS_PER_GENERATION_CALL:
+                break
+
+        # Process first valid dataset
+        for line in dataset_lines:
+            if line.strip() and line.strip().split(".", 1)[0].isnumeric():
+                try:
+                    dataset_name, tags = line.strip().split(".", 1)[1].strip(" )").split(" (", 1)
+                    break
+                except ValueError:
+                    continue
+
+        # Generate dataset content
+        content = ""
+        for token in gen_dataset_content(search_query, dataset_name, tags):
+            content += token
+
+        # Parse to DataFrame
+        _, preview_df = parse_preview_df(content)
+        return preview_df
+
+    def process_dataset(df: pd.DataFrame, options: List[str]) -> pd.DataFrame:
+        """Apply processing options to dataset"""
+        # Clean
+        if 'Clean Data' in options:
+            df = df.dropna().drop_duplicates()
+
+        # Chunk
+        if 'Chunk Data' in options:
+            if len(df) > 10:
+                df = df.sample(frac=0.5)  # Simple chunking example
+
+        # Summarize
+        if 'Summarize Data' in options:
+            summary = pd.DataFrame({
+                'columns': df.columns,
+                'dtypes': df.dtypes.values,
+                'non_null_count': df.count().values
+            })
+            return summary
+
+        return df
 
+
     def gen_datasets_line_by_line(search_query: str, generated_texts: tuple[str] = ()) -> Iterator[str]:
         search_query = search_query or ""
         search_query = search_query[:1000] if search_query.strip() else ""
```
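Review note: `generate_single_dataset` has a sharp edge. If no generated line parses (the second `for` loop falls through without hitting `break`), `dataset_name` and `tags` are never bound and the `gen_dataset_content` call raises `NameError`. A defensive variant with the same parsing logic, failing soft instead (a sketch, not the committed code):

```python
def generate_single_dataset(search_query: str) -> pd.DataFrame:
    """Like the committed version, but returns an empty frame when no line parses."""
    dataset_name, tags = None, None
    for line in islice(gen_datasets_line_by_line(search_query), MAX_NB_ITEMS_PER_GENERATION_CALL):
        line = line.strip()
        if line and line.split(".", 1)[0].isnumeric():
            try:
                dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
                break
            except ValueError:
                continue
    if dataset_name is None:
        return pd.DataFrame()  # nothing parseable this round; caller can skip it
    content = "".join(gen_dataset_content(search_query, dataset_name, tags))
    _, preview_df = parse_preview_df(content)
    return preview_df
```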
```diff
@@ -506,6 +605,46 @@ with gr.Blocks(css=css) as demo:
             }
             current_item_idx += 1
 
+    def toggle_autorun(active: bool, current_df: pd.DataFrame) -> dict:
+        """Toggle autorun state and UI elements"""
+        new_state = not active
+        updates = {
+            autorun_toggle: gr.Checkbox(value=new_state),
+            autorun_status: gr.Markdown(f"**Status:** {'Active' if new_state else 'Inactive'}"),
+            stop_btn: gr.Button(visible=new_state),
+            download_btn: gr.DownloadButton(visible=not new_state),
+            accumulated_datasets: current_df  # Maintain current state
+        }
+        if new_state:  # Reset when starting new run
+            updates[accumulated_datasets] = pd.DataFrame()
+        return updates
+
+    def autorun_iteration(
+        search_query: str,
+        current_df: pd.DataFrame,
+        compile_mode: str,
+        process_opts: List[str]
+    ) -> pd.DataFrame:
+        """Single iteration of autorun dataset generation"""
+        try:
+            new_data = generate_single_dataset(search_query)
+            processed = process_dataset(new_data, process_opts)
+
+            if compile_mode == "Combine All" and not current_df.empty:
+                combined = pd.concat([current_df, processed])
+                return combined
+            return processed
+        except Exception as e:
+            print(f"Error in autorun iteration: {e}")
+            return current_df
+
+    def create_download_file(current_df: pd.DataFrame) -> dict:
+        """Prepare dataset for download"""
+        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+        filename = f"autorun-dataset-{timestamp}.csv"
+        current_df.to_csv(filename, index=False)
+        return gr.DownloadButton(label=f"Download {filename}", value=filename)
+
     def _show_dataset(search_query, dataset_name, tags):
         yield {
             search_page: gr.Column(visible=False),
```
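Review note: two small things in this hunk. `create_download_file` is annotated `-> dict` but actually returns a `gr.DownloadButton`. And `pd.concat([current_df, processed])` preserves each chunk's own 0-based index, so the accumulated frame collects duplicate index values; `ignore_index=True` keeps a clean running index:

```python
import pandas as pd

a = pd.DataFrame({"x": [1, 2]})
b = pd.DataFrame({"x": [3]})
print(pd.concat([a, b]).index.tolist())                     # [0, 1, 0]
print(pd.concat([a, b], ignore_index=True).index.tolist())  # [0, 1, 2]
```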
```diff
@@ -535,6 +674,29 @@ with gr.Blocks(css=css) as demo:
     }
     """
 
+    # Add these event bindings
+    autorun_toggle.change(
+        toggle_autorun,
+        inputs=[autorun_active, accumulated_datasets],
+        outputs=[autorun_toggle, autorun_status, stop_btn, download_btn, accumulated_datasets]
+    )
+
+    stop_btn.click(
+        fn=lambda: [
+            gr.Checkbox(value=False),
+            gr.Markdown("**Status:** Inactive"),
+            gr.Button(visible=False),
+            gr.DownloadButton(visible=True)
+        ],
+        outputs=[autorun_toggle, autorun_status, stop_btn, download_btn]
+    )
+
+    download_btn.click(
+        create_download_file,
+        inputs=accumulated_datasets,
+        outputs=download_btn
+    )
+
     def show_dataset_from_button(search_query, *buttons_values, i):
         dataset_name, tags = buttons_values[2 * i : 2 * i + 2]
         yield from _show_dataset(search_query, dataset_name, tags)
```
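Review note: the `autorun_toggle.change` binding has two subtle problems. The handler computes `new_state = not active` from the `autorun_active` State, but that State is absent from `outputs`, so it is never written back and goes stale after the first toggle (the `stop_btn.click` handler has the same gap: it unchecks the box but never resets the State). The handler also re-emits `gr.Checkbox(value=new_state)` into the very checkbox whose `.change` fired, which risks re-triggering the event. A plainer wiring treats the checkbox value as the source of truth and mirrors it into the State; a sketch, assuming the same component names and Gradio 4.x-style updates (returning fresh components, as the diff itself does):

```python
def on_toggle(checked: bool, current_df: pd.DataFrame):
    # Mirror the checkbox into the State rather than rewriting the checkbox.
    status = gr.Markdown(f"**Status:** {'Active' if checked else 'Inactive'}")
    df = pd.DataFrame() if checked else current_df  # reset when a run starts
    return checked, status, gr.Button(visible=checked), gr.DownloadButton(visible=not checked), df

autorun_toggle.change(
    on_toggle,
    inputs=[autorun_toggle, accumulated_datasets],
    outputs=[autorun_active, autorun_status, stop_btn, download_btn, accumulated_datasets],
)
```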
```diff
@@ -642,5 +804,23 @@ with gr.Blocks(css=css) as demo:
         yield {search_page: gr.Column(visible=True)}
 
 
-demo.launch()
+    def run_autorun():
+        while True:
+            if demo.autorun_active:
+                yield [
+                    autorun_iteration(
+                        demo.search_bar.value,
+                        demo.accumulated_datasets.value,
+                        demo.compile_mode.value,
+                        demo.processing_options.value
+                    ),
+                    gr.DataFrame(visible=True)
+                ]
+                time.sleep(AUTORUN_INTERVAL)
+            else:
+                yield [
+                    demo.accumulated_datasets.value,
+                    gr.DataFrame(visible=False)
+                ]
 
+demo.queue(concurrency_count=5).launch()
```
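Review note: `run_autorun` in this final hunk is never bound to any event, so as committed it is dead code, and if it were called, the `demo.autorun_active`, `demo.search_bar`, `demo.compile_mode`, and `demo.processing_options` accesses would raise `AttributeError`, since a `gr.Blocks` instance does not expose components as attributes by their Python variable names. The usual Gradio shape for a polling loop is a plain function re-run on an interval. A minimal sketch, assuming `search_bar` is the app's existing search textbox and using the `every=` option of `demo.load` (the Gradio 3/4 idiom; newer releases prefer `gr.Timer`):

```python
# (placed inside the existing `with gr.Blocks(css=css) as demo:` block)
def autorun_tick(active: bool, query: str, current_df: pd.DataFrame,
                 mode: str, opts: List[str]):
    # One poll: only grow the accumulator while AutoRun is on and under the cap.
    if not active or len(current_df) >= MAX_AUTORUN_DATASETS:
        return current_df, current_df
    updated = autorun_iteration(query, current_df, mode, opts)
    return updated, updated  # keep the State and the visible table in sync

demo.load(
    autorun_tick,
    inputs=[autorun_active, search_bar, accumulated_datasets, compile_mode, processing_options],
    outputs=[accumulated_datasets, accumulated_display],
    every=AUTORUN_INTERVAL,  # re-run every 2 seconds while a client is connected
)
```

This shape also removes the `while True` / `time.sleep` loop, which would otherwise pin a queue worker for the life of the session, and gives the otherwise-unused `MAX_AUTORUN_DATASETS` constant a job.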
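Review note: the launch line is version-sensitive. `demo.queue(concurrency_count=5)` is the Gradio 3.x signature; on Gradio 4 the parameter was renamed, so the keyword raises a `TypeError` at startup, which alone would reproduce the Runtime error status. A version-tolerant sketch:

```python
import gradio as gr

# Pick whichever queue argument the installed Gradio accepts.
if int(gr.__version__.split(".")[0]) >= 4:
    demo.queue(default_concurrency_limit=5).launch()
else:
    demo.queue(concurrency_count=5).launch()
```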