Update app.py
app.py CHANGED
@@ -6,37 +6,30 @@ from itertools import islice
 from functools import partial
 from multiprocessing.pool import ThreadPool
 from queue import Queue, Empty
-from typing import Callable, Iterable, Iterator, Optional, TypeVar
-import datetime
+from typing import Callable, Iterable, Iterator, Optional, TypeVar
 
 import gradio as gr
 import pandas as pd
 import requests.exceptions
 from huggingface_hub import InferenceClient, create_repo, whoami, DatasetCard
 
+
 model_id = "microsoft/Phi-3-mini-4k-instruct"
 client = InferenceClient(model_id)
 save_dataset_hf_token = os.environ.get("SAVE_DATASET_HF_TOKEN")
 
-AUTORUN_INTERVAL = 2 # Seconds between dataset generations
-MAX_AUTORUN_DATASETS = 1000 # Safety limit for infinite mode
 MAX_TOTAL_NB_ITEMS = 100 # almost infinite, don't judge me (actually it's because gradio needs a fixed number of components)
 MAX_NB_ITEMS_PER_GENERATION_CALL = 10
 NUM_ROWS = 100
-MAX_QUEUE_SIZE = 100 # Maximum number of concurrent users
 NUM_VARIANTS = 10
 NAMESPACE = "infinite-dataset-hub"
 URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"
-# Add these after existing state variables
-autorun_active = gr.State(False)
-accumulated_datasets = gr.State(pd.DataFrame())
-current_processing = gr.State(set())
 
 GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
-
-
-
-
+    "A Machine Learning Practioner is looking for a dataset that matches '{search_query}'. "
+    f"Generate a list of {MAX_NB_ITEMS_PER_GENERATION_CALL} names of quality datasets that don't exist but sound plausible and would "
+    "be helpful. Feel free to reuse words from the query '{search_query}' to name the datasets. "
+    "Every dataset should be about '{search_query}' and have descriptive tags/keywords including the ML task name associated with the dataset (classification, regression, anomaly detection, etc.). Use the following format:\n1. DatasetName1 (tag1, tag2, tag3)\n1. DatasetName2 (tag1, tag2, tag3)"
 )
 
 GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS = (
@@ -56,7 +49,7 @@ LONG_RARITIES = [
     "expected",
     "common",
     "regular",
-    "unexpected but useful"
+    "unexpected but useful"
     "original but useful",
     "specific but not far-fetched",
     "uncommon but still plausible",
@@ -86,34 +79,27 @@ tags:
 - infinite-dataset-hub
 - synthetic
 ---
+
 {title}
+
 _Note: This is an AI-generated dataset so its content may be inaccurate or false_
+
 {content}
+
 **Source of the data:**
+
 The dataset was generated using the [Infinite Dataset Hub]({url}) and {model_id} using the query '{search_query}':
+
 - **Dataset Generation Page**: {dataset_url}
 - **Model**: https://huggingface.co/{model_id}
 - **More Datasets**: https://huggingface.co/datasets?other=infinite-dataset-hub
 """
 
 css = """
-.autorun-section {
-    border: 1px solid var(--border-color-primary);
-    border-radius: 8px;
-    padding: 1rem;
-    margin-top: 1rem;
-}
-.compile-options {
-    margin-top: 1rem;
-}
-.download-prompt {
-    color: var(--color-accent);
-    font-weight: bold;
-    margin-top: 1rem;
-}
 a {
     color: var(--body-text-color);
 }
+
 .datasetButton {
     justify-content: start;
     justify-content: left;
@@ -163,6 +149,7 @@ a {
 .insivibleButtonGroup {
     display: none;
 }
+
 @keyframes placeHolderShimmer{
     0%{
         background-position: -468px 0
@@ -190,6 +177,7 @@ a {
 }
 """
 
+
 with gr.Blocks(css=css) as demo:
     generated_texts_state = gr.State((landing_page_datasets_generated_text,))
     with gr.Column() as search_page:
@@ -221,7 +209,7 @@ with gr.Blocks(css=css) as demo:
                         buttons.append(gr.Button(dataset_name, elem_classes=dataset_name_classes))
                         buttons.append(gr.Button(tags, elem_classes=tags_classes))
 
-                load_more_datasets = gr.Button("Load more datasets") # TODO:
+                load_more_datasets = gr.Button("Load more datasets") # TODO: dosable when reaching end of page
                 gr.Markdown(f"_powered by [{model_id}](https://huggingface.co/{model_id})_")
             with gr.Column(scale=4, min_width="200px"):
                 with gr.Accordion("Settings", open=False, elem_classes="settings"):
@@ -246,25 +234,6 @@ with gr.Blocks(css=css) as demo:
         dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
         back_button = gr.Button("< Back", size="sm")
 
-        with gr.Column(elem_classes="autorun-section") as autorun_section:
-            with gr.Row():
-                autorun_toggle = gr.Checkbox(label="AutoRun Mode", interactive=True)
-                autorun_status = gr.Markdown("**Status:** Inactive", elem_classes="status")
-
-            with gr.Row():
-                compile_mode = gr.Radio(
-                    ["Combine All", "Keep Separate"],
-                    label="Compilation Mode",
-                    value="Combine All"
-                )
-                processing_options = gr.CheckboxGroup(
-                    ["Clean Data", "Chunk Data", "Summarize Data"],
-                    label="Processing Options"
-                )
-
-            with gr.Row():
-                download_btn = gr.DownloadButton("Download Dataset", visible=False)
-                stop_btn = gr.Button("Stop & Save", variant="stop", visible=False)
     ###################################
     #
     # Utils
@@ -278,6 +247,7 @@ with gr.Blocks(css=css) as demo:
         while batch := list(islice(it, n)):
             yield batch
 
+
     def stream_reponse(msg: str, generated_texts: tuple[str] = (), max_tokens=500) -> Iterator[str]:
         messages = [
             {"role": "user", "content": msg}
@@ -300,59 +270,11 @@ with gr.Blocks(css=css) as demo:
                 ):
                     yield message.choices[0].delta.content
             except requests.exceptions.ConnectionError as e:
-                print(
+                print(e + "\n\nRetrying in 1sec")
                 time.sleep(1)
                 continue
            break
 
-    def generate_single_dataset(search_query: str) -> pd.DataFrame:
-        """Generate one complete dataset from search query to parsed DataFrame"""
-        # Generate dataset names
-        dataset_lines = []
-        for line in gen_datasets_line_by_line(search_query):
-            dataset_lines.append(line)
-            if len(dataset_lines) >= MAX_NB_ITEMS_PER_GENERATION_CALL:
-                break
-
-        # Process first valid dataset
-        for line in dataset_lines:
-            if line.strip() and line.strip().split(".", 1)[0].isnumeric():
-                try:
-                    dataset_name, tags = line.strip().split(".", 1)[1].strip(" )").split(" (", 1)
-                    break
-                except ValueError:
-                    continue
-
-        # Generate dataset content
-        content = ""
-        for token in gen_dataset_content(search_query, dataset_name, tags):
-            content += token
-
-        # Parse to DataFrame
-        _, preview_df = parse_preview_df(content)
-        return preview_df
-
-    def process_dataset(df: pd.DataFrame, options: List[str]) -> pd.DataFrame:
-        """Apply processing options to dataset"""
-        # Clean
-        if 'Clean Data' in options:
-            df = df.dropna().drop_duplicates()
-
-        # Chunk
-        if 'Chunk Data' in options:
-            if len(df) > 10:
-                df = df.sample(frac=0.5) # Simple chunking example
-
-        # Summarize
-        if 'Summarize Data' in options:
-            summary = pd.DataFrame({
-                'columns': df.columns,
-                'dtypes': df.dtypes.values,
-                'non_null_count': df.count().values
-            })
-            return summary
-
-        return df
 
     def gen_datasets_line_by_line(search_query: str, generated_texts: tuple[str] = ()) -> Iterator[str]:
         search_query = search_query or ""
@@ -372,6 +294,7 @@ with gr.Blocks(css=css) as demo:
             generated_text += current_line
         print("-----\n\n" + generated_text)
 
+
     def gen_dataset_content(search_query: str, dataset_name: str, tags: str) -> Iterator[str]:
         search_query = search_query or ""
         search_query = search_query[:1000] if search_query.strip() else ""
@@ -385,11 +308,13 @@ with gr.Blocks(css=css) as demo:
             yield generated_text
         print("-----\n\n" + generated_text)
 
+
     def _write_generator_to_queue(queue: Queue, func: Callable[..., Iterable], kwargs: dict) -> None:
         for i, result in enumerate(func(**kwargs)):
            queue.put(result)
        return None
 
+
    def iflatmap_unordered(
        func: Callable[..., Iterable[T]],
        *,
@@ -411,6 +336,7 @@ with gr.Blocks(css=css) as demo:
            # we get the result in case there's an error to raise
            [async_result.get(timeout=0.05) for async_result in async_results]
 
+
    def generate_partial_dataset(title: str, content: str, search_query: str, variant: str, csv_header: str, output: list[dict[str, str]], indices_to_generate: list[int], max_tokens=1500) -> Iterator[int]:
        dataset_name, tags = title.strip("# ").split("\ntags:", 1)
        dataset_name, tags = dataset_name.strip(), tags.strip()
@@ -458,10 +384,14 @@ with gr.Blocks(css=css) as demo:
                        pass
                    current_line = ""
            except requests.exceptions.ConnectionError as e:
-                print(
+                print(e + "\n\nRetrying in 1sec")
                time.sleep(1)
                continue
            break
+        # for debugging
+        # with open(f".output{indices_to_generate[0]}.txt", "w") as f:
+        #     f.write(generated_text)
+
 
    def generate_variants(preview_df: pd.DataFrame):
        label_candidate_columns = [column for column in preview_df.columns if "label" in column.lower()]
@@ -478,6 +408,7 @@ with gr.Blocks(css=css) as demo:
            for rarity in LONG_RARITIES
        ]
 
+
    def parse_preview_df(content: str) -> tuple[str, pd.DataFrame]:
        _in_csv = False
        csv = "\n".join(
@@ -489,6 +420,7 @@ with gr.Blocks(css=css) as demo:
            raise gr.Error("Failed to parse CSV Preview")
        return csv.split("\n")[0], parse_csv_df(csv)
 
+
    def parse_csv_df(csv: str, csv_header: Optional[str] = None) -> pd.DataFrame:
        # Fix generation mistake when providing a list that is not in quotes
        for match in re.finditer(r'''(?!")\[(["'][\w ]+["'][, ]*)+\](?!")''', csv):
@@ -501,12 +433,14 @@ with gr.Blocks(css=css) as demo:
        df = pd.read_csv(io.StringIO(csv), skipinitialspace=True)
        return df
 
+
    ###################################
    #
    # Buttons
    #
    ###################################
 
+
    def _search_datasets(search_query):
        yield {generated_texts_state: []}
        yield {
@@ -542,14 +476,17 @@ with gr.Blocks(css=css) as demo:
                }
                current_item_idx += 1
 
+
    @search_button.click(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state])
    def search_dataset_from_search_button(search_query):
        yield from _search_datasets(search_query)
 
+
    @search_bar.submit(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state])
    def search_dataset_from_search_bar(search_query):
        yield from _search_datasets(search_query)
 
+
    @load_more_datasets.click(inputs=[search_bar, generated_texts_state], outputs=button_groups + buttons + [generated_texts_state])
    def search_more_datasets(search_query, generated_texts):
        current_item_idx = initial_item_idx = len(generated_texts) * MAX_NB_ITEMS_PER_GENERATION_CALL
@@ -567,7 +504,7 @@ with gr.Blocks(css=css) as demo:
                try:
                    dataset_name, tags = line.strip().split(".", 1)[1].strip(" )").split(" (", 1)
                except ValueError:
-                    dataset_name, tags = line.strip().split(".", 1)[1].strip(" )").split(" ", 1)[0], ""
+                    dataset_name, tags = line.strip().split(".", 1)[1].strip(" )").split(" ", 1) [0], ""
                dataset_name, tags = dataset_name.strip("()[]* "), tags.strip("()[]* ")
                generated_text += line
                yield {
@@ -577,66 +514,21 @@ with gr.Blocks(css=css) as demo:
                }
                current_item_idx += 1
 
-    def toggle_autorun(active: bool, current_df: pd.DataFrame) -> dict:
-        """Toggle autorun state and UI elements"""
-        new_state = not active
-        updates = {
-            autorun_toggle: gr.Checkbox.update(value=new_state),
-            autorun_status: gr.Markdown.update(value=f"**Status:** {'Active' if new_state else 'Inactive'}"),
-            stop_btn: gr.Button.update(visible=new_state),
-            download_btn: gr.DownloadButton.update(visible=not new_state),
-            accumulated_datasets: current_df # Maintain current state
-        }
-        if new_state: # Reset when starting new run
-            updates[accumulated_datasets] = pd.DataFrame()
-        return updates
-
-    def autorun_iteration(
-        search_query: str,
-        current_df: pd.DataFrame,
-        compile_mode: str,
-        process_opts: List[str]
-    ) -> pd.DataFrame:
-        """Single iteration of autorun dataset generation"""
-        try:
-            new_data = generate_single_dataset(search_query)
-            processed = process_dataset(new_data, process_opts)
-
-            if compile_mode == "Combine All":
-                combined = pd.concat([current_df, processed], ignore_index=True)
-                # Return full dataset but only show last 50
-                return combined
-            else:
-                return pd.concat([current_df, processed], ignore_index=True)
-        except Exception as e:
-            print(f"Error in autorun iteration: {e}")
-            return current_df
-
-    def create_download_file(current_df: pd.DataFrame) -> str:
-        """Prepare dataset for download; returns the filename"""
-        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
-        filename = f"autorun-dataset-{timestamp}.csv"
-        current_df.to_csv(filename, index=False)
-        return filename
-
-    # Helper function to update the displayed dataframe (showing last 50 rows)
-    def update_display(df: pd.DataFrame) -> pd.DataFrame:
-        return df.tail(50)
-
    def _show_dataset(search_query, dataset_name, tags):
        yield {
-            search_page: gr.Column
-            dataset_page: gr.Column
+            search_page: gr.Column(visible=False),
+            dataset_page: gr.Column(visible=True),
            dataset_title: f"# {dataset_name}\n\n tags: {tags}",
-            dataset_share_textbox: gr.Textbox
-            dataset_dataframe: gr.DataFrame
-            generate_full_dataset_button: gr.Button
-            save_dataset_button: gr.Button
-            open_dataset_message: gr.Markdown
+            dataset_share_textbox: gr.Textbox(visible=False),
+            dataset_dataframe: gr.DataFrame(visible=False),
+            generate_full_dataset_button: gr.Button(interactive=True),
+            save_dataset_button: gr.Button(visible=False),
+            open_dataset_message: gr.Markdown(visible=False)
        }
        for generated_text in gen_dataset_content(search_query=search_query, dataset_name=dataset_name, tags=tags):
            yield {dataset_content: generated_text}
 
+
    show_dataset_inputs = [search_bar, *buttons]
    show_dataset_outputs = [search_page, dataset_page, dataset_title, dataset_content, generate_full_dataset_button, dataset_dataframe, save_dataset_button, open_dataset_message, dataset_share_textbox]
    scroll_to_top_js = """
@@ -651,33 +543,6 @@ with gr.Blocks(css=css) as demo:
    }
    """
 
-    # Function to update UI when stopping autorun
-    def stop_autorun():
-        return (
-            gr.Checkbox.update(value=False),
-            gr.Markdown.update(value="**Status:** Inactive"),
-            gr.Button.update(visible=False),
-            gr.DownloadButton.update(visible=True)
-        )
-
-    autorun_toggle.change(
-        toggle_autorun,
-        inputs=[autorun_active, accumulated_datasets],
-        outputs=[autorun_toggle, autorun_status, stop_btn, download_btn, accumulated_datasets]
-    )
-
-    stop_btn.click(
-        stop_autorun,
-        inputs=None,
-        outputs=[autorun_toggle, autorun_status, stop_btn, download_btn]
-    )
-
-    download_btn.click(
-        create_download_file,
-        inputs=accumulated_datasets,
-        outputs=download_btn
-    )
-
    def show_dataset_from_button(search_query, *buttons_values, i):
        dataset_name, tags = buttons_values[2 * i : 2 * i + 2]
        yield from _show_dataset(search_query, dataset_name, tags)
@@ -686,10 +551,12 @@ with gr.Blocks(css=css) as demo:
        dataset_name_button.click(partial(show_dataset_from_button, i=i), inputs=show_dataset_inputs, outputs=show_dataset_outputs, js=scroll_to_top_js)
        tags_button.click(partial(show_dataset_from_button, i=i), inputs=show_dataset_inputs, outputs=show_dataset_outputs, js=scroll_to_top_js)
 
+
    @back_button.click(outputs=[search_page, dataset_page], js=scroll_to_top_js)
    def show_search_page():
        return gr.Column(visible=True), gr.Column(visible=False)
 
+
    @generate_full_dataset_button.click(inputs=[dataset_title, dataset_content, search_bar, select_namespace_dropdown, visibility_radio], outputs=[dataset_dataframe, generate_full_dataset_button, save_dataset_button])
    def generate_full_dataset(title, content, search_query, namespace, visability):
        dataset_name, tags = title.strip("# ").split("\ntags:", 1)
@@ -709,8 +576,8 @@ with gr.Blocks(css=css) as demo:
        output[:len(preview_df)] = [{"idx": i, **x} for i, x in enumerate(preview_df.to_dict(orient="records"))]
        yield {
            dataset_dataframe: gr.DataFrame(pd.DataFrame([{"idx": i, **x} for i, x in enumerate(output) if x]), visible=True),
-            generate_full_dataset_button: gr.Button
-            save_dataset_button: gr.Button
+            generate_full_dataset_button: gr.Button(interactive=False),
+            save_dataset_button: gr.Button(f"💾 Save Dataset {namespace}/{dataset_name}" + (" (private)" if visability != "public" else ""), visible=True, interactive=False)
        }
        kwargs_iterable = [
            {
@@ -726,9 +593,10 @@ with gr.Blocks(css=css) as demo:
        ]
        for _ in iflatmap_unordered(generate_partial_dataset, kwargs_iterable=kwargs_iterable):
            yield {dataset_dataframe: pd.DataFrame([{"idx": i, **{column_name: x.get(column_name) for column_name in columns}} for i, x in enumerate(output) if x])}
-        yield {save_dataset_button: gr.Button
+        yield {save_dataset_button: gr.Button(interactive=True)}
        print(f"Generated {dataset_name}!")
 
+
    @save_dataset_button.click(inputs=[dataset_title, dataset_content, search_bar, dataset_dataframe, select_namespace_dropdown, visibility_radio], outputs=[save_dataset_button, open_dataset_message])
    def save_dataset(title: str, content: str, search_query: str, df: pd.DataFrame, namespace: str, visability: str, oauth_token: Optional[gr.OAuthToken]):
        dataset_name, tags = title.strip("# ").split("\ntags:", 1)
@@ -737,32 +605,36 @@ with gr.Blocks(css=css) as demo:
        repo_id = f"{namespace}/{dataset_name}"
        dataset_url = f"{URL}?q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}"
        gr.Info("Saving dataset...")
-        yield {save_dataset_button: gr.Button
+        yield {save_dataset_button: gr.Button(interactive=False)}
        create_repo(repo_id=repo_id, repo_type="dataset", private=visability!="public", exist_ok=True, token=token)
        df.to_csv(f"hf://datasets/{repo_id}/data.csv", storage_options={"token": token}, index=False)
        DatasetCard(DATASET_CARD_CONTENT.format(title=title, content=content, url=URL, dataset_url=dataset_url, model_id=model_id, search_query=search_query)).push_to_hub(repo_id=repo_id, repo_type="dataset", token=token)
        gr.Info(f"✅ Dataset saved at {repo_id}")
        additional_message = "PS: You can also save datasets under your account in the Settings ;)"
-        yield {open_dataset_message: gr.Markdown
+        yield {open_dataset_message: gr.Markdown(f"# 🎉 Yay ! Your dataset has been saved to [{repo_id}](https://huggingface.co/datasets/{repo_id}) !\n\nDataset link: [https://huggingface.co/datasets/{repo_id}](https://huggingface.co/datasets/{repo_id})\n\n{additional_message}", visible=True)}
        print(f"Saved {dataset_name}!")
 
+
    @dataset_share_button.click(inputs=[dataset_title, search_bar], outputs=[dataset_share_textbox])
    def show_dataset_url(title, search_query):
        dataset_name, tags = title.strip("# ").split("\ntags:", 1)
        dataset_name, tags = dataset_name.strip(), tags.strip()
-        return gr.Textbox
+        return gr.Textbox(
+            f"{URL}?q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}",
+            visible=True,
+        )
 
    @demo.load(outputs=show_dataset_outputs + button_groups + buttons + [generated_texts_state] + [select_namespace_dropdown, visibility_radio])
    def load_app(request: gr.Request, oauth_token: Optional[gr.OAuthToken]):
        if oauth_token:
            user_info = whoami(oauth_token.token)
            yield {
-                select_namespace_dropdown: gr.Dropdown
+                select_namespace_dropdown: gr.Dropdown(
                    choices=[user_info["name"]] + [org_info["name"] for org_info in user_info["orgs"]],
                    value=user_info["name"],
                    visible=True,
                ),
-                visibility_radio: gr.Radio
+                visibility_radio: gr.Radio(interactive=True),
            }
        query_params = dict(request.query_params)
        if "dataset" in query_params:
@@ -777,30 +649,5 @@ with gr.Blocks(css=css) as demo:
        else:
            yield {search_page: gr.Column(visible=True)}
 
-    def run_autorun():
-        while True:
-            # Using the value from autorun_active state
-            if autorun_active.value:
-                # Update full dataset
-                full_data = autorun_iteration(
-                    search_bar.value,
-                    accumulated_datasets.value,
-                    compile_mode.value,
-                    processing_options.value
-                )
-                accumulated_display = gr.DataFrame(
-                    label="Accumulated Data (Last 50 Samples)",
-                    interactive=False,
-                    wrap=True
-                )
-                # Update state with full data and show last 50 rows
-                accumulated_datasets.value = full_data
-                yield {
-                    accumulated_display: update_display(full_data),
-                    accumulated_datasets: full_data
-                }
-                time.sleep(AUTORUN_INTERVAL)
-            else:
-                yield accumulated_display.update(visible=False)
 
-demo.
+demo.launch()