Update app.py
app.py CHANGED
@@ -17,7 +17,7 @@ model_id = "microsoft/Phi-3-mini-4k-instruct"
 client = InferenceClient(model_id)
 save_dataset_hf_token = os.environ.get("SAVE_DATASET_HF_TOKEN")

-MAX_TOTAL_NB_ITEMS = 100
+MAX_TOTAL_NB_ITEMS = 100
 MAX_NB_ITEMS_PER_GENERATION_CALL = 10
 NUM_ROWS = 100
 NUM_VARIANTS = 10
@@ -25,7 +25,7 @@ NAMESPACE = "infinite-dataset-hub"
 URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"

 GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
-    "A Machine Learning
+    "A Machine Learning Practitioner is looking for a dataset that matches '{search_query}'. "
     f"Generate a list of {MAX_NB_ITEMS_PER_GENERATION_CALL} names of quality datasets that don't exist but sound plausible and would "
     "be helpful. Feel free to reuse words from the query '{search_query}' to name the datasets. "
     "Every dataset should be about '{search_query}' and have descriptive tags/keywords including the ML task name associated with the dataset (classification, regression, anomaly detection, etc.). Use the following format:\n1. DatasetName1 (tag1, tag2, tag3)\n1. DatasetName2 (tag1, tag2, tag3)"
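As an aside, the numbered "1. DatasetName (tag1, tag2, tag3)" format requested by this prompt is what the app's search handlers later split apart. A minimal, self-contained sketch of that parsing; the sample line is illustrative, not from the app:

# Illustrative parse of one line in the requested format; mirrors the
# split logic used in this file's search handlers. The sample line is made up.
line = "1. QuantumTextClassify (classification, NLP, quantum)"
if line.strip() and line.strip().split(".", 1)[0].isnumeric():
    body = line.strip().split(".", 1)[1].strip(" )")
    try:
        dataset_name, tags = body.split(" (", 1)
    except ValueError:
        dataset_name, tags = body.split(" ", 1)
    dataset_name, tags = dataset_name.strip("()[]* "), tags.strip("()[]* ")
    print(dataset_name)  # QuantumTextClassify
    print(tags)          # classification, NLP, quantum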
@@ -39,8 +39,6 @@ GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS = (
     "Reply using a short description of the dataset with title **Dataset Description:** followed by the CSV content in a code block and with title **CSV Content Preview:**."
 )

-
-
 GENERATE_MORE_ROWS = "Can you give me 10 additional samples in CSV format as well? Use the same CSV header '{csv_header}'."
 GENERATE_VARIANTS_WITH_RARITY_AND_LABEL = "Focus on generating samples for the label '{label}' and ideally generate {rarity} samples."
 GENERATE_VARIANTS_WITH_RARITY = "Focus on generating {rarity} samples."
@@ -71,8 +69,9 @@ landing_page_datasets_generated_text = """
 9. HealthVitalSigns (anomaly detection, biometrics, prediction)
 10. GameStockPredict (classification, finance, sports contingency)
 """
+
 default_output = landing_page_datasets_generated_text.strip().split("\n")
-assert default_output
+assert len(default_output) == MAX_NB_ITEMS_PER_GENERATION_CALL

 DATASET_CARD_CONTENT = """
 ---
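For illustration, what the strengthened assertion checks: default_output is the landing-page text split into lines, and its length must equal the per-call batch size. A tiny self-contained sketch with stand-in content:

# Stand-in landing-page text with exactly 10 numbered lines, matching
# MAX_NB_ITEMS_PER_GENERATION_CALL; the real text lists 10 dataset names.
MAX_NB_ITEMS_PER_GENERATION_CALL = 10
landing_page_datasets_generated_text = "\n".join(
    f"{i}. Dataset{i} (tag1, tag2)" for i in range(1, 11)
)
default_output = landing_page_datasets_generated_text.strip().split("\n")
assert len(default_output) == MAX_NB_ITEMS_PER_GENERATION_CALL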
@@ -172,31 +171,28 @@ a {
 """

 with gr.Blocks(css=css) as demo:
-    # Initialize state
     generated_texts_state = gr.State((landing_page_datasets_generated_text,))
-
+
     with gr.Column() as search_page:
         with gr.Row():
             with gr.Column(scale=10):
                 gr.Markdown(
-                    "# 🤗
+                    "# 🤗 Infinite Dataset Hub ♾️\n\n"
                     "An endless catalog of datasets, created just for you by an AI model.\n\n"
                 )
                 with gr.Row():
                     search_bar = gr.Textbox(
-                        max_lines=1,
-                        placeholder="Search datasets, get infinite results",
-                        show_label=False,
-                        container=False,
+                        max_lines=1,
+                        placeholder="Search datasets, get infinite results",
+                        show_label=False,
+                        container=False,
                         scale=9
                     )
                     search_button = gr.Button("🔍", variant="primary", scale=1)
-
-                # Initialize button groups and buttons
+
                 button_groups: list[gr.Group] = []
                 buttons: list[gr.Button] = []
-
-                # You'll need to define default_output before this loop
+
                 for i in range(MAX_TOTAL_NB_ITEMS):
                     if i < len(default_output):
                         line = default_output[i]
@@ -209,35 +205,32 @@ with gr.Blocks(css=css) as demo:
                         group_classes = "buttonsGroup insivibleButtonGroup"
                         dataset_name_classes = "topButton linear-background"
                         tags_classes = "bottomButton linear-background"
-
+
                     with gr.Group(elem_classes=group_classes) as button_group:
                         button_groups.append(button_group)
                         buttons.append(gr.Button(dataset_name, elem_classes=dataset_name_classes))
                         buttons.append(gr.Button(tags, elem_classes=tags_classes))

-                load_more_datasets = gr.Button("Load more datasets")
+                load_more_datasets = gr.Button("Load more datasets")
                 gr.Markdown(f"_powered by [{model_id}](https://huggingface.co/{model_id})_")
-
-
-
-
-
-
-
-
-
-
-
-                        visible=False
-                    )
+
+            with gr.Column(scale=4, min_width="200px"):
+                with gr.Accordion("Settings", open=False, elem_classes="settings"):
+                    gr.Markdown("Save datasets to your account")
+                    gr.LoginButton()
+                    select_namespace_dropdown = gr.Dropdown(
+                        choices=[NAMESPACE],
+                        value=NAMESPACE,
+                        label="Select user or organization",
+                        visible=False
+                    )
                     gr.Markdown("Save datasets as public or private datasets")
                     visibility_radio = gr.Radio(
-                        ["public", "private"],
-                        value="public",
-                        container=False,
+                        ["public", "private"],
+                        value="public",
+                        container=False,
                         interactive=False
                     )
-
     with gr.Column(visible=False) as dataset_page:
         gr.Markdown(
             "# 🤗 Infinite Dataset Hub ♾️\n\n"
@@ -254,527 +247,6 @@ with gr.Blocks(css=css) as demo:
         dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
         back_button = gr.Button("< Back", size="sm")

-
-
-    ###################################
-    #
-    # Utils
-    #
-    ###################################
-
-    T = TypeVar("T")
-
-    def batched(it: Iterable[T], n: int) -> Iterator[list[T]]:
-        """Batch iterator into chunks of size n."""
-        it = iter(it)
-        while batch := list(islice(it, n)):
-            yield batch
-
-    def stream_response(msg: str, generated_texts: tuple[str] = (), max_tokens=500) -> Iterator[str]:
-        """Stream response from chat completion API."""
-        messages = [
-            {"role": "user", "content": msg}
-        ] + [
-            item
-            for generated_text in generated_texts
-            for item in [
-                {"role": "assistant", "content": generated_text},
-                {"role": "user", "content": "Can you generate more?"},
-            ]
-        ]
-
-        for _ in range(3):  # Retry logic
-            try:
-                for message in client.chat_completion(
-                    messages=messages,
-                    max_tokens=max_tokens,
-                    stream=True,
-                    top_p=0.8,
-                    seed=42,
-                ):
-                    yield message.choices[0].delta.content
-                break
-            except requests.exceptions.ConnectionError as e:
-                logger.warning(f"Connection error: {e}\nRetrying in 1sec")
-                time.sleep(1)
-
-    def gen_datasets_line_by_line(search_query: str, generated_texts: tuple[str] = ()) -> Iterator[str]:
-        """Generate dataset names line by line based on search query."""
-        search_query = (search_query or "")[:1000].strip()
-        generated_text = ""
-        current_line = ""
-
-        for token in stream_response(
-            GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=search_query),
-            generated_texts=generated_texts,
-        ):
-            current_line += token
-            if current_line.endswith("\n"):
-                yield current_line
-                generated_text += current_line
-                current_line = ""
-
-        if current_line:
-            yield current_line
-            generated_text += current_line
-
-        logger.debug(f"Generated text:\n{generated_text}")
-
-    def gen_dataset_content(search_query: str, dataset_name: str, tags: str) -> Iterator[str]:
-        """Generate dataset content based on search query, name and tags."""
-        search_query = (search_query or "")[:1000].strip()
-        generated_text = ""
-
-        for token in stream_response(
-            GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
-                search_query=search_query,
-                dataset_name=dataset_name,
-                tags=tags,
-            ),
-            max_tokens=1500
-        ):
-            generated_text += token
-            yield generated_text
-
-        logger.debug(f"Generated content:\n{generated_text}")
-
-    def _write_generator_to_queue(queue: Queue, func: Callable[..., Iterable], kwargs: dict) -> None:
-        """Helper function to write generator output to queue."""
-        try:
-            for result in func(**kwargs):
-                queue.put(result)
-        except Exception as e:
-            logger.error(f"Error in generator: {e}")
-        queue.put(None)
-
-    def iflatmap_unordered(
-        func: Callable[..., Iterable[T]],
-        *,
-        kwargs_iterable: Iterable[dict],
-    ) -> Iterable[T]:
-        """Execute generator function with multiple kwargs in parallel."""
-        queue = Queue()
-        with ThreadPool() as pool:
-            async_results = [
-                pool.apply_async(_write_generator_to_queue, (queue, func, kwargs))
-                for kwargs in kwargs_iterable
-            ]
-            try:
-                while True:
-                    try:
-                        result = queue.get(timeout=0.05)
-                        if result is not None:
-                            yield result
-                    except Empty:
-                        if all(result.ready() for result in async_results) and queue.empty():
-                            break
-            finally:
-                for result in async_results:
-                    try:
-                        result.get(timeout=0.05)
-                    except Exception as e:
-                        logger.error(f"Async result error: {e}")
-
-    def generate_partial_dataset(
-        title: str,
-        content: str,
-        search_query: str,
-        variant: str,
-        csv_header: str,
-        output: list[Dict[str, str]],
-        indices_to_generate: list[int],
-        max_tokens=1500
-    ) -> Iterator[int]:
-        """Generate partial dataset with specific variants."""
-        try:
-            dataset_name, tags = title.strip("# ").split("\ntags:", 1)
-            dataset_name, tags = dataset_name.strip(), tags.strip()
-
-            messages = [
-                {
-                    "role": "user",
-                    "content": GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
-                        dataset_name=dataset_name,
-                        tags=tags,
-                        search_query=search_query,
-                    )
-                },
-                {"role": "assistant", "content": f"{title}\n\n{content}"},
-                {"role": "user", "content": f"{GENERATE_MORE_ROWS.format(csv_header=csv_header)} {variant}"},
-            ]
-
-            for response in _generate_dataset_rows(messages, max_tokens, indices_to_generate, output, csv_header):
-                yield response
-
-        except Exception as e:
-            logger.error(f"Error generating partial dataset: {e}")
-            yield 0
-
-    def _generate_dataset_rows(messages: list, max_tokens: int, indices: list, output: list, csv_header: str) -> Iterator[int]:
-        """Helper function to generate dataset rows."""
-        for _ in range(3):  # Retry logic
-            try:
-                return _process_generation(
-                    messages, max_tokens, indices, output, csv_header
-                )
-            except requests.exceptions.ConnectionError as e:
-                logger.warning(f"Connection error: {e}\nRetrying in 1sec")
-                time.sleep(1)
-        return iter([])
-
-    def generate_variants(preview_df: pd.DataFrame) -> list[str]:
-        """Generate variants based on preview dataframe."""
-        label_candidate_columns = [
-            column for column in preview_df.columns
-            if "label" in column.lower()
-        ]
-
-        if label_candidate_columns:
-            labels = preview_df[label_candidate_columns[0]].unique()
-            if len(labels) > 1:
-                return [
-                    GENERATE_VARIANTS_WITH_RARITY_AND_LABEL.format(
-                        rarity=rarity,
-                        label=label
-                    )
-                    for rarity in RARITIES
-                    for label in labels
-                ]
-
-        return [
-            GENERATE_VARIANTS_WITH_RARITY.format(rarity=rarity)
-            for rarity in LONG_RARITIES
-        ]
-
-    ###################################
-    #
-    # Buttons
-    #
-    ###################################
-
-
-    def _search_datasets(search_query):
-        yield {generated_texts_state: []}
-        yield {
-            button_group: gr.Group(elem_classes="buttonsGroup insivibleButtonGroup")
-            for button_group in button_groups[MAX_NB_ITEMS_PER_GENERATION_CALL:]
-        }
-        yield {
-            k: v
-            for dataset_name_button, tags_button in batched(buttons, 2)
-            for k, v in {
-                dataset_name_button: gr.Button("⬜⬜⬜⬜⬜⬜", elem_classes="topButton linear-background"),
-                tags_button: gr.Button("░░░░, ░░░░, ░░░░", elem_classes="bottomButton linear-background")
-            }.items()
-        }
-        current_item_idx = 0
-        generated_text = ""
-        for line in gen_datasets_line_by_line(search_query):
-            if "I'm sorry" in line or "against Microsoft's use case policy" in line:
-                raise gr.Error("Error: inappropriate content")
-            if current_item_idx >= MAX_NB_ITEMS_PER_GENERATION_CALL:
-                return
-            if line.strip() and line.strip().split(".", 1)[0].isnumeric():
-                try:
-                    dataset_name, tags = line.strip().split(".", 1)[1].strip(" )").split(" (", 1)
-                except ValueError:
-                    dataset_name, tags = line.strip().split(".", 1)[1].strip(" )").split(" ", 1)
-                dataset_name, tags = dataset_name.strip("()[]* "), tags.strip("()[]* ")
-                generated_text += line
-                yield {
-                    buttons[2 * current_item_idx]: gr.Button(dataset_name, elem_classes="topButton"),
-                    buttons[2 * current_item_idx + 1]: gr.Button(tags, elem_classes="bottomButton"),
-                    generated_texts_state: (generated_text,),
-                }
-                current_item_idx += 1
-
-
-
-    @search_button.click(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state])
-    def search_dataset_from_search_button(search_query):
-        yield from _search_datasets(search_query)
-
-
-    @search_bar.submit(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state])
-    def search_dataset_from_search_bar(search_query):
-        yield from _search_datasets(search_query)
-
-
-    @load_more_datasets.click(inputs=[search_bar, generated_texts_state], outputs=button_groups + buttons + [generated_texts_state])
-    def search_more_datasets(search_query, generated_texts):
-        current_item_idx = initial_item_idx = len(generated_texts) * MAX_NB_ITEMS_PER_GENERATION_CALL
-        yield {
-            button_group: gr.Group(elem_classes="buttonsGroup")
-            for button_group in button_groups[len(generated_texts) * MAX_NB_ITEMS_PER_GENERATION_CALL:(len(generated_texts) + 1) * MAX_NB_ITEMS_PER_GENERATION_CALL]
-        }
-        generated_text = ""
-        for line in gen_datasets_line_by_line(search_query, generated_texts=generated_texts):
-            if "I'm sorry" in line or "against Microsoft's use case policy" in line:
-                raise gr.Error("Error: inappropriate content")
-            if current_item_idx - initial_item_idx >= MAX_NB_ITEMS_PER_GENERATION_CALL:
-                return
-            if line.strip() and line.strip().split(".", 1)[0].isnumeric():
-                try:
-                    dataset_name, tags = line.strip().split(".", 1)[1].strip(" )").split(" (", 1)
-                except ValueError:
-                    dataset_name, tags = line.strip().split(".", 1)[1].strip(" )").split(" ", 1)[0], ""
-                dataset_name, tags = dataset_name.strip("()[]* "), tags.strip("()[]* ")
-                generated_text += line
-                yield {
-                    buttons[2 * current_item_idx]: gr.Button(dataset_name, elem_classes="topButton"),
-                    buttons[2 * current_item_idx + 1]: gr.Button(tags, elem_classes="bottomButton"),
-                    generated_texts_state: (*generated_texts, generated_text),
-                }
-                current_item_idx += 1
-
-    def _show_dataset(search_query, dataset_name, tags):
-        yield {
-            search_page: gr.Column(visible=False),
-            dataset_page: gr.Column(visible=True),
-            dataset_title: f"# {dataset_name}\n\n tags: {tags}",
-            dataset_share_textbox: gr.Textbox(visible=False),
-            dataset_dataframe: gr.DataFrame(visible=False),
-            generate_full_dataset_button: gr.Button(interactive=True),
-            save_dataset_button: gr.Button(visible=False),
-            open_dataset_message: gr.Markdown(visible=False)
-        }
-        for generated_text in gen_dataset_content(search_query=search_query, dataset_name=dataset_name, tags=tags):
-            yield {dataset_content: generated_text}
-
-
-    show_dataset_inputs = [search_bar, *buttons]
-    show_dataset_outputs = [search_page, dataset_page, dataset_title, dataset_content, generate_full_dataset_button, dataset_dataframe, save_dataset_button, open_dataset_message, dataset_share_textbox]
-    scroll_to_top_js = """
-    function (...args) {
-        console.log(args);
-        if ('parentIFrame' in window) {
-            window.parentIFrame.scrollTo({top: 0, behavior:'smooth'});
-        } else {
-            window.scrollTo({ top: 0 });
-        }
-        return args;
-    }
-    """
-
-    def show_dataset_from_button(search_query, *buttons_values, i):
-        dataset_name, tags = buttons_values[2 * i : 2 * i + 2]
-        yield from _show_dataset(search_query, dataset_name, tags)
-
-    for i, (dataset_name_button, tags_button) in enumerate(batched(buttons, 2)):
-        dataset_name_button.click(partial(show_dataset_from_button, i=i), inputs=show_dataset_inputs, outputs=show_dataset_outputs, js=scroll_to_top_js)
-        tags_button.click(partial(show_dataset_from_button, i=i), inputs=show_dataset_inputs, outputs=show_dataset_outputs, js=scroll_to_top_js)
-
-
-    @back_button.click(outputs=[search_page, dataset_page], js=scroll_to_top_js)
-    def show_search_page():
-        return gr.Column(visible=True), gr.Column(visible=False)
-
-
-    @generate_full_dataset_button.click(inputs=[dataset_title, dataset_content, search_bar, select_namespace_dropdown, visibility_radio], outputs=[dataset_dataframe, generate_full_dataset_button, save_dataset_button])
-    def generate_full_dataset(title, content, search_query, namespace, visability):
-        dataset_name, tags = title.strip("# ").split("\ntags:", 1)
-        dataset_name, tags = dataset_name.strip(), tags.strip()
-        csv_header, preview_df = parse_preview_df(content)
-        # Remove dummy "id" columns
-        for column_name, values in preview_df.to_dict(orient="series").items():
-            try:
-                if [int(v) for v in values] == list(range(len(preview_df))):
-                    preview_df = preview_df.drop(columns=column_name)
-                if [int(v) for v in values] == list(range(1, len(preview_df) + 1)):
-                    preview_df = preview_df.drop(columns=column_name)
-            except Exception:
-                pass
-        columns = list(preview_df)
-        output: list[Optional[dict]] = [None] * NUM_ROWS
-        output[:len(preview_df)] = [{"idx": i, **x} for i, x in enumerate(preview_df.to_dict(orient="records"))]
-        yield {
-            dataset_dataframe: gr.DataFrame(pd.DataFrame([{"idx": i, **x} for i, x in enumerate(output) if x]), visible=True),
-            generate_full_dataset_button: gr.Button(interactive=False),
-            save_dataset_button: gr.Button(f"💾 Save Dataset {namespace}/{dataset_name}" + (" (private)" if visability != "public" else ""), visible=True, interactive=False)
-        }
-        kwargs_iterable = [
-            {
-                "title": title,
-                "content": content,
-                "search_query": search_query,
-                "variant": variant,
-                "csv_header": csv_header,
-                "output": output,
-                "indices_to_generate": list(range(len(preview_df) + i, NUM_ROWS, NUM_VARIANTS)),
-            }
-            for i, variant in enumerate(islice(generate_variants(preview_df), NUM_VARIANTS))
-        ]
-        for _ in iflatmap_unordered(generate_partial_dataset, kwargs_iterable=kwargs_iterable):
-            yield {dataset_dataframe: pd.DataFrame([{"idx": i, **{column_name: x.get(column_name) for column_name in columns}} for i, x in enumerate(output) if x])}
-        yield {save_dataset_button: gr.Button(interactive=True)}
-        print(f"Generated {dataset_name}!")
-
-
-    @save_dataset_button.click(inputs=[dataset_title, dataset_content, search_bar, dataset_dataframe, select_namespace_dropdown, visibility_radio], outputs=[save_dataset_button, open_dataset_message])
-    def save_dataset(title: str, content: str, search_query: str, df: pd.DataFrame, namespace: str, visability: str, oauth_token: Optional[gr.OAuthToken]):
-        dataset_name, tags = title.strip("# ").split("\ntags:", 1)
-        dataset_name, tags = dataset_name.strip(), tags.strip()
-        token = oauth_token.token if oauth_token else save_dataset_hf_token
-        repo_id = f"{namespace}/{dataset_name}"
-        dataset_url = f"{URL}?q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}"
-        gr.Info("Saving dataset...")
-        yield {save_dataset_button: gr.Button(interactive=False)}
-        create_repo(repo_id=repo_id, repo_type="dataset", private=visability!="public", exist_ok=True, token=token)
-        df.to_csv(f"hf://datasets/{repo_id}/data.csv", storage_options={"token": token}, index=False)
-        DatasetCard(DATASET_CARD_CONTENT.format(title=title, content=content, url=URL, dataset_url=dataset_url, model_id=model_id, search_query=search_query)).push_to_hub(repo_id=repo_id, repo_type="dataset", token=token)
-        gr.Info(f"✅ Dataset saved at {repo_id}")
-        additional_message = "PS: You can also save datasets under your account in the Settings ;)"
-        yield {open_dataset_message: gr.Markdown(f"# 🎉 Yay ! Your dataset has been saved to [{repo_id}](https://huggingface.co/datasets/{repo_id}) !\n\nDataset link: [https://huggingface.co/datasets/{repo_id}](https://huggingface.co/datasets/{repo_id})\n\n{additional_message}", visible=True)}
-        print(f"Saved {dataset_name}!")
-
-
-    @dataset_share_button.click(inputs=[dataset_title, search_bar], outputs=[dataset_share_textbox])
-    def show_dataset_url(title, search_query):
-        dataset_name, tags = title.strip("# ").split("\ntags:", 1)
-        dataset_name, tags = dataset_name.strip(), tags.strip()
-        return gr.Textbox(
-            f"{URL}?q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}",
-            visible=True,
-        )
-
-    @demo.load(outputs=[dataset_title, dataset_content, dataset_dataframe, search_bar])
-    def load_app(request: gr.Request, oauth_token: Optional[gr.OAuthToken]):
-        if oauth_token:
-            user_info = whoami(oauth_token.token)
-            yield {
-                select_namespace_dropdown: gr.Dropdown(
-                    choices=[user_info["name"]] + [org_info["name"] for org_info in user_info["orgs"]],
-                    value=user_info["name"],
-                    visible=True,
-                ),
-                visibility_radio: gr.Radio(interactive=True),
-            }
-        query_params = dict(request.query_params)
-        if "dataset" in query_params:
-            yield from _show_dataset(
-                search_query=query_params.get("q", query_params["dataset"]),
-                dataset_name=query_params["dataset"],
-                tags=query_params.get("tags", "")
-            )
-        elif "q" in query_params:
-            yield {search_bar: query_params["q"]}
-            yield from _search_datasets(query_params["q"])
-        else:
-            # Default behavior
-            yield {file_uploader: gr.File(visible=True)}
-
-    @demo.upload(
-        inputs=[search_bar, dataset_title, dataset_content, dataset_dataframe, select_namespace_dropdown, visibility_radio],
-        outputs=[save_dataset_button, open_dataset_message]
-    )
-    def upload_dataset(search_query, dataset_name, dataset_content, df, namespace, visibility):
-        # Parse dataset name and tags
-        dataset_name, tags = dataset_name.strip("# ").split("\ntags:", 1)
-        dataset_name, tags = dataset_name.strip(), tags.strip()
-
-        # Create local directory structure
-        base_dir = os.path.join(os.getcwd(), "datasets")
-        dataset_dir = os.path.join(base_dir, dataset_name)
-        os.makedirs(dataset_dir, exist_ok=True)
-
-        # Parse and clean preview dataframe
-        csv_header, preview_df = parse_preview_df(dataset_content)
-
-        # Remove dummy "id" columns
-        for column_name, values in preview_df.to_dict(orient="series").items():
-            try:
-                if [int(v) for v in values] == list(range(len(preview_df))):
-                    preview_df = preview_df.drop(columns=column_name)
-                if [int(v) for v in values] == list(range(1, len(preview_df) + 1)):
-                    preview_df = preview_df.drop(columns=column_name)
-            except Exception:
-                pass
-
-        columns = list(preview_df)
-        output: list[Optional[dict]] = [None] * NUM_ROWS
-        output[:len(preview_df)] = [{"idx": i, **x} for i, x in enumerate(preview_df.to_dict(orient="records"))]
-
-        # Update UI to show upload progress
-        yield {
-            save_dataset_button: gr.Button(
-                f"💾 Save Dataset {namespace}/{dataset_name}" +
-                (" (private)" if visibility != "public" else ""),
-                interactive=False
-            ),
-            open_dataset_message: gr.Markdown(f"Uploading dataset {dataset_name}...")
-        }
-
-        try:
-            # Get authentication token
-            token = oauth_token.token if oauth_token else save_dataset_hf_token
-            repo_id = f"{namespace}/{dataset_name}"
-
-            # Save files locally first
-            local_csv_path = os.path.join(dataset_dir, "data.csv")
-            local_readme_path = os.path.join(dataset_dir, "README.md")
-
-            # Save CSV locally
-            df.to_csv(local_csv_path, index=False)
-
-            # Create dataset card content
-            dataset_url = f"{URL}?q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}"
-            dataset_card = DatasetCard(
-                DATASET_CARD_CONTENT.format(
-                    title=title,
-                    content=content,
-                    url=URL,
-                    dataset_url=dataset_url,
-                    model_id=model_id,
-                    search_query=search_query
-                )
-            )
-
-            # Save README locally
-            with open(local_readme_path, 'w', encoding='utf-8') as f:
-                f.write(str(dataset_card))
-
-            # Create and upload to Hub
-            create_repo(
-                repo_id=repo_id,
-                repo_type="dataset",
-                private=visibility != "public",
-                exist_ok=True,
-                token=token
-            )
-
-            # Upload files to Hub
-            df.to_csv(
-                f"hf://datasets/{repo_id}/data.csv",
-                storage_options={"token": token},
-                index=False
-            )
-            dataset_card.push_to_hub(
-                repo_id=repo_id,
-                repo_type="dataset",
-                token=token
-            )
-
-            # Show success message
-            gr.Info(f"✅ Dataset saved at {repo_id}")
-            additional_message = "PS: You can also save datasets under your account in the Settings ;)"
-            yield {
-                open_dataset_message: gr.Markdown(
-                    f"# 🎉 Yay ! Your dataset has been saved to [{repo_id}](https://huggingface.co/datasets/{repo_id}) !\n\n"
-                    f"Dataset link: [https://huggingface.co/datasets/{repo_id}](https://huggingface.co/datasets/{repo_id})\n\n"
-                    f"{additional_message}",
-                    visible=True
-                )
-            }
-            print(f"Saved {dataset_name}!")
-
-        except Exception as e:
-            print(f"Error saving dataset: {e}")
-            yield {
-                open_dataset_message: gr.Markdown(
-                    f"❌ Error saving dataset: {str(e)}",
-                    visible=True
-                )
-            }
+    # Define the remaining functions and event handlers...

-
-demo.launch()
+demo.launch()