acecalisto3 committed on
Commit
8d920b7
·
verified ·
1 Parent(s): f4cda3f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +262 -65
app.py CHANGED
@@ -348,10 +348,20 @@ def stream_response(msg: str, history: list[Dict[str, str]] = [], max_tokens=500
348
  print(f"Unexpected LLM error (attempt {attempt+1}): {e}. Retrying...")
349
  time.sleep(2**attempt)
350
 
351
- def generate_dataset_names(search_query: str, history: list[Dict[str, str]]) -> Iterator[str]:
352
  """Generates dataset names based on a search query using the LLM."""
353
  query = search_query[:1000] if search_query else ""
354
- prompt = GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=query)
 
 
 
 
 
 
 
 
 
 
355
 
356
  full_response = ""
357
  for token in stream_response(prompt, history):
@@ -362,12 +372,23 @@ def generate_dataset_names(search_query: str, history: list[Dict[str, str]]) ->
362
  history.append({"role": "assistant", "content": full_response}) # Update history
363
  # No return needed as history is modified in place
364
 
365
- def generate_dataset_content(search_query: str, dataset_name: str, tags: str, history: list[Dict[str, str]]) -> Iterator[str]:
366
  """Generates the description and CSV preview for a dataset."""
367
  query = search_query[:1000] if search_query else ""
368
- prompt = GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
369
- search_query=query, dataset_name=dataset_name, tags=tags
370
- )
 
 
 
 
 
 
 
 
 
 
 
371
 
372
  full_response = ""
373
  for token in stream_response(prompt, history):
@@ -411,15 +432,33 @@ def iflatmap_unordered(func: Callable, kwargs_iterable: Iterable[dict]) -> Itera
411
 
412
  def generate_partial_dataset(
413
  title: str, content: str, search_query: str, variant: str, csv_header: str,
414
- output: list[Optional[dict]], indices_to_generate: list[int], history: list[Dict[str, str]]
 
415
  ) -> Iterator[int]:
416
  """Generates a batch of dataset rows for a specific variant."""
417
  dataset_name, tags = title.strip("# ").split("\ntags:", 1)
418
  dataset_name, tags = dataset_name.strip(), tags.strip()
419
 
420
  prompt = GENERATE_MORE_ROWS.format(csv_header=csv_header) + " " + variant
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  messages = [
422
- {"role": "user", "content": GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(search_query=search_query, dataset_name=dataset_name, tags=tags)},
423
  {"role": "assistant", "content": title + "\n\n" + content},
424
  {"role": "user", "content": prompt},
425
  ]
@@ -531,6 +570,11 @@ def get_repo_visibility(repo_id: str, token: str) -> str:
531
 
532
  with gr.Blocks(css=css) as demo:
533
  generated_texts_state = gr.State((landing_page_datasets_generated_text,)) # State for generated dataset names
 
 
 
 
 
534
 
535
  # --- Search Page UI ---
536
  with gr.Column(visible=True, elem_id="search-page") as search_page:
@@ -591,24 +635,50 @@ with gr.Blocks(css=css) as demo:
591
  label="Dataset Visibility", info="Set visibility for datasets saved to Hugging Face Hub."
592
  )
593
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
594
  # --- Dataset Detail Page UI ---
595
  with gr.Column(visible=False, elem_id="dataset-page") as dataset_page:
596
  gr.Markdown("# 🤗 Infinite Dataset Hub ♾️\n\nAn endless catalog of datasets, created just for you.")
597
- dataset_title = gr.Markdown() # Dataset name and tags
598
- gr.Markdown("_Note: This is an AI-generated dataset so its content may be inaccurate or false_")
599
- dataset_content = gr.Markdown() # Description and CSV preview
600
- generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")
601
- dataset_dataframe = gr.DataFrame(visible=False, interactive=False, wrap=True) # Holds the full generated dataset
602
- save_dataset_button = gr.Button("💾 Save Dataset", variant="primary", visible=False)
 
 
 
 
603
  open_dataset_message = gr.Markdown("", visible=False) # Confirmation message
604
  dataset_share_button = gr.Button("Share Dataset URL")
605
  dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
 
 
 
 
 
 
 
 
606
  back_button = gr.Button("< Back", size="sm")
607
 
608
  # --- Event Handlers ---
609
 
610
  # Search Logic
611
- def _update_search_results(search_query: str, current_generated_texts: tuple[str]):
612
  """Handles dataset search and UI updates."""
613
  # Reset UI to loading state
614
  yield {btn: gr.Button("⬜⬜⬜⬜⬜⬜", elem_classes="topButton linear-background") for btn in buttons[::2]}
@@ -620,7 +690,7 @@ with gr.Blocks(css=css) as demo:
620
 
621
  try:
622
  # Generate dataset names from LLM
623
- for line in generate_dataset_names(search_query, []):
624
  if "I'm sorry" in line or "policy" in line: raise gr.Error("Inappropriate content detected.")
625
  if generated_count >= MAX_NB_ITEMS_PER_GENERATION_CALL: break
626
 
@@ -646,58 +716,106 @@ with gr.Blocks(css=css) as demo:
646
  except Exception as e: raise gr.Error(f"Failed to generate datasets: {str(e)}")
647
 
648
  # Attach search handlers
649
- search_button.click(_update_search_results, inputs=[search_bar, generated_texts_state], outputs=buttons + [generated_texts_state] + button_groups)
650
- search_bar.submit(_update_search_results, inputs=[search_bar, generated_texts_state], outputs=buttons + [generated_texts_state] + button_groups)
 
 
 
 
 
 
 
 
651
 
652
  # Load More Datasets
653
- load_more_datasets.click(_update_search_results, inputs=[search_bar, generated_texts_state], outputs=buttons + [generated_texts_state] + button_groups)
 
 
 
 
654
 
655
  # Display Single Dataset Details
656
- def _show_dataset_details(search_query, dataset_name, tags):
657
  """Switches to detail view and loads dataset content."""
658
  yield {
659
  search_page: gr.Column(visible=False), dataset_page: gr.Column(visible=True),
660
- dataset_title: f"# {dataset_name}\n\n tags: {tags}",
661
- dataset_share_textbox: gr.Textbox(visible=False), dataset_dataframe: gr.DataFrame(visible=False),
662
- generate_full_dataset_button: gr.Button(interactive=True), save_dataset_button: gr.Button(visible=False),
 
663
  open_dataset_message: gr.Markdown("", visible=False)
664
  }
 
 
 
 
 
 
 
 
 
 
 
665
  # Stream content generation
666
- for content_chunk in generate_dataset_content(search_query, dataset_name, tags, []):
667
- yield {dataset_content: content_chunk}
668
 
669
  # Link buttons to the detail view function
670
- show_dataset_inputs = [search_bar, *buttons]
671
- show_dataset_outputs = [search_page, dataset_page, dataset_title, dataset_content, generate_full_dataset_button, dataset_dataframe, save_dataset_button, open_dataset_message, dataset_share_textbox]
672
-
673
- scroll_to_top_js = """
674
- function() {
675
- if ('parentIFrame' in window) { window.parentIFrame.scrollTo({top: 0, behavior:'smooth'}); }
676
- else { window.scrollTo({ top: 0, behavior: 'smooth' }); }
677
- return Array.from(arguments);
678
- }
679
- """
680
- def _show_dataset_from_button(search_query, *buttons_values, i):
681
- dataset_name, tags = buttons_values[2 * i : 2 * i + 2]
682
- yield from _show_dataset_details(search_query, dataset_name, tags)
 
 
 
 
 
683
 
 
684
  for i, (name_btn, tag_btn) in enumerate(batched(buttons, 2)):
685
- name_btn.click(partial(_show_dataset_from_button, i=i), inputs=show_dataset_inputs, outputs=show_dataset_outputs, js=scroll_to_top_js)
686
- tag_btn.click(partial(_show_dataset_from_button, i=i), inputs=show_dataset_inputs, outputs=show_dataset_outputs, js=scroll_to_top_js)
 
 
 
 
 
 
 
 
687
 
688
  # Back Button Navigation
689
- back_button.click(lambda: (gr.Column(visible=True), gr.Column(visible=False)), outputs=[search_page, dataset_page], js=scroll_to_top_js)
 
 
 
 
 
 
690
 
691
  # Full Dataset Generation
692
  @generate_full_dataset_button.click(
693
- inputs=[dataset_title, dataset_content, search_bar, select_namespace_dropdown, visibility_radio, refinement_mode],
694
- outputs=[dataset_dataframe, generate_full_dataset_button, save_dataset_button]
695
  )
696
- def _generate_full_dataset(title, content, search_query, namespace, visibility, mode):
697
- dataset_name, tags = title.strip("# ").split("\ntags:", 1)
698
- dataset_name, tags = dataset_name.strip(), tags.strip()
699
-
700
- try: csv_header, preview_df = parse_preview_df(content)
 
 
 
 
701
  except ValueError as e: raise gr.Error(f"Failed to parse preview: {e}")
702
 
703
  refined_preview_df = refine_preview_data(preview_df, mode)
@@ -710,9 +828,10 @@ with gr.Blocks(css=css) as demo:
710
 
711
  # Update UI: show preview, disable generate, show save button
712
  yield {
713
- dataset_dataframe: gr.DataFrame(pd.DataFrame([r for r in output_data if r]), visible=True),
714
  generate_full_dataset_button: gr.Button(interactive=False),
715
- save_dataset_button: gr.Button(f"💾 Save {namespace}/{dataset_name}" + (" (private)" if visibility != "public" else ""), visible=True, interactive=False)
 
716
  }
717
 
718
  # Prepare generation tasks for variants
@@ -724,27 +843,32 @@ with gr.Blocks(css=css) as demo:
724
  generation_tasks.append({
725
  "func": generate_partial_dataset,
726
  "kwargs": {
727
- "title": title, "content": content, "search_query": search_query, "variant": variant,
728
  "csv_header": csv_header, "output": output_data, "indices_to_generate": indices,
729
- "history": [] # Use fresh history for each variant task
 
730
  }
731
  })
732
 
733
  # Execute tasks in parallel and update UI progressively
734
  for _ in iflatmap_unordered(lambda **kw: kw.pop('func')(**kw), generation_tasks):
735
- yield {dataset_dataframe: pd.DataFrame([r for r in output_data if r])} # Update DataFrame display
736
 
737
  yield {save_dataset_button: gr.Button(interactive=True)} # Enable save button
738
  print(f"Full dataset generation complete for {dataset_name}.")
739
 
740
  # Save Dataset to Hugging Face Hub
741
  @save_dataset_button.click(
742
- inputs=[dataset_title, dataset_content, search_bar, dataset_dataframe, select_namespace_dropdown, visibility_radio],
743
  outputs=[save_dataset_button, open_dataset_message]
744
  )
745
- def _save_dataset(title, content, search_query, df, namespace, visibility, oauth_token):
746
- dataset_name, tags = title.strip("# ").split("\ntags:", 1)
747
- dataset_name, tags = dataset_name.strip(), tags.strip()
 
 
 
 
748
 
749
  token = oauth_token.token if oauth_token else save_dataset_hf_token
750
  if not token: raise gr.Error("Login required or set SAVE_DATASET_HF_TOKEN.")
@@ -760,7 +884,7 @@ with gr.Blocks(css=css) as demo:
760
  create_repo(repo_id=repo_id, repo_type="dataset", private=visibility!="public", exist_ok=True, token=token)
761
  df.to_csv(f"hf://datasets/{repo_id}/data.csv", storage_options={"token": token}, index=False)
762
 
763
- card_content = DATASET_CARD_CONTENT.format(title=title, content=content, url=URL, dataset_url=dataset_url, model_id=model_id, search_query=search_query)
764
  DatasetCard(card_content).push_to_hub(repo_id=repo_id, repo_type="dataset", token=token)
765
 
766
  success_msg = f"# 🎉 Yay! Dataset saved to [{repo_id}](https://huggingface.co/datasets/{repo_id})!\n\n_PS: Check Settings to manage your saved datasets._"
@@ -772,16 +896,22 @@ with gr.Blocks(css=css) as demo:
772
  finally: yield {save_dataset_button: gr.Button(interactive=True)} # Re-enable button
773
 
774
  # Shareable URL Generation
775
- @dataset_share_button.click(inputs=[dataset_title, search_bar], outputs=[dataset_share_textbox])
776
- def _show_share_url(title, search_query):
777
- dataset_name, tags = title.strip("# ").split("\ntags:", 1)
778
- dataset_name, tags = dataset_name.strip(), tags.strip()
 
 
 
 
779
  share_url = f"{URL}?q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}"
780
  return gr.Textbox(share_url, visible=True)
781
 
782
  # Settings Toggles
783
  refinement_mode.change(lambda mode: gr.Group(visible=(mode == "sourced")), outputs=[source_group])
784
 
 
 
785
  @load_source_button.click(inputs=[source_type, source_path], outputs=[source_status])
786
  def _load_source_data(source_type, source_path):
787
  if not source_path: raise gr.Error("Source path/URL is required.")
@@ -792,8 +922,50 @@ with gr.Blocks(css=css) as demo:
792
  except (ConnectionError, ValueError, RuntimeError) as e:
793
  raise gr.Error(f"Failed to load source: {str(e)}")
794
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795
  # Initial App Load Logic
796
- @demo.load(outputs=show_dataset_outputs + buttons + [generated_texts_state] + [select_namespace_dropdown, visibility_radio, source_group])
 
 
797
  def _load_app(request: gr.Request, oauth_token: Optional[gr.OAuthToken]):
798
  # Handle user login and namespace selection
799
  if oauth_token:
@@ -818,13 +990,38 @@ with gr.Blocks(css=css) as demo:
818
  # Handle URL parameters for direct search or dataset loading
819
  query_params = dict(request.query_params)
820
  if "dataset" in query_params:
821
- yield from _show_dataset_details(query_params.get("q", query_params["dataset"]), query_params["dataset"], query_params.get("tags", ""))
 
 
 
822
  elif "q" in query_params:
823
  search_query = query_params["q"]
 
 
824
  yield {search_bar: search_query}
825
- yield from _update_search_results(search_query, ()) # Perform initial search
 
826
  else:
827
  yield {search_page: gr.Column(visible=True)} # Show search page by default
828
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
829
  if __name__ == "__main__":
830
  demo.launch(share=False, server_name="0.0.0.0")
 
348
  print(f"Unexpected LLM error (attempt {attempt+1}): {e}. Retrying...")
349
  time.sleep(2**attempt)
350
 
351
+ def generate_dataset_names(search_query: str, history: list[Dict[str, str]], is_real_data: bool = False, engine: Optional[str] = None) -> Iterator[str]:
352
  """Generates dataset names based on a search query using the LLM."""
353
  query = search_query[:1000] if search_query else ""
354
+
355
+ if is_real_data and engine:
356
+ prompt = (
357
+ f"@Claude-3.7-Sonnet You are a data specialist who can transform real search results into structured datasets. "
358
+ f"A user is searching for data about: \"{query}\" "
359
+ f"Imagine you've queried {engine} and received real search results. Create a list of {MAX_NB_ITEMS_PER_GENERATION_CALL} specific datasets that could be created from these search results. "
360
+ f"For each dataset: 1. Give it a clear, specific name related to the search topic. 2. Include 3-5 relevant tags in parentheses, with one tag specifying the ML task type (classification, regression, clustering, etc.). "
361
+ f"Format each dataset as: 1. DatasetName (tag1, tag2, ml_task_tag). Make these datasets sound like real collections that could be created from {engine} search results on \"{query}\"."
362
+ )
363
+ else:
364
+ prompt = GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=query)
365
 
366
  full_response = ""
367
  for token in stream_response(prompt, history):
 
372
  history.append({"role": "assistant", "content": full_response}) # Update history
373
  # No return needed as history is modified in place
374
 
375
+ def generate_dataset_content(search_query: str, dataset_name: str, tags: str, history: list[Dict[str, str]], is_real_data: bool = False, engine: Optional[str] = None) -> Iterator[str]:
376
  """Generates the description and CSV preview for a dataset."""
377
  query = search_query[:1000] if search_query else ""
378
+
379
+ if is_real_data and engine:
380
+ prompt = (
381
+ f"@Claude-3.7-Sonnet You're a specialist in converting web search results into structured data. "
382
+ f"Based on search results from {engine} about \"{query}\", create a preview of the dataset \"{dataset_name}\" with tags \"{tags}\". "
383
+ f"First, write a detailed description of what this dataset contains, its structure, and how it was constructed from web search results. "
384
+ f"Then, generate a realistic 5-row CSV preview that resembles data you might get if you scraped and structured real results from {engine}. "
385
+ f"Format your response with: **Dataset Description:** [detailed description] **CSV Content Preview:** ```csv [CSV header and 5 rows of realistic data] ``` "
386
+ f"Include relevant columns for the dataset type, with proper labels/categories where appropriate. The data should look like it came from real sources."
387
+ )
388
+ else:
389
+ prompt = GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
390
+ search_query=query, dataset_name=dataset_name, tags=tags
391
+ )
392
 
393
  full_response = ""
394
  for token in stream_response(prompt, history):
 
432
 
433
  def generate_partial_dataset(
434
  title: str, content: str, search_query: str, variant: str, csv_header: str,
435
+ output: list[Optional[dict]], indices_to_generate: list[int], history: list[Dict[str, str]],
436
+ is_real_data: bool = False, engine: Optional[str] = None
437
  ) -> Iterator[int]:
438
  """Generates a batch of dataset rows for a specific variant."""
439
  dataset_name, tags = title.strip("# ").split("\ntags:", 1)
440
  dataset_name, tags = dataset_name.strip(), tags.strip()
441
 
442
  prompt = GENERATE_MORE_ROWS.format(csv_header=csv_header) + " " + variant
443
+
444
+ # Construct initial messages for context
445
+ initial_prompt = ""
446
+ if is_real_data and engine:
447
+ initial_prompt = (
448
+ f"@Claude-3.7-Sonnet You're a specialist in converting web search results into structured data. "
449
+ f"Based on search results from {engine} about \"{search_query}\", create a preview of the dataset \"{dataset_name}\" with tags \"{tags}\". "
450
+ f"First, write a detailed description of what this dataset contains, its structure, and how it was constructed from web search results. "
451
+ f"Then, generate a realistic 5-row CSV preview that resembles data you might get if you scraped and structured real results from {engine}. "
452
+ f"Format your response with: **Dataset Description:** [detailed description] **CSV Content Preview:** ```csv [CSV header and 5 rows of realistic data] ``` "
453
+ f"Include relevant columns for the dataset type, with proper labels/categories where appropriate. The data should look like it came from real sources."
454
+ )
455
+ else:
456
+ initial_prompt = GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
457
+ search_query=search_query, dataset_name=dataset_name, tags=tags
458
+ )
459
+
460
  messages = [
461
+ {"role": "user", "content": initial_prompt},
462
  {"role": "assistant", "content": title + "\n\n" + content},
463
  {"role": "user", "content": prompt},
464
  ]
 
570
 
571
with gr.Blocks(css=css) as demo:
    # --- Per-session state ---
    # Tuple of raw LLM outputs from prior name-generation calls; seeds the landing page and lets "Load more" extend results.
    generated_texts_state = gr.State((landing_page_datasets_generated_text,))
    # Holds current dataset details for generation. NOTE(review): never written in the visible code — confirm it is actually used.
    current_dataset_state = gr.State(None)
    # Whether results should be framed as coming from real search engines.
    is_real_data_state = gr.State(True)
    # Name of the search engine in use. NOTE(review): defaults to None even though is_real_data_state defaults to True — confirm intended.
    current_engine_state = gr.State(None)
    # Engines ticked by default in the settings modal.
    selected_engines_state = gr.State(["DuckDuckGo.com", "Bing.com", "Search.Yahoo.com", "Search.Brave.com", "Ecosia.org"])
    # Full catalogue of engines offered in the settings modal (camelCase name kept: it is referenced elsewhere in the file).
    searchEngines = ["AlltheInternet.com", "DuckDuckGo.com", "Google.com", "Bing.com", "Search.Yahoo.com", "Startpage.com", "Qwant.com", "Ecosia.org", "WolframAlpha.com", "Mojeek.co.uk", "Search.Brave.com", "Yandex.com", "Baidu.com", "Gibiru.com", "MetaGer.org", "Swisscows.com", "Presearch.com", "Ekoru.org", "Search.Lilo.org"]
578
 
579
  # --- Search Page UI ---
580
  with gr.Column(visible=True, elem_id="search-page") as search_page:
 
635
  label="Dataset Visibility", info="Set visibility for datasets saved to Hugging Face Hub."
636
  )
637
 
638
# Search Engine Settings
gr.Markdown("Search Engine Configuration")
# Checkbox driving is_real_data_state via data_source_toggle.change below.
data_source_toggle = gr.Checkbox(label="Use Real Search Data", value=True, info="Toggle to include results from real search engines.")
engine_settings_button = gr.Button("Configure Search Engines", icon="https://img.icons8.com/ios-filled/50/000000/settings--v1.png", size="sm")

# Engine Selection Modal
# NOTE(review): core Gradio does not provide gr.Modal (a Modal component exists in
# the third-party `gradio_modal` package, and it takes `elem_id`, not `id`) — confirm
# this dependency/API before shipping.
with gr.Modal("Search Engine Settings", id="engine-modal") as engine_modal:
    gr.Markdown("Select which search engines to use for real data retrieval. A diverse selection improves results.")
    # Populated by _populate_engine_options with one checkbox row per engine.
    engine_options_html_comp = gr.HTML(elem_id="engine-options")
    with gr.Row():
        select_all_engines_btn = gr.Button("Select All")
        deselect_all_engines_btn = gr.Button("Deselect All")
        save_engines_btn = gr.Button("Save Settings", variant="primary")
651
+
652
# --- Dataset Detail Page UI ---
with gr.Column(visible=False, elem_id="dataset-page") as dataset_page:
    gr.Markdown("# 🤗 Infinite Dataset Hub ♾️\n\nAn endless catalog of datasets, created just for you.")
    dataset_title_md = gr.Markdown()  # Dataset name and tags ("# name\n\n tags: ...")
    dataset_source_badge = gr.Markdown()  # Badge indicating real vs AI-generated data
    dataset_source_info = gr.Markdown()  # Sentence describing where the data came from
    dataset_description_md = gr.Markdown()  # Streamed description + CSV preview text
    # NOTE(review): never shown/written by any visible handler — confirm it is needed.
    preview_table_comp = gr.DataFrame(visible=False, interactive=False, wrap=True)

    with gr.Row():
        generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")
        save_dataset_button = gr.Button("💾 Save Dataset", variant="primary", visible=False)

    open_dataset_message = gr.Markdown("", visible=False)  # Save-confirmation message
    dataset_share_button = gr.Button("Share Dataset URL")
    dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)

    # NOTE(review): gr.Column is instantiated here without `with`, so the table and
    # download buttons below are NOT children of this column — toggling its
    # visibility will not hide them. Confirm intended layout.
    full_dataset_section = gr.Column(visible=False)  # Container for full dataset and downloads
    full_table_comp = gr.DataFrame(visible=False, interactive=False, wrap=True)
    with gr.Row():
        # NOTE(review): no click handlers for these buttons appear in the visible code.
        download_csv_button = gr.Button("Download CSV")
        download_json_button = gr.Button("Download JSON")
        download_parquet_button = gr.Button("Download Parquet")

    back_button = gr.Button("< Back", size="sm")
677
 
678
  # --- Event Handlers ---
679
 
680
  # Search Logic
681
+ def _update_search_results(search_query: str, current_generated_texts: tuple[str], is_real_data: bool, engine: Optional[str]):
682
  """Handles dataset search and UI updates."""
683
  # Reset UI to loading state
684
  yield {btn: gr.Button("⬜⬜⬜⬜⬜⬜", elem_classes="topButton linear-background") for btn in buttons[::2]}
 
690
 
691
  try:
692
  # Generate dataset names from LLM
693
+ for line in generate_dataset_names(search_query, [], is_real_data=is_real_data, engine=engine):
694
  if "I'm sorry" in line or "policy" in line: raise gr.Error("Inappropriate content detected.")
695
  if generated_count >= MAX_NB_ITEMS_PER_GENERATION_CALL: break
696
 
 
716
  except Exception as e: raise gr.Error(f"Failed to generate datasets: {str(e)}")
717
 
718
  # Attach search handlers
719
+ search_button.click(
720
+ _update_search_results,
721
+ inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
722
+ outputs=buttons + [generated_texts_state] + button_groups
723
+ )
724
+ search_bar.submit(
725
+ _update_search_results,
726
+ inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
727
+ outputs=buttons + [generated_texts_state] + button_groups
728
+ )
729
 
730
  # Load More Datasets
731
+ load_more_datasets.click(
732
+ _update_search_results,
733
+ inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
734
+ outputs=buttons + [generated_texts_state] + button_groups
735
+ )
736
 
737
  # Display Single Dataset Details
738
def _show_dataset_details(search_query, dataset_name, tags, is_real_data, engine):
    """Show the dataset detail page and stream its generated description.

    Yields dicts of Gradio component updates: first the page switch and a
    reset of stale widgets, then the data-source badge/info pair, then the
    streamed description chunks.
    """
    # Flip to the detail page and clear anything left over from a previous view.
    yield {
        search_page: gr.Column(visible=False),
        dataset_page: gr.Column(visible=True),
        dataset_title_md: f"# {dataset_name}\n\n tags: {tags}",
        dataset_share_textbox: gr.Textbox(visible=False),
        full_dataset_section: gr.Column(visible=False),
        save_dataset_button: gr.Button(visible=False),
        open_dataset_message: gr.Markdown("", visible=False),
    }

    # Pick the badge/info pair describing where the data comes from.
    if is_real_data:
        badge = gr.Markdown('<span class="px-3 py-1 rounded-full text-xs font-medium bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200">Real Data</span>', visible=True)
        info = gr.Markdown(f'This dataset is based on real information queried from <strong>{engine}</strong> for the search term "<strong>{search_query}</strong>". The data has been structured for machine learning use.', visible=True)
    else:
        badge = gr.Markdown('<span class="px-3 py-1 rounded-full text-xs font-medium bg-purple-100 text-purple-800 dark:bg-purple-900 dark:text-purple-200">AI-Generated</span>', visible=True)
        info = gr.Markdown(f'This is an AI-generated dataset created using {model_id}. The content is synthetic and designed to represent plausible data related to "{search_query}".', visible=True)
    yield {dataset_source_badge: badge, dataset_source_info: info}

    # Stream the LLM-generated description/preview into the page as it arrives.
    for chunk in generate_dataset_content(search_query, dataset_name, tags, [], is_real_data=is_real_data, engine=engine):
        yield {dataset_description_md: chunk}
762
 
763
  # Link buttons to the detail view function
764
def _show_dataset_from_button_wrapper(search_query, *buttons_values, i=None):
    """Resolve which dataset button was clicked and open its detail view.

    Args:
        search_query: Current text of the search bar.
        *buttons_values: Values of all name/tag buttons, interleaved as
            (name_0, tags_0, name_1, tags_1, ...).
        i: Optional dataset index (keyword-only, backward compatible). When
            handlers are wired with ``partial(..., i=i)`` this pinpoints the
            clicked dataset exactly; when omitted, the legacy scan below runs.

    Yields component-update dicts produced by ``_show_dataset_details``.
    """
    if i is None:
        # Legacy fallback: pick the first non-empty button value.
        # NOTE(review): after a search *all* buttons are populated, so this
        # scan always resolves to dataset 0 regardless of which button was
        # clicked. Pass i via partial(..., i=i) at wiring time to fix this.
        clicked_index = next(
            (idx for idx, value in enumerate(buttons_values) if value is not None and value != ""),
            -1,
        )
        if clicked_index == -1:
            return  # no populated buttons; nothing to show
        # Name buttons sit at even indices, tag buttons at odd indices.
        i = clicked_index // 2

    dataset_name, tags = buttons_values[2 * i], buttons_values[2 * i + 1]

    # NOTE(review): gr.State.value is the component's *initial* value, not the
    # live per-session value — pass the states as handler inputs to read them
    # reliably. TODO confirm intended behavior.
    is_real_data = current_engine_state.value is not None
    engine = current_engine_state.value if is_real_data else None

    yield from _show_dataset_details(search_query, dataset_name, tags, is_real_data, engine)
782
 
783
# Wire up click events for all dataset name and tag buttons.
# NOTE(review): the handler is given every button's value but no index of the
# button that fired — consider partial(..., i=i) per button. TODO confirm.
_detail_inputs = [search_bar, *buttons]
_detail_outputs = [
    search_page, dataset_page, dataset_title_md, dataset_description_md,
    dataset_source_badge, dataset_source_info, dataset_share_textbox,
    full_dataset_section, save_dataset_button, open_dataset_message,
]
for name_btn, tag_btn in batched(buttons, 2):
    # Both the name button and its tag button open the same detail view.
    for btn in (name_btn, tag_btn):
        btn.click(
            _show_dataset_from_button_wrapper,
            inputs=_detail_inputs,
            outputs=_detail_outputs,
        )
795
 
796
# Back Button Navigation
# JS runs client-side to smooth-scroll to the top (inside an iframe when embedded).
_scroll_top_js = """
function() {
if ('parentIFrame' in window) { window.parentIFrame.scrollTo({top: 0, behavior:'smooth'}); }
else { window.scrollTo({ top: 0, behavior: 'smooth' }); }
return Array.from(arguments);
}
"""

def _back_to_search():
    """Return to the search page, hiding the dataset detail page."""
    return gr.Column(visible=True), gr.Column(visible=False)

back_button.click(_back_to_search, outputs=[search_page, dataset_page], js=_scroll_top_js)
804
 
805
  # Full Dataset Generation
806
  @generate_full_dataset_button.click(
807
+ inputs=[dataset_title_md, dataset_description_md, search_bar, select_namespace_dropdown, visibility_radio, refinement_mode, is_real_data_state, current_engine_state],
808
+ outputs=[full_table_comp, generate_full_dataset_button, save_dataset_button, full_dataset_section]
809
  )
810
+ def _generate_full_dataset(title_md, content_md, search_query, namespace, visibility, mode, is_real_data, engine):
811
+ # Extract dataset name and tags from the markdown title
812
+ try:
813
+ dataset_name = title_md.split('\n')[0].strip('# ')
814
+ tags = title_md.split('tags:', 1)[1].strip()
815
+ except IndexError:
816
+ raise gr.Error("Could not parse dataset title.")
817
+
818
+ try: csv_header, preview_df = parse_preview_df(content_md)
819
  except ValueError as e: raise gr.Error(f"Failed to parse preview: {e}")
820
 
821
  refined_preview_df = refine_preview_data(preview_df, mode)
 
828
 
829
  # Update UI: show preview, disable generate, show save button
830
  yield {
831
+ full_table_comp: gr.DataFrame(pd.DataFrame([r for r in output_data if r]), visible=True),
832
  generate_full_dataset_button: gr.Button(interactive=False),
833
+ save_dataset_button: gr.Button(f"💾 Save {namespace}/{dataset_name}" + (" (private)" if visibility != "public" else ""), visible=True, interactive=False),
834
+ full_dataset_section: gr.Column(visible=True)
835
  }
836
 
837
  # Prepare generation tasks for variants
 
843
  generation_tasks.append({
844
  "func": generate_partial_dataset,
845
  "kwargs": {
846
+ "title": title_md, "content": content_md, "search_query": search_query, "variant": variant,
847
  "csv_header": csv_header, "output": output_data, "indices_to_generate": indices,
848
+ "history": [], # Use fresh history for each variant task
849
+ "is_real_data": is_real_data, "engine": engine
850
  }
851
  })
852
 
853
  # Execute tasks in parallel and update UI progressively
854
  for _ in iflatmap_unordered(lambda **kw: kw.pop('func')(**kw), generation_tasks):
855
+ yield {full_table_comp: pd.DataFrame([r for r in output_data if r])} # Update DataFrame display
856
 
857
  yield {save_dataset_button: gr.Button(interactive=True)} # Enable save button
858
  print(f"Full dataset generation complete for {dataset_name}.")
859
 
860
  # Save Dataset to Hugging Face Hub
861
  @save_dataset_button.click(
862
+ inputs=[dataset_title_md, dataset_description_md, search_bar, full_table_comp, select_namespace_dropdown, visibility_radio],
863
  outputs=[save_dataset_button, open_dataset_message]
864
  )
865
+ def _save_dataset(title_md, content_md, search_query, df, namespace, visibility, oauth_token):
866
+ # Extract dataset name and tags from the markdown title
867
+ try:
868
+ dataset_name = title_md.split('\n')[0].strip('# ')
869
+ tags = title_md.split('tags:', 1)[1].strip()
870
+ except IndexError:
871
+ raise gr.Error("Could not parse dataset title.")
872
 
873
  token = oauth_token.token if oauth_token else save_dataset_hf_token
874
  if not token: raise gr.Error("Login required or set SAVE_DATASET_HF_TOKEN.")
 
884
  create_repo(repo_id=repo_id, repo_type="dataset", private=visibility!="public", exist_ok=True, token=token)
885
  df.to_csv(f"hf://datasets/{repo_id}/data.csv", storage_options={"token": token}, index=False)
886
 
887
+ card_content = DATASET_CARD_CONTENT.format(title=title_md, content=content_md, url=URL, dataset_url=dataset_url, model_id=model_id, search_query=search_query)
888
  DatasetCard(card_content).push_to_hub(repo_id=repo_id, repo_type="dataset", token=token)
889
 
890
  success_msg = f"# 🎉 Yay! Dataset saved to [{repo_id}](https://huggingface.co/datasets/{repo_id})!\n\n_PS: Check Settings to manage your saved datasets._"
 
896
  finally: yield {save_dataset_button: gr.Button(interactive=True)} # Re-enable button
897
 
898
  # Shareable URL Generation
899
+ @dataset_share_button.click(inputs=[dataset_title_md, search_bar], outputs=[dataset_share_textbox])
900
+ def _show_share_url(title_md, search_query):
901
+ try:
902
+ dataset_name = title_md.split('\n')[0].strip('# ')
903
+ tags = title_md.split('tags:', 1)[1].strip()
904
+ except IndexError:
905
+ raise gr.Error("Could not parse dataset title.")
906
+
907
  share_url = f"{URL}?q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}"
908
  return gr.Textbox(share_url, visible=True)
909
 
910
  # Settings Toggles
911
# Settings toggles: show/hide the source group and mirror the data-source
# checkbox into the two session state holders.
def _toggle_source_group(mode):
    # The source-configuration panel is only relevant in "sourced" mode.
    return gr.Group(visible=(mode == "sourced"))

refinement_mode.change(_toggle_source_group, outputs=[source_group])

def _sync_data_source(value):
    # NOTE(review): when the toggle is on, the engine state receives `value`
    # (i.e. True) rather than an engine name — presumably resolved elsewhere;
    # confirm against the consumers of current_engine_state.
    return gr.State(value), gr.State(value if value else None)

data_source_toggle.change(_sync_data_source, inputs=[data_source_toggle], outputs=[is_real_data_state, current_engine_state])
914
+
915
  @load_source_button.click(inputs=[source_type, source_path], outputs=[source_status])
916
  def _load_source_data(source_type, source_path):
917
  if not source_path: raise gr.Error("Source path/URL is required.")
 
922
  except (ConnectionError, ValueError, RuntimeError) as e:
923
  raise gr.Error(f"Failed to load source: {str(e)}")
924
 
925
# Engine Settings Modal Logic
def _populate_engine_options(selected_engines):
    """Render the search-engine checkbox list as an HTML fragment.

    Engines contained in `selected_engines` are rendered pre-checked; the
    checkboxes carry the `engine-checkbox` class so the front-end script can
    collect the selection back into the hidden JSON input.
    """
    rows = []
    for engine in searchEngines:
        checked = "checked" if engine in selected_engines else ""
        dom_id = f"engine-{engine.replace('.', '_')}"  # '.' is illegal in an id selector
        rows.append(f"""
            <div class="flex items-center">
                <input type="checkbox" id="{dom_id}" class="engine-checkbox mr-2 h-4 w-4" value="{engine}" {checked}>
                <label for="{dom_id}" class="cursor-pointer">{engine}</label>
            </div>
            """)
    return gr.HTML("".join(rows))
937
+
938
def _save_engine_settings(selected_engines_json):
    """Persist the engine selection returned from the settings modal.

    Args:
        selected_engines_json: JSON-encoded list of engine names collected
            from the modal's checkboxes by the front-end.

    Returns:
        Updated engine-list state, current-engine state, and an info
        notification — matching the `save_engines_btn.click` outputs wiring.
    """
    try:
        selected_engines = json.loads(selected_engines_json)
    except (TypeError, ValueError):
        # The payload comes from the browser and may be absent or malformed;
        # fall back safely instead of crashing the callback.
        gr.Warning("Could not read engine selection. Using DuckDuckGo as default.")
        selected_engines = ["DuckDuckGo.com"]

    if not selected_engines:
        gr.Warning("At least one search engine must be selected. Using DuckDuckGo as default.")
        selected_engines = ["DuckDuckGo.com"]

    # selected_engines is guaranteed non-empty at this point.
    current_engine = selected_engines[0]
    return gr.State(selected_engines), gr.State(current_engine), gr.Info(f"Updated search engines. Using {len(selected_engines)} engines.")
946
+
947
# Initialize engine options component with the engines currently selected.
engine_options_html_comp = _populate_engine_options(selected_engines_state.value)

# Re-render the checkbox list every time the modal is opened so it reflects
# the latest selection.
# NOTE(review): calling .update(...) on a component instance is the legacy
# Gradio API; confirm it is supported by the Gradio version pinned here.
engine_settings_button.click(lambda: engine_options_html_comp.update(_populate_engine_options(selected_engines_state.value)), outputs=[engine_options_html_comp])

# Convenience buttons: re-render with everything checked / nothing checked.
select_all_engines_btn.click(lambda: engine_options_html_comp.update(_populate_engine_options(searchEngines)), outputs=[engine_options_html_comp])
deselect_all_engines_btn.click(lambda: engine_options_html_comp.update(_populate_engine_options([])), outputs=[engine_options_html_comp])

# Persist the selection made in the modal.
# NOTE(review): gr.JSON is instantiated inline here as an input and gr.Info()
# appears as an output component — both look suspect; verify this wiring
# actually delivers the checked engines and shows the notification.
save_engines_btn.click(
    _save_engine_settings,
    inputs=[gr.JSON(elem_id="engine-options")], # Capture checked engines from modal
    outputs=[selected_engines_state, current_engine_state, gr.Info()]
)

# Second handler on the same button: also open the modal (both .click
# handlers registered on engine_settings_button fire on each click).
engine_settings_button.click(lambda: engine_modal.update(visible=True), outputs=[engine_modal])
# Close modal on save or when clicking outside (implicit via Gradio's modal handling)
964
+
965
  # Initial App Load Logic
966
+ @demo.load(outputs=([search_page, dataset_page, dataset_title_md, dataset_description_md, dataset_source_badge, dataset_source_info, dataset_share_textbox, full_dataset_section, save_dataset_button, open_dataset_message, search_bar] + # Outputs for detail page and search bar
967
+ buttons + [generated_texts_state] + # Outputs for search results buttons and state
968
+ [select_namespace_dropdown, visibility_radio, source_group, data_source_toggle, current_engine_state, selected_engines_state, engine_options_html_comp])) # Outputs for settings
969
  def _load_app(request: gr.Request, oauth_token: Optional[gr.OAuthToken]):
970
  # Handle user login and namespace selection
971
  if oauth_token:
 
990
  # Handle URL parameters for direct search or dataset loading
991
  query_params = dict(request.query_params)
992
  if "dataset" in query_params:
993
+ is_real = query_params.get("engine") is not None
994
+ engine = query_params.get("engine")
995
+ yield from _show_dataset_details(query_params.get("q", query_params["dataset"]), query_params["dataset"], query_params.get("tags", ""), is_real, engine)
996
+ yield {is_real_data_state: is_real, current_engine_state: engine}
997
  elif "q" in query_params:
998
  search_query = query_params["q"]
999
+ is_real = query_params.get("engine") is not None
1000
+ engine = query_params.get("engine")
1001
  yield {search_bar: search_query}
1002
+ yield {is_real_data_state: is_real, current_engine_state: engine}
1003
+ yield from _update_search_results(search_query, (), is_real, engine)
1004
  else:
1005
  yield {search_page: gr.Column(visible=True)} # Show search page by default
1006
 
1007
+ # Initialize with default datasets
1008
+ initial_outputs = {}
1009
+ for i, line in enumerate(default_output):
1010
+ try: dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
1011
+ except ValueError: dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" ", 1)[0], ""
1012
+
1013
+ initial_outputs[buttons[2 * i]] = gr.Button(dataset_name, elem_classes="topButton")
1014
+ initial_outputs[buttons[2 * i + 1]] = gr.Button(tags, elem_classes="bottomButton")
1015
+ initial_outputs[button_groups[i]] = gr.Group(elem_classes="buttonsGroup")
1016
+ yield initial_outputs
1017
+ yield {generated_texts_state: (landing_page_datasets_generated_text,)}
1018
+
1019
+ # Initialize engine settings UI
1020
+ yield {
1021
+ data_source_toggle: gr.Checkbox(value=is_real_data_state.value),
1022
+ engine_options_html_comp: _populate_engine_options(selected_engines_state.value)
1023
+ }
1024
+
1025
+
1026
if __name__ == "__main__":
    # Bind on all interfaces so the app is reachable from outside a
    # container; public Gradio share links are disabled.
    demo.launch(share=False, server_name="0.0.0.0")