Update app.py
app.py CHANGED
@@ -348,10 +348,20 @@ def stream_response(msg: str, history: list[Dict[str, str]] = [], max_tokens=500
             print(f"Unexpected LLM error (attempt {attempt+1}): {e}. Retrying...")
             time.sleep(2**attempt)
 
-def generate_dataset_names(search_query: str, history: list[Dict[str, str]]) -> Iterator[str]:
+def generate_dataset_names(search_query: str, history: list[Dict[str, str]], is_real_data: bool = False, engine: Optional[str] = None) -> Iterator[str]:
     """Generates dataset names based on a search query using the LLM."""
     query = search_query[:1000] if search_query else ""
-    prompt = GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=query)
+
+    if is_real_data and engine:
+        prompt = (
+            f"@Claude-3.7-Sonnet You are a data specialist who can transform real search results into structured datasets. "
+            f"A user is searching for data about: \"{query}\" "
+            f"Imagine you've queried {engine} and received real search results. Create a list of {MAX_NB_ITEMS_PER_GENERATION_CALL} specific datasets that could be created from these search results. "
+            f"For each dataset: 1. Give it a clear, specific name related to the search topic. 2. Include 3-5 relevant tags in parentheses, with one tag specifying the ML task type (classification, regression, clustering, etc.). "
+            f"Format each dataset as: 1. DatasetName (tag1, tag2, ml_task_tag). Make these datasets sound like real collections that could be created from {engine} search results on \"{query}\"."
+        )
+    else:
+        prompt = GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=query)
 
     full_response = ""
     for token in stream_response(prompt, history):
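The new `is_real_data` / `engine` arguments pick between two prompt builders: an inline "real search results" prompt and the original `GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY` template. A minimal standalone sketch of that selection logic (the template string and item count below are stand-ins, not the values defined in app.py):

```python
# Standalone sketch of the prompt selection above; the template and the count
# constant are stand-ins for app.py's module-level values.
from typing import Optional

MAX_NB_ITEMS_PER_GENERATION_CALL = 10  # stand-in value
GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = "List dataset ideas for: {search_query}"  # stand-in template

def build_names_prompt(query: str, is_real_data: bool = False, engine: Optional[str] = None) -> str:
    if is_real_data and engine:
        # "Real data" path: ask the model to act as if it had queried `engine`.
        return (
            f"Imagine you've queried {engine} for \"{query}\" and received real search results. "
            f"Create a list of {MAX_NB_ITEMS_PER_GENERATION_CALL} specific datasets from them."
        )
    # Default path: the original synthetic-generation template.
    return GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY.format(search_query=query[:1000])

print(build_names_prompt("electric vehicle sales", is_real_data=True, engine="DuckDuckGo.com"))
```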
@@ -362,12 +372,23 @@ def generate_dataset_names(search_query: str, history: list[Dict[str, str]]) ->
     history.append({"role": "assistant", "content": full_response}) # Update history
     # No return needed as history is modified in place
 
-def generate_dataset_content(search_query: str, dataset_name: str, tags: str, history: list[Dict[str, str]]) -> Iterator[str]:
+def generate_dataset_content(search_query: str, dataset_name: str, tags: str, history: list[Dict[str, str]], is_real_data: bool = False, engine: Optional[str] = None) -> Iterator[str]:
     """Generates the description and CSV preview for a dataset."""
     query = search_query[:1000] if search_query else ""
-    prompt = GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
-        search_query=query, dataset_name=dataset_name, tags=tags
-    )
+
+    if is_real_data and engine:
+        prompt = (
+            f"@Claude-3.7-Sonnet You're a specialist in converting web search results into structured data. "
+            f"Based on search results from {engine} about \"{query}\", create a preview of the dataset \"{dataset_name}\" with tags \"{tags}\". "
+            f"First, write a detailed description of what this dataset contains, its structure, and how it was constructed from web search results. "
+            f"Then, generate a realistic 5-row CSV preview that resembles data you might get if you scraped and structured real results from {engine}. "
+            f"Format your response with: **Dataset Description:** [detailed description] **CSV Content Preview:** ```csv [CSV header and 5 rows of realistic data] ``` "
+            f"Include relevant columns for the dataset type, with proper labels/categories where appropriate. The data should look like it came from real sources."
+        )
+    else:
+        prompt = GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
+            search_query=query, dataset_name=dataset_name, tags=tags
+        )
 
     full_response = ""
     for token in stream_response(prompt, history):
@@ -411,15 +432,33 @@ def iflatmap_unordered(func: Callable, kwargs_iterable: Iterable[dict]) -> Itera
 
 def generate_partial_dataset(
     title: str, content: str, search_query: str, variant: str, csv_header: str,
-    output: list[Optional[dict]], indices_to_generate: list[int], history: list[Dict[str, str]]
+    output: list[Optional[dict]], indices_to_generate: list[int], history: list[Dict[str, str]],
+    is_real_data: bool = False, engine: Optional[str] = None
 ) -> Iterator[int]:
     """Generates a batch of dataset rows for a specific variant."""
     dataset_name, tags = title.strip("# ").split("\ntags:", 1)
     dataset_name, tags = dataset_name.strip(), tags.strip()
 
     prompt = GENERATE_MORE_ROWS.format(csv_header=csv_header) + " " + variant
+
+    # Construct initial messages for context
+    initial_prompt = ""
+    if is_real_data and engine:
+        initial_prompt = (
+            f"@Claude-3.7-Sonnet You're a specialist in converting web search results into structured data. "
+            f"Based on search results from {engine} about \"{search_query}\", create a preview of the dataset \"{dataset_name}\" with tags \"{tags}\". "
+            f"First, write a detailed description of what this dataset contains, its structure, and how it was constructed from web search results. "
+            f"Then, generate a realistic 5-row CSV preview that resembles data you might get if you scraped and structured real results from {engine}. "
+            f"Format your response with: **Dataset Description:** [detailed description] **CSV Content Preview:** ```csv [CSV header and 5 rows of realistic data] ``` "
+            f"Include relevant columns for the dataset type, with proper labels/categories where appropriate. The data should look like it came from real sources."
+        )
+    else:
+        initial_prompt = GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
+            search_query=search_query, dataset_name=dataset_name, tags=tags
+        )
+
     messages = [
-        {"role": "user", "content": GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(search_query=search_query, dataset_name=dataset_name, tags=tags)},
+        {"role": "user", "content": initial_prompt},
         {"role": "assistant", "content": title + "\n\n" + content},
         {"role": "user", "content": prompt},
     ]
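`generate_partial_dataset` replays a three-message chat before asking for more rows: the request that would have produced the preview, the model's earlier answer (`title` plus `content`), and then the `GENERATE_MORE_ROWS` follow-up. A small sketch of that context shape (the prompt strings are placeholders):

```python
# Sketch of the three-message context used for row generation: replaying the
# original request and the model's earlier answer keeps the CSV schema stable.
def build_row_messages(initial_prompt: str, title: str, content: str, more_rows_prompt: str) -> list[dict]:
    return [
        {"role": "user", "content": initial_prompt},                 # the request that produced the preview
        {"role": "assistant", "content": title + "\n\n" + content},  # the model's earlier answer
        {"role": "user", "content": more_rows_prompt},               # follow-up: generate more rows
    ]

msgs = build_row_messages("Describe dataset X...", "# X\ntags: demo", "preview...", "Generate 10 more rows.")
assert [m["role"] for m in msgs] == ["user", "assistant", "user"]
```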
@@ -531,6 +570,11 @@ def get_repo_visibility(repo_id: str, token: str) -> str:
 
 with gr.Blocks(css=css) as demo:
     generated_texts_state = gr.State((landing_page_datasets_generated_text,)) # State for generated dataset names
+    current_dataset_state = gr.State(None) # State to hold current dataset details for generation
+    is_real_data_state = gr.State(True) # State to track if real data is being used
+    current_engine_state = gr.State(None) # State to track the current search engine
+    selected_engines_state = gr.State(["DuckDuckGo.com", "Bing.com", "Search.Yahoo.com", "Search.Brave.com", "Ecosia.org"]) # Default selected engines
+    searchEngines = ["AlltheInternet.com", "DuckDuckGo.com", "Google.com", "Bing.com", "Search.Yahoo.com", "Startpage.com", "Qwant.com", "Ecosia.org", "WolframAlpha.com", "Mojeek.co.uk", "Search.Brave.com", "Yandex.com", "Baidu.com", "Gibiru.com", "MetaGer.org", "Swisscows.com", "Presearch.com", "Ekoru.org", "Search.Lilo.org"]
 
     # --- Search Page UI ---
     with gr.Column(visible=True, elem_id="search-page") as search_page:
@@ -591,24 +635,50 @@ with gr.Blocks(css=css) as demo:
             label="Dataset Visibility", info="Set visibility for datasets saved to Hugging Face Hub."
         )
 
+        # Search Engine Settings
+        gr.Markdown("Search Engine Configuration")
+        data_source_toggle = gr.Checkbox(label="Use Real Search Data", value=True, info="Toggle to include results from real search engines.")
+        engine_settings_button = gr.Button("Configure Search Engines", icon="https://img.icons8.com/ios-filled/50/000000/settings--v1.png", size="sm")
+
+        # Engine Selection Modal
+        with gr.Modal("Search Engine Settings", id="engine-modal") as engine_modal:
+            gr.Markdown("Select which search engines to use for real data retrieval. A diverse selection improves results.")
+            engine_options_html_comp = gr.HTML(elem_id="engine-options")
+            with gr.Row():
+                select_all_engines_btn = gr.Button("Select All")
+                deselect_all_engines_btn = gr.Button("Deselect All")
+                save_engines_btn = gr.Button("Save Settings", variant="primary")
+
     # --- Dataset Detail Page UI ---
     with gr.Column(visible=False, elem_id="dataset-page") as dataset_page:
         gr.Markdown("# 🤗 Infinite Dataset Hub ♾️\n\nAn endless catalog of datasets, created just for you.")
-        gr.Markdown(
+        dataset_title_md = gr.Markdown() # Dataset name and tags
+        dataset_source_badge = gr.Markdown() # Badge indicating real/AI data
+        dataset_source_info = gr.Markdown() # Details about the data source
+        dataset_description_md = gr.Markdown() # Dataset description
+        preview_table_comp = gr.DataFrame(visible=False, interactive=False, wrap=True) # Holds the preview CSV
+
+        with gr.Row():
+            generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")
+            save_dataset_button = gr.Button("💾 Save Dataset", variant="primary", visible=False)
+
         open_dataset_message = gr.Markdown("", visible=False) # Confirmation message
         dataset_share_button = gr.Button("Share Dataset URL")
         dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
+
+        full_dataset_section = gr.Column(visible=False) # Container for full dataset and downloads
+        full_table_comp = gr.DataFrame(visible=False, interactive=False, wrap=True)
+        with gr.Row():
+            download_csv_button = gr.Button("Download CSV")
+            download_json_button = gr.Button("Download JSON")
+            download_parquet_button = gr.Button("Download Parquet")
+
         back_button = gr.Button("< Back", size="sm")
 
     # --- Event Handlers ---
 
     # Search Logic
-    def _update_search_results(search_query: str, current_generated_texts: tuple[str]):
+    def _update_search_results(search_query: str, current_generated_texts: tuple[str], is_real_data: bool, engine: Optional[str]):
         """Handles dataset search and UI updates."""
         # Reset UI to loading state
         yield {btn: gr.Button("⬜⬜⬜⬜⬜⬜", elem_classes="topButton linear-background") for btn in buttons[::2]}
@@ -620,7 +690,7 @@ with gr.Blocks(css=css) as demo:
 
         try:
             # Generate dataset names from LLM
-            for line in generate_dataset_names(search_query, []):
+            for line in generate_dataset_names(search_query, [], is_real_data=is_real_data, engine=engine):
                 if "I'm sorry" in line or "policy" in line: raise gr.Error("Inappropriate content detected.")
                 if generated_count >= MAX_NB_ITEMS_PER_GENERATION_CALL: break
 
@@ -646,58 +716,106 @@ with gr.Blocks(css=css) as demo:
         except Exception as e: raise gr.Error(f"Failed to generate datasets: {str(e)}")
 
     # Attach search handlers
-    search_button.click(
+    search_button.click(
+        _update_search_results,
+        inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
+        outputs=buttons + [generated_texts_state] + button_groups
+    )
+    search_bar.submit(
+        _update_search_results,
+        inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
+        outputs=buttons + [generated_texts_state] + button_groups
+    )
 
     # Load More Datasets
-    load_more_datasets.click(
+    load_more_datasets.click(
+        _update_search_results,
+        inputs=[search_bar, generated_texts_state, is_real_data_state, current_engine_state],
+        outputs=buttons + [generated_texts_state] + button_groups
+    )
 
     # Display Single Dataset Details
-    def _show_dataset_details(search_query, dataset_name, tags):
+    def _show_dataset_details(search_query, dataset_name, tags, is_real_data, engine):
         """Switches to detail view and loads dataset content."""
         yield {
             search_page: gr.Column(visible=False), dataset_page: gr.Column(visible=True),
-            dataset_share_textbox: gr.Textbox(visible=False),
+            dataset_title_md: f"# {dataset_name}\n\n tags: {tags}",
+            dataset_share_textbox: gr.Textbox(visible=False),
+            full_dataset_section: gr.Column(visible=False),
+            save_dataset_button: gr.Button(visible=False),
             open_dataset_message: gr.Markdown("", visible=False)
         }
+
+        # Update source badge and info
+        if is_real_data:
+            badge_html = gr.Markdown(f'<span class="px-3 py-1 rounded-full text-xs font-medium bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200">Real Data</span>', visible=True)
+            info_html = gr.Markdown(f'This dataset is based on real information queried from <strong>{engine}</strong> for the search term "<strong>{search_query}</strong>". The data has been structured for machine learning use.', visible=True)
+        else:
+            badge_html = gr.Markdown('<span class="px-3 py-1 rounded-full text-xs font-medium bg-purple-100 text-purple-800 dark:bg-purple-900 dark:text-purple-200">AI-Generated</span>', visible=True)
+            info_html = gr.Markdown(f'This is an AI-generated dataset created using {model_id}. The content is synthetic and designed to represent plausible data related to "{search_query}".', visible=True)
+
+        yield {dataset_source_badge: badge_html, dataset_source_info: info_html}
+
         # Stream content generation
-        for content_chunk in generate_dataset_content(search_query, dataset_name, tags, []):
-            yield {
+        for content_chunk in generate_dataset_content(search_query, dataset_name, tags, [], is_real_data=is_real_data, engine=engine):
+            yield {dataset_description_md: content_chunk}
 
     # Link buttons to the detail view function
+    def _show_dataset_from_button_wrapper(search_query, *buttons_values):
+        # Determine which button was clicked to get the index
+        clicked_button_index = -1
+        for i, btn_val in enumerate(buttons_values):
+            if btn_val is not None and btn_val != "": # Assuming non-empty value indicates the clicked button's text
+                clicked_button_index = i
+                break
+
+        if clicked_button_index == -1: return # Should not happen if events are correctly wired
+
+        # Determine if it was a name button (even index) or tag button (odd index)
+        dataset_index = clicked_button_index // 2
+
+        dataset_name, tags = buttons_values[2 * dataset_index], buttons_values[2 * dataset_index + 1]
+        is_real_data = current_engine_state.value is not None # Infer from engine state
+        engine = current_engine_state.value if is_real_data else None
+
+        yield from _show_dataset_details(search_query, dataset_name, tags, is_real_data, engine)
 
+    # Wire up click events for all dataset name and tag buttons
     for i, (name_btn, tag_btn) in enumerate(batched(buttons, 2)):
-        name_btn.click(
+        name_btn.click(
+            partial(_show_dataset_from_button_wrapper),
+            inputs=[search_bar, *buttons],
+            outputs=[search_page, dataset_page, dataset_title_md, dataset_description_md, dataset_source_badge, dataset_source_info, dataset_share_textbox, full_dataset_section, save_dataset_button, open_dataset_message]
+        )
+        tag_btn.click(
+            partial(_show_dataset_from_button_wrapper),
+            inputs=[search_bar, *buttons],
+            outputs=[search_page, dataset_page, dataset_title_md, dataset_description_md, dataset_source_badge, dataset_source_info, dataset_share_textbox, full_dataset_section, save_dataset_button, open_dataset_message]
+        )
 
     # Back Button Navigation
-    back_button.click(lambda: (gr.Column(visible=True), gr.Column(visible=False)), outputs=[search_page, dataset_page], js=
+    back_button.click(lambda: (gr.Column(visible=True), gr.Column(visible=False)), outputs=[search_page, dataset_page], js="""
+    function() {
+        if ('parentIFrame' in window) { window.parentIFrame.scrollTo({top: 0, behavior:'smooth'}); }
+        else { window.scrollTo({ top: 0, behavior: 'smooth' }); }
+        return Array.from(arguments);
+    }
+    """)
 
     # Full Dataset Generation
     @generate_full_dataset_button.click(
-        inputs=[
-        outputs=[
+        inputs=[dataset_title_md, dataset_description_md, search_bar, select_namespace_dropdown, visibility_radio, refinement_mode, is_real_data_state, current_engine_state],
+        outputs=[full_table_comp, generate_full_dataset_button, save_dataset_button, full_dataset_section]
     )
-    def _generate_full_dataset(
+    def _generate_full_dataset(title_md, content_md, search_query, namespace, visibility, mode, is_real_data, engine):
+        # Extract dataset name and tags from the markdown title
+        try:
+            dataset_name = title_md.split('\n')[0].strip('# ')
+            tags = title_md.split('tags:', 1)[1].strip()
+        except IndexError:
+            raise gr.Error("Could not parse dataset title.")
+
+        try: csv_header, preview_df = parse_preview_df(content_md)
         except ValueError as e: raise gr.Error(f"Failed to parse preview: {e}")
 
         refined_preview_df = refine_preview_data(preview_df, mode)
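`_show_dataset_from_button_wrapper` relies on the flat `buttons` list alternating name and tag buttons, so a click at flat index `i` maps to dataset pair `i // 2`. A standalone sketch of that index math (the sample values are illustrative):

```python
# Sketch of the flat-button index math: buttons alternate name/tag, so the
# button at flat index i belongs to dataset pair i // 2.
buttons_values = ["EV Sales", "(regression, tabular)", "Bird Songs", "(audio, classification)"]

def dataset_for_click(clicked_button_index: int) -> tuple[str, str]:
    dataset_index = clicked_button_index // 2      # name buttons sit at even indices
    name = buttons_values[2 * dataset_index]
    tags = buttons_values[2 * dataset_index + 1]   # the paired tag button
    return name, tags

assert dataset_for_click(0) == dataset_for_click(1) == ("EV Sales", "(regression, tabular)")
assert dataset_for_click(2) == ("Bird Songs", "(audio, classification)")
```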
@@ -710,9 +828,10 @@ with gr.Blocks(css=css) as demo:
 
         # Update UI: show preview, disable generate, show save button
         yield {
+            full_table_comp: gr.DataFrame(pd.DataFrame([r for r in output_data if r]), visible=True),
             generate_full_dataset_button: gr.Button(interactive=False),
-            save_dataset_button: gr.Button(f"💾 Save {namespace}/{dataset_name}" + (" (private)" if visibility != "public" else ""), visible=True, interactive=False)
+            save_dataset_button: gr.Button(f"💾 Save {namespace}/{dataset_name}" + (" (private)" if visibility != "public" else ""), visible=True, interactive=False),
+            full_dataset_section: gr.Column(visible=True)
         }
 
         # Prepare generation tasks for variants
@@ -724,27 +843,32 @@ with gr.Blocks(css=css) as demo:
             generation_tasks.append({
                 "func": generate_partial_dataset,
                 "kwargs": {
-                    "title":
+                    "title": title_md, "content": content_md, "search_query": search_query, "variant": variant,
                     "csv_header": csv_header, "output": output_data, "indices_to_generate": indices,
-                    "history": [] # Use fresh history for each variant task
+                    "history": [], # Use fresh history for each variant task
+                    "is_real_data": is_real_data, "engine": engine
                 }
             })
 
         # Execute tasks in parallel and update UI progressively
        for _ in iflatmap_unordered(lambda **kw: kw.pop('func')(**kw), generation_tasks):
-            yield {
+            yield {full_table_comp: pd.DataFrame([r for r in output_data if r])} # Update DataFrame display
 
         yield {save_dataset_button: gr.Button(interactive=True)} # Enable save button
         print(f"Full dataset generation complete for {dataset_name}.")
 
     # Save Dataset to Hugging Face Hub
     @save_dataset_button.click(
-        inputs=[
+        inputs=[dataset_title_md, dataset_description_md, search_bar, full_table_comp, select_namespace_dropdown, visibility_radio],
         outputs=[save_dataset_button, open_dataset_message]
     )
-    def _save_dataset(
+    def _save_dataset(title_md, content_md, search_query, df, namespace, visibility, oauth_token):
+        # Extract dataset name and tags from the markdown title
+        try:
+            dataset_name = title_md.split('\n')[0].strip('# ')
+            tags = title_md.split('tags:', 1)[1].strip()
+        except IndexError:
+            raise gr.Error("Could not parse dataset title.")
 
         token = oauth_token.token if oauth_token else save_dataset_hf_token
         if not token: raise gr.Error("Login required or set SAVE_DATASET_HF_TOKEN.")
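The `lambda **kw: kw.pop('func')(**kw)` passed to `iflatmap_unordered` pops the callable out of each task's keyword arguments and invokes it with whatever remains. The exact kwargs shape is defined by `iflatmap_unordered` in app.py; the pop-and-call pattern in isolation looks like this:

```python
# Standalone sketch of the pop-and-call dispatch pattern: each task dict carries
# its own callable under "func"; popping it leaves only that callable's kwargs.
def run_task(**kw):
    return kw.pop("func")(**kw)

def add(a: int, b: int) -> int:
    return a + b

tasks = [{"func": add, "a": 1, "b": 2}, {"func": add, "a": 10, "b": 20}]
assert [run_task(**task) for task in tasks] == [3, 30]
```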
@@ -760,7 +884,7 @@ with gr.Blocks(css=css) as demo:
         create_repo(repo_id=repo_id, repo_type="dataset", private=visibility!="public", exist_ok=True, token=token)
         df.to_csv(f"hf://datasets/{repo_id}/data.csv", storage_options={"token": token}, index=False)
 
-        card_content = DATASET_CARD_CONTENT.format(title=
+        card_content = DATASET_CARD_CONTENT.format(title=title_md, content=content_md, url=URL, dataset_url=dataset_url, model_id=model_id, search_query=search_query)
         DatasetCard(card_content).push_to_hub(repo_id=repo_id, repo_type="dataset", token=token)
 
         success_msg = f"# 🎉 Yay! Dataset saved to [{repo_id}](https://huggingface.co/datasets/{repo_id})!\n\n_PS: Check Settings to manage your saved datasets._"
@@ -772,16 +896,22 @@ with gr.Blocks(css=css) as demo:
         finally: yield {save_dataset_button: gr.Button(interactive=True)} # Re-enable button
 
     # Shareable URL Generation
-    @dataset_share_button.click(inputs=[
-    def _show_share_url(
+    @dataset_share_button.click(inputs=[dataset_title_md, search_bar], outputs=[dataset_share_textbox])
+    def _show_share_url(title_md, search_query):
+        try:
+            dataset_name = title_md.split('\n')[0].strip('# ')
+            tags = title_md.split('tags:', 1)[1].strip()
+        except IndexError:
+            raise gr.Error("Could not parse dataset title.")
+
         share_url = f"{URL}?q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}"
         return gr.Textbox(share_url, visible=True)
 
     # Settings Toggles
     refinement_mode.change(lambda mode: gr.Group(visible=(mode == "sourced")), outputs=[source_group])
 
+    data_source_toggle.change(lambda value: (gr.State(value), gr.State(value if value else None)), inputs=[data_source_toggle], outputs=[is_real_data_state, current_engine_state])
+
     @load_source_button.click(inputs=[source_type, source_path], outputs=[source_status])
     def _load_source_data(source_type, source_path):
         if not source_path: raise gr.Error("Source path/URL is required.")
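`_show_share_url` encodes the query, dataset name, and tags as plus-separated URL parameters, which `_load_app` later reads back from `request.query_params`. A sketch of the URL construction (the base URL below is a stand-in for app.py's `URL` constant):

```python
# Sketch of the share-URL format; URL is a stand-in for app.py's base URL.
URL = "https://example.org"  # stand-in

def share_url(search_query: str, dataset_name: str, tags: str) -> str:
    # Spaces become '+' so the link survives copy/paste; other characters are
    # left as-is, matching the simple replace() used above.
    return (f"{URL}?q={search_query.replace(' ', '+')}"
            f"&dataset={dataset_name.replace(' ', '+')}"
            f"&tags={tags.replace(' ', '+')}")

assert share_url("ev sales", "EV Sales 2024", "tabular regression") == \
    "https://example.org?q=ev+sales&dataset=EV+Sales+2024&tags=tabular+regression"
```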
@@ -792,8 +922,50 @@ with gr.Blocks(css=css) as demo:
         except (ConnectionError, ValueError, RuntimeError) as e:
             raise gr.Error(f"Failed to load source: {str(e)}")
 
+    # Engine Settings Modal Logic
+    def _populate_engine_options(selected_engines):
+        engine_options_html = ""
+        for engine in searchEngines:
+            is_checked = "checked" if engine in selected_engines else ""
+            engine_options_html += f"""
+            <div class="flex items-center">
+                <input type="checkbox" id="engine-{engine.replace('.', '_')}" class="engine-checkbox mr-2 h-4 w-4" value="{engine}" {is_checked}>
+                <label for="engine-{engine.replace('.', '_')}" class="cursor-pointer">{engine}</label>
+            </div>
+            """
+        return gr.HTML(engine_options_html)
+
+    def _save_engine_settings(selected_engines_json):
+        selected_engines = json.loads(selected_engines_json)
+        if not selected_engines:
+            gr.Warning("At least one search engine must be selected. Using DuckDuckGo as default.")
+            selected_engines = ["DuckDuckGo.com"]
+
+        current_engine = selected_engines[0] if selected_engines else None
+        return gr.State(selected_engines), gr.State(current_engine), gr.Info(f"Updated search engines. Using {len(selected_engines)} engines.")
+
+    # Initialize engine options component
+    engine_options_html_comp = _populate_engine_options(selected_engines_state.value)
+
+    # Update engine options when the modal is opened
+    engine_settings_button.click(lambda: engine_options_html_comp.update(_populate_engine_options(selected_engines_state.value)), outputs=[engine_options_html_comp])
+
+    select_all_engines_btn.click(lambda: engine_options_html_comp.update(_populate_engine_options(searchEngines)), outputs=[engine_options_html_comp])
+    deselect_all_engines_btn.click(lambda: engine_options_html_comp.update(_populate_engine_options([])), outputs=[engine_options_html_comp])
+
+    save_engines_btn.click(
+        _save_engine_settings,
+        inputs=[gr.JSON(elem_id="engine-options")], # Capture checked engines from modal
+        outputs=[selected_engines_state, current_engine_state, gr.Info()]
+    )
+
+    engine_settings_button.click(lambda: engine_modal.update(visible=True), outputs=[engine_modal])
+    # Close modal on save or when clicking outside (implicit via Gradio's modal handling)
+
     # Initial App Load Logic
-    @demo.load(outputs=
+    @demo.load(outputs=([search_page, dataset_page, dataset_title_md, dataset_description_md, dataset_source_badge, dataset_source_info, dataset_share_textbox, full_dataset_section, save_dataset_button, open_dataset_message, search_bar] + # Outputs for detail page and search bar
+                        buttons + [generated_texts_state] + # Outputs for search results buttons and state
+                        [select_namespace_dropdown, visibility_radio, source_group, data_source_toggle, current_engine_state, selected_engines_state, engine_options_html_comp])) # Outputs for settings
     def _load_app(request: gr.Request, oauth_token: Optional[gr.OAuthToken]):
         # Handle user login and namespace selection
         if oauth_token:
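`_save_engine_settings` expects a JSON-encoded list of the checked engines and falls back to DuckDuckGo when nothing is selected, with the first entry becoming the active engine. The parsing and fallback in isolation:

```python
# Standalone sketch of the engine-selection fallback in _save_engine_settings.
import json

def parse_engine_selection(selected_engines_json: str) -> tuple[list[str], str]:
    selected = json.loads(selected_engines_json)
    if not selected:
        # At least one engine must stay selected; fall back to a default.
        selected = ["DuckDuckGo.com"]
    current = selected[0]  # the first selection becomes the active engine
    return selected, current

assert parse_engine_selection("[]") == (["DuckDuckGo.com"], "DuckDuckGo.com")
assert parse_engine_selection('["Bing.com", "Ecosia.org"]')[1] == "Bing.com"
```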
@@ -818,13 +990,38 @@ with gr.Blocks(css=css) as demo:
         # Handle URL parameters for direct search or dataset loading
         query_params = dict(request.query_params)
         if "dataset" in query_params:
+            is_real = query_params.get("engine") is not None
+            engine = query_params.get("engine")
+            yield from _show_dataset_details(query_params.get("q", query_params["dataset"]), query_params["dataset"], query_params.get("tags", ""), is_real, engine)
+            yield {is_real_data_state: is_real, current_engine_state: engine}
         elif "q" in query_params:
             search_query = query_params["q"]
+            is_real = query_params.get("engine") is not None
+            engine = query_params.get("engine")
             yield {search_bar: search_query}
-            yield from _update_search_results(search_query, ())
+            yield {is_real_data_state: is_real, current_engine_state: engine}
+            yield from _update_search_results(search_query, (), is_real, engine)
         else:
             yield {search_page: gr.Column(visible=True)} # Show search page by default
 
+        # Initialize with default datasets
+        initial_outputs = {}
+        for i, line in enumerate(default_output):
+            try: dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" (", 1)
+            except ValueError: dataset_name, tags = line.split(".", 1)[1].strip(" )").split(" ", 1)[0], ""
+
+            initial_outputs[buttons[2 * i]] = gr.Button(dataset_name, elem_classes="topButton")
+            initial_outputs[buttons[2 * i + 1]] = gr.Button(tags, elem_classes="bottomButton")
+            initial_outputs[button_groups[i]] = gr.Group(elem_classes="buttonsGroup")
+        yield initial_outputs
+        yield {generated_texts_state: (landing_page_datasets_generated_text,)}
+
+        # Initialize engine settings UI
+        yield {
+            data_source_toggle: gr.Checkbox(value=is_real_data_state.value),
+            engine_options_html_comp: _populate_engine_options(selected_engines_state.value)
+        }
 
 if __name__ == "__main__":
     demo.launch(share=False, server_name="0.0.0.0")