Commit c4435ca
1 Parent(s): 86f370f

fix examples for evaluation
pdm.lock
CHANGED

The diff for this file is too large to render. See raw diff.
pyproject.toml
CHANGED

@@ -1,6 +1,6 @@
 [project]
 name = "synthetic-dataset-generator"
-version = "0.1.…
+version = "0.1.2"
 description = "Build datasets using natural language"
 authors = [
     {name = "davidberenstein1957", email = "[email protected]"},
src/synthetic_dataset_generator/app.py
CHANGED

@@ -15,6 +15,9 @@ button[role="tab"][aria-selected="true"]:hover {border-color: var(--button-prima…
 #system_prompt_examples { color: var(--body-text-color) !important; background-color: var(--block-background-fill) !important;}
 .container {padding-inline: 0 !important}
 #sign_in_button { flex-grow: 0; width: 50% !important; display: flex; align-items: center; justify-content: center; margin: 0 auto; }
+.table-view .table-wrap {
+    max-height: 450px;
+}
 """
 
 image = """<br><img src="https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/main/assets/logo.svg" alt="Synthetic Data Generator Logo" style="display: block; margin-left: auto; margin-right: auto; width: clamp(50%, 400px, 100%)"/>"""
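The new `.table-view .table-wrap` rule only takes effect on components that opt in via `elem_classes="table-view"`, which the dataframes in the app files below now declare. A minimal sketch of this wiring, assuming Gradio's standard `css`/`elem_classes` mechanism (the demo app itself is illustrative, not the Space's code):

import gradio as gr

# Custom CSS scoped to an opt-in class; only components that declare
# elem_classes="table-view" get the 450px height cap with internal scrolling.
css = """
.table-view .table-wrap { max-height: 450px; }
"""

with gr.Blocks(css=css) as demo:
    gr.Dataframe(
        headers=["prompt", "completion", "evaluation"],
        wrap=True,
        interactive=False,
        elem_classes="table-view",  # opts this table into the height cap
    )

if __name__ == "__main__":
    demo.launch()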
src/synthetic_dataset_generator/apps/eval.py
CHANGED

@@ -89,22 +89,72 @@ def load_dataset_from_hub(
     if not repo_id:
         raise gr.Error("Hub repo id is required")
     subsets = get_dataset_config_names(repo_id, token=token)
-    ds_dict = load_dataset(repo_id, subsets[0], token=token)
     splits = get_dataset_split_names(repo_id, subsets[0], token=token)
-    ds = …
-    …
-    …
+    ds = load_dataset(repo_id, subsets[0], split=splits[0], token=token, streaming=True)
+    rows = []
+    for idx, row in enumerate(ds):
+        rows.append(row)
+        if idx == num_rows:
+            break
+    ds = Dataset.from_list(rows)
     dataframe = ds.to_pandas()
     instruction_valid_columns, response_valid_columns = get_valid_columns(dataframe)
+    col_instruction = instruction_valid_columns[0] if instruction_valid_columns else ""
+    col_response = "No valid response columns found."
+    for col in response_valid_columns:
+        if col != col_instruction:
+            col_response = col
+            break
+
+    prompt_template = gr.Code(
+        label="Prompt template",
+        value="\n".join(
+            [
+                "Evaluate the following text based on criteria.",
+                "Criteria: quality.",
+                "Score: between 1 and 10.",
+                "Text: {{" + col_response + "}}",
+            ]
+        ),
+        language="markdown",
+        interactive=True,
+    )
+    structured_output = gr.Code(
+        label="Structured output",
+        value=json.dumps(
+            {
+                "type": "object",
+                "properties": {"quality": {"type": "integer"}},
+                "required": ["quality"],
+            },
+            indent=4,
+        ),
+        language="json",
+        interactive=True,
+    )
     return (
         dataframe,
-        gr.Dropdown(
-            …
+        gr.Dropdown(
+            choices=instruction_valid_columns,
+            label="Instruction column",
+            value=col_instruction,
+            interactive=True,
+        ),
+        gr.Dropdown(
+            choices=response_valid_columns,
+            label="Response column",
+            value=col_response,
+            interactive=False
+            if col_response == "No valid response columns found."
+            else True,
+        ),
+        prompt_template,
+        structured_output,
     )
 
 
 def define_evaluation_aspects(task_type: str):
-    if task_type == "…
+    if task_type == "chat-eval":
         return gr.Dropdown(
             value=["overall-rating"],
             choices=["helpfulness", "truthfulness", "overall-rating", "honesty"],

@@ -251,7 +301,7 @@ def _evaluate_dataset(
     num_rows: int = 10,
     is_sample: bool = False,
 ):
-    if eval_type == "…
+    if eval_type == "chat-eval":
         dataframe = evaluate_instruction_response(
             dataframe=dataframe,
             aspects=aspects_instruction_response,

@@ -280,7 +330,7 @@ def evaluate_sample_dataset(
     prompt_template: str,
     structured_output: dict,
 ):
-    dataframe, _, _ = load_dataset_from_hub(repo_id, num_rows=10)
+    dataframe, _, _, _, _ = load_dataset_from_hub(repo_id, num_rows=10)
     dataframe = _evaluate_dataset(
         dataframe=dataframe,
         eval_type=eval_type,

@@ -324,7 +374,7 @@ def push_dataset(
     oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
-    dataframe, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
+    dataframe, _, _, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
     dataframe = _evaluate_dataset(
         dataframe=dataframe,
         eval_type=eval_type,

@@ -342,7 +392,7 @@ def push_dataset(
     client = get_argilla_client()
     if client is None:
         return ""
-    if eval_type == "…
+    if eval_type == "chat-eval":
         num_generations = len((dataframe["generations"][0]))
         fields = [
             rg.ChatField(

@@ -612,7 +662,18 @@ with gr.Blocks() as app:
                 load_btn = gr.Button("Load", variant="primary")
 
             with gr.Column(scale=3):
-                …
+                examples = gr.Examples(
+                    examples=[
+                        "argilla/distilabel-sft-easy",
+                        "HuggingFaceFW/fineweb-edu",
+                        "argilla/distilabel-intel-orca-dpo-pairs",
+                    ],
+                    label="Example datasets",
+                    fn=lambda x: x,
+                    inputs=[search_in],
+                    run_on_click=True,
+                )
+                search_out = gr.HTML(label="Dataset preview", visible=False)
 
         gr.HTML(value="<hr>")
         gr.Markdown(value="## 2. Configure your task")

@@ -620,58 +681,54 @@ with gr.Blocks() as app:
             with gr.Column(scale=2):
                 eval_type = gr.Dropdown(
                     label="Evaluation type",
-                    choices=["…
-                    value="…
+                    choices=["chat-eval", "custom-eval"],
+                    value="chat-eval",
                     multiselect=False,
                     visible=False,
                 )
-                with gr.Tab("…
+                with gr.Tab("Response Evaluation") as tab_instruction_response:
                     aspects_instruction_response = define_evaluation_aspects(
-                        "…
+                        "chat-eval"
                     )
                     instruction_instruction_response = gr.Dropdown(
                         label="Instruction Column",
-                        …
+                        info="Select the instruction column to evaluate",
+                        choices=["Load your data first in step 1."],
+                        value="Load your data first in step 1.",
+                        interactive=False,
                         multiselect=False,
                         allow_custom_value=False,
                     )
                     response_instruction_response = gr.Dropdown(
                         label="Response Column",
-                        …
-                        …
+                        info="Select the response column(s) to evaluate",
+                        choices=["Load your data first in step 1."],
+                        value="Load your data first in step 1.",
+                        interactive=False,
+                        multiselect=False,
                         allow_custom_value=False,
                     )
                     tab_instruction_response.select(
-                        fn=lambda: "…
+                        fn=lambda: "chat-eval",
                         inputs=[],
                         outputs=[eval_type],
                     )
-                with gr.Tab("…
-                    aspects_custom = define_evaluation_aspects("custom")
+                with gr.Tab("Custom Evaluation Prompt") as tab_custom:
+                    aspects_custom = define_evaluation_aspects("custom-eval")
                     prompt_template = gr.Code(
                         label="Prompt template",
-                        value="…
+                        value="Load your data first in step 1.",
                         language="markdown",
-                        interactive=…
+                        interactive=False,
                     )
                     structured_output = gr.Code(
                         label="Structured output",
-                        value=…
-                            {
-                                "type": "object",
-                                "properties": {
-                                    "quality": {"type": "integer"},
-                                    "clarity": {"type": "integer"},
-                                    "relevance": {"type": "integer"},
-                                },
-                            },
-                            indent=4,
-                        ),
+                        value="Load your data first in step 1.",
                         language="json",
-                        interactive=…
+                        interactive=False,
                     )
                     tab_custom.select(
-                        fn=lambda: "custom",
+                        fn=lambda: "custom-eval",
                         inputs=[],
                         outputs=[eval_type],
                     )

@@ -681,9 +738,10 @@ with gr.Blocks() as app:
             with gr.Column(scale=3):
                 dataframe = gr.Dataframe(
                     headers=["prompt", "completion", "evaluation"],
-                    wrap=…
+                    wrap=True,
                     height=500,
                     interactive=False,
+                    elem_classes="table-view",
                 )
 
         gr.HTML(value="<hr>")

@@ -746,6 +804,8 @@ with gr.Blocks() as app:
             dataframe,
             instruction_instruction_response,
             response_instruction_response,
+            prompt_template,
+            structured_output,
         ],
     )
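Two changes above are worth seeing in isolation. First, `load_dataset_from_hub` now streams the dataset and keeps only the first rows instead of downloading everything up front. A standalone sketch of that pattern (the repo id and row count are illustrative):

from datasets import (
    Dataset,
    get_dataset_config_names,
    get_dataset_split_names,
    load_dataset,
)

repo_id = "argilla/distilabel-intel-orca-dpo-pairs"  # illustrative repo id
num_rows = 10

# Resolve the first config and split, as load_dataset_from_hub does.
subsets = get_dataset_config_names(repo_id)
splits = get_dataset_split_names(repo_id, subsets[0])

# streaming=True returns an IterableDataset, so only the rows consumed
# below are actually downloaded instead of the whole dataset.
ds = load_dataset(repo_id, subsets[0], split=splits[0], streaming=True)

rows = []
for idx, row in enumerate(ds):
    rows.append(row)
    if idx == num_rows:  # note: keeps num_rows + 1 rows, mirroring the committed loop
        break

# Materialize just the collected rows in memory.
dataframe = Dataset.from_list(rows).to_pandas()
print(dataframe.shape)

Second, the commit's namesake: the example datasets are wired through `gr.Examples` with `run_on_click=True`, so clicking an entry fills the search box and immediately runs the handler. A stripped-down sketch, assuming standard `gr.Examples` behavior (the textboxes and the lambda are illustrative stand-ins):

import gradio as gr

with gr.Blocks() as demo:
    search_in = gr.Textbox(label="Hub repo id")  # filled by an example click
    loaded = gr.Textbox(label="Load result")

    # With run_on_click=True, selecting an example both writes it into
    # `search_in` and invokes fn, as if the user had pressed Load.
    gr.Examples(
        examples=["argilla/distilabel-intel-orca-dpo-pairs"],
        inputs=[search_in],
        outputs=[loaded],
        fn=lambda repo_id: f"would load: {repo_id}",  # stand-in for the real loader
        run_on_click=True,
        label="Example datasets",
    )

if __name__ == "__main__":
    demo.launch()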
src/synthetic_dataset_generator/apps/sft.py
CHANGED

@@ -84,6 +84,7 @@ def _get_dataframe():
         wrap=True,
         height=500,
         interactive=False,
+        elem_classes="table-view",
     )
src/synthetic_dataset_generator/apps/textcat.py
CHANGED

@@ -37,7 +37,11 @@ from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
 
 def _get_dataframe():
     return gr.Dataframe(
-        headers=["labels", "text"],
+        headers=["labels", "text"],
+        wrap=True,
+        height=500,
+        interactive=False,
+        elem_classes="table-view",
     )
src/synthetic_dataset_generator/pipelines/eval.py
CHANGED

@@ -18,7 +18,7 @@ def get_ultrafeedback_evaluator(aspect, is_sample):
         api_key=_get_next_api_key(),
         generation_kwargs={
             "temperature": 0.01,
-            "max_new_tokens": …
+            "max_new_tokens": 2048 if not is_sample else 512,
         },
     ),
     aspect=aspect,

@@ -36,7 +36,7 @@ def get_custom_evaluator(prompt_template, structured_output, columns, is_sample)
         structured_output={"format": "json", "schema": structured_output},
         generation_kwargs={
             "temperature": 0.01,
-            "max_new_tokens": …
+            "max_new_tokens": 2048 if not is_sample else 512,
         },
     ),
     template=prompt_template,
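Both evaluators now derive their generation budget from the `is_sample` flag: full evaluation runs get 2048 new tokens, while in-UI sample previews are capped at 512 to stay fast and cheap. A small sketch of the toggle in isolation (the helper function is hypothetical, not part of the pipeline):

# Hypothetical helper isolating the is_sample token-budget toggle used above.
def build_generation_kwargs(is_sample: bool) -> dict:
    return {
        "temperature": 0.01,  # near-deterministic judging
        "max_new_tokens": 2048 if not is_sample else 512,
    }

print(build_generation_kwargs(is_sample=True))   # {'temperature': 0.01, 'max_new_tokens': 512}
print(build_generation_kwargs(is_sample=False))  # {'temperature': 0.01, 'max_new_tokens': 2048}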