synthetic-data-generator

Runtime error

App Files Files Community

davidberenstein1957 committed on Oct 16, 2024

Commit

b445efe

1 Parent(s): 35eb40e

feat: Add validation to check if data exists before pushing/generating

Browse files

Files changed (3) hide show

src/distilabel_dataset_generator/apps/sft.py +91 -55
src/distilabel_dataset_generator/pipelines/sft.py +3 -3
src/distilabel_dataset_generator/utils.py +11 -7

src/distilabel_dataset_generator/apps/sft.py CHANGED Viewed

@@ -267,17 +267,19 @@ def push_to_argilla(
                     ),
                 ],
                 questions=[
-                    rg.TextQuestion(
-                        name="correct_response",
-                        description="The corrected response from the assistant",
                     ),
                 ],
                 metadata=[
                     rg.IntegerMetadataProperty(
-                        name="messages_length", title="Messages Length"
                     ),
                     rg.IntegerMetadataProperty(
-                        name="response_length", title="Response Length"
                     ),
                 ],
                 vectors=[
@@ -288,25 +290,28 @@ def push_to_argilla(
                 ],
                 guidelines="Please review the conversation and provide a score for the assistant's response.",
             )
-            import pdb
-            pdb.set_trace()
-            dataframe["messages_length"] = dataframe["messages"].apply(
-                lambda x: sum([len(y["content"]) for y in x])
             )
             dataframe["messages_embeddings"] = get_embeddings(
                 dataframe["messages"].apply(
                     lambda x: " ".join([y["content"] for y in x])
                 )
             )
-            dataframe["correct_response"] = dataframe["messages"].apply(
-                lambda x: x[-1]["content"]
-            )
-            dataframe["response_length"] = dataframe["correct_response"].apply(len)
-            dataframe["messages"] = dataframe["messages"].apply(lambda x: x[:-1])
         else:
             settings = rg.Settings(
                 fields=[
                     rg.TextField(
                         name="prompt",
                         description="The prompt used for the conversation",
@@ -317,13 +322,10 @@ def push_to_argilla(
                     ),
                 ],
                 questions=[
-                    rg.TextQuestion(
-                        name="correct_prompt",
-                        description="The corrected prompt from the assistant",
-                    ),
-                    rg.TextQuestion(
-                        name="correct_completion",
-                        description="The corrected completion from the assistant",
                     ),
                 ],
                 metadata=[
@@ -342,22 +344,20 @@ def push_to_argilla(
                 ],
                 guidelines="Please review the conversation and correct the prompt and completion where needed.",
             )
-            dataframe["correct_prompt"] = dataframe["prompt"]
-            dataframe["correct_completion"] = dataframe["completion"]
             dataframe["prompt_length"] = dataframe["prompt"].apply(len)
             dataframe["completion_length"] = dataframe["completion"].apply(len)
             dataframe["prompt_embeddings"] = get_embeddings(dataframe["prompt"])
         progress(0.5, desc="Creating dataset")
-        if client.datasets(name=dataset_name, workspace=rg_user.username) is not None:
-            raise gr.Error(f"Dataset {dataset_name} already exists")
-        rg_dataset = rg.Dataset(
-            name=dataset_name,
-            workspace=rg_user.username,
-            settings=settings,
-            client=client,
-        )
-        rg_dataset = rg_dataset.create()
         progress(0.7, desc="Pushing dataset to Argilla")
         hf_dataset = Dataset.from_pandas(dataframe)
         rg_dataset.records.log(records=hf_dataset)
@@ -367,6 +367,23 @@ def push_to_argilla(
     return original_dataframe
 def upload_pipeline_code(
     pipeline_code,
     org_name,
@@ -469,7 +486,7 @@ with gr.Blocks(
         # Add a header for the full dataset generation section
         gr.Markdown("## Generate full dataset")
         gr.Markdown(
-            "Once you're satisfied with the sample, generate a larger dataset and push it to the Hub."
         )
         with gr.Column() as push_to_hub_ui:
@@ -489,22 +506,31 @@ with gr.Blocks(
                     maximum=500,
                     info="The number of rows in the dataset. Note that you are able to generate more rows at once but that this will take time.",
                 )
             with gr.Tab(label="Argilla"):
-                with gr.Row(variant="panel"):
-                    dataset_name = gr.Textbox(
-                        label="Dataset name",
-                        placeholder="dataset_name",
-                        value="my-distiset",
-                    )
-                with gr.Row(variant="panel"):
-                    btn_generate_full_dataset_copy = gr.Button(
-                        value="Generate", variant="primary", scale=2
-                    )
-                    btn_generate_and_push_to_argilla = gr.Button(
-                        value="Generate and Push to Argilla", variant="primary", scale=2
-                    )
-                    btn_push_to_argilla = gr.Button(
-                        value="Push to Argilla", variant="primary", scale=2
                     )
             with gr.Tab("Hugging Face Hub"):
                 with gr.Row(variant="panel"):
@@ -554,10 +580,10 @@ with gr.Blocks(
                     <a href="{argilla_api_url}" target="_blank" style="color: #1565c0; text-decoration: none;">
                         {argilla_api_url}
                     </a>
-                    Here are some docs to help you:
-                    • <a href="https://docs.argilla.io/latest/getting_started/quickstart/#sign-in-into-the-argilla-ui" target="_blank">Login with OAuth</a>
-                    • <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">Curate your data</a>
-                    • <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">Export your data</a>
                 </p>
             </div>
             """,
@@ -621,14 +647,19 @@ with gr.Blocks(
     )
     btn_generate_and_push_to_argilla.click(
         fn=hide_success_message,
         outputs=[success_message],
-    ).then(
         fn=generate_dataset,
         inputs=[system_prompt, num_turns, num_rows],
         outputs=[final_dataset],
         show_progress=True,
-    ).then(
         fn=push_to_argilla,
         inputs=[final_dataset, dataset_name],
         outputs=[final_dataset],
@@ -685,7 +716,12 @@ with gr.Blocks(
     btn_push_to_argilla.click(
         fn=hide_success_message,
         outputs=[success_message],
-    ).then(
         fn=push_to_argilla,
         inputs=[final_dataset, dataset_name],
         outputs=[final_dataset],

                     ),
                 ],
                 questions=[
+                    rg.RatingQuestion(
+                        name="rating",
+                        description="The rating of the conversation",
+                        values=list(range(1, 6)),
                     ),
                 ],
                 metadata=[
                     rg.IntegerMetadataProperty(
+                        name="user_message_length", title="User Message Length"
                     ),
                     rg.IntegerMetadataProperty(
+                        name="assistant_message_length",
+                        title="Assistant Message Length",
                     ),
                 ],
                 vectors=[
                 ],
                 guidelines="Please review the conversation and provide a score for the assistant's response.",
             )
+            dataframe["user_message_length"] = dataframe["messages"].apply(
+                lambda x: sum([len(y["content"]) for y in x if y["role"] == "user"])
+            )
+            dataframe["assistant_message_length"] = dataframe["messages"].apply(
+                lambda x: sum(
+                    [len(y["content"]) for y in x if y["role"] == "assistant"]
+                )
             )
             dataframe["messages_embeddings"] = get_embeddings(
                 dataframe["messages"].apply(
                     lambda x: " ".join([y["content"] for y in x])
                 )
             )
         else:
             settings = rg.Settings(
                 fields=[
+                    rg.TextField(
+                        name="system_prompt",
+                        description="The system prompt used for the conversation",
+                        required=False,
+                    ),
                     rg.TextField(
                         name="prompt",
                         description="The prompt used for the conversation",
                     ),
                 ],
                 questions=[
+                    rg.RatingQuestion(
+                        name="rating",
+                        description="The rating of the conversation",
+                        values=list(range(1, 6)),
                     ),
                 ],
                 metadata=[
                 ],
                 guidelines="Please review the conversation and correct the prompt and completion where needed.",
             )
             dataframe["prompt_length"] = dataframe["prompt"].apply(len)
             dataframe["completion_length"] = dataframe["completion"].apply(len)
             dataframe["prompt_embeddings"] = get_embeddings(dataframe["prompt"])
         progress(0.5, desc="Creating dataset")
+        rg_dataset = client.datasets(name=dataset_name, workspace=rg_user.username)
+        if rg_dataset is None:
+            rg_dataset = rg.Dataset(
+                name=dataset_name,
+                workspace=rg_user.username,
+                settings=settings,
+                client=client,
+            )
+            rg_dataset = rg_dataset.create()
         progress(0.7, desc="Pushing dataset to Argilla")
         hf_dataset = Dataset.from_pandas(dataframe)
         rg_dataset.records.log(records=hf_dataset)
     return original_dataframe
+def validate_argilla_dataset_name(
+    dataset_name: str,
+    final_dataset: pd.DataFrame,
+    oauth_token: Union[OAuthToken, None] = None,
+    progress=gr.Progress(),
+) -> str:
+    progress(0, desc="Validating dataset configuration")
+    hf_user = HfApi().whoami(token=oauth_token.token)["name"]
+    client = get_argilla_client()
+    if dataset_name is None or dataset_name == "":
+        raise gr.Error("Dataset name is required")
+    dataset = client.datasets(name=dataset_name, workspace=hf_user)
+    if dataset:
+        raise gr.Error(f"Dataset {dataset_name} already exists")
+    return final_dataset
 def upload_pipeline_code(
     pipeline_code,
     org_name,
         # Add a header for the full dataset generation section
         gr.Markdown("## Generate full dataset")
         gr.Markdown(
+            "Once you're satisfied with the sample, generate a larger dataset and push it to Argilla or the Hugging Face Hub."
         )
         with gr.Column() as push_to_hub_ui:
                     maximum=500,
                     info="The number of rows in the dataset. Note that you are able to generate more rows at once but that this will take time.",
                 )
             with gr.Tab(label="Argilla"):
+                if get_argilla_client():
+                    with gr.Row(variant="panel"):
+                        dataset_name = gr.Textbox(
+                            label="Dataset name",
+                            placeholder="dataset_name",
+                            value="my-distiset",
+                        )
+                    with gr.Row(variant="panel"):
+                        btn_generate_full_dataset_copy = gr.Button(
+                            value="Generate", variant="primary", scale=2
+                        )
+                        btn_generate_and_push_to_argilla = gr.Button(
+                            value="Generate and Push to Argilla",
+                            variant="primary",
+                            scale=2,
+                        )
+                        btn_push_to_argilla = gr.Button(
+                            value="Push to Argilla", variant="primary", scale=2
+                        )
+                else:
+                    gr.Markdown(
+                        "Please add `ARGILLA_API_URL` and `ARGILLA_API_KEY` to use Argilla."
                     )
             with gr.Tab("Hugging Face Hub"):
                 with gr.Row(variant="panel"):
                     <a href="{argilla_api_url}" target="_blank" style="color: #1565c0; text-decoration: none;">
                         {argilla_api_url}
                     </a>
+                    <br>Unfamiliar with Argilla? Here are some docs to help you get started:
+                    <br>• <a href="https://docs.argilla.io/latest/getting_started/quickstart/#sign-in-into-the-argilla-ui" target="_blank">Login with OAuth</a>
+                    <br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">Curate your data</a>
+                    <br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">Export your data</a>
                 </p>
             </div>
             """,
     )
     btn_generate_and_push_to_argilla.click(
+        fn=validate_argilla_dataset_name,
+        inputs=[dataset_name, final_dataset],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
         fn=hide_success_message,
         outputs=[success_message],
+    ).success(
         fn=generate_dataset,
         inputs=[system_prompt, num_turns, num_rows],
         outputs=[final_dataset],
         show_progress=True,
+    ).success(
         fn=push_to_argilla,
         inputs=[final_dataset, dataset_name],
         outputs=[final_dataset],
     btn_push_to_argilla.click(
         fn=hide_success_message,
         outputs=[success_message],
+    ).success(
+        fn=validate_argilla_dataset_name,
+        inputs=[dataset_name, final_dataset],
+        outputs=[final_dataset],
+        show_progress=True,
+    ).success(
         fn=push_to_argilla,
         inputs=[final_dataset, dataset_name],
         outputs=[final_dataset],

src/distilabel_dataset_generator/pipelines/sft.py CHANGED Viewed

@@ -189,7 +189,7 @@ with Pipeline(name="sft") as pipeline:
             tokenizer_id=MODEL,
             magpie_pre_query_template="llama3",
             generation_kwargs={{
-                "temperature": 0.8,
                 "do_sample": True,
                 "max_new_tokens": 2048,
                 "stop_sequences": {_STOP_SEQUENCES}
@@ -231,7 +231,7 @@ def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
                 api_key=_get_next_api_key(),
                 magpie_pre_query_template="llama3",
                 generation_kwargs={
-                    "temperature": 0.8,
                     "do_sample": True,
                     "max_new_tokens": 256 if is_sample else 512,
                     "stop_sequences": _STOP_SEQUENCES,
@@ -250,7 +250,7 @@ def get_magpie_generator(num_turns, num_rows, system_prompt, is_sample):
                 api_key=_get_next_api_key(),
                 magpie_pre_query_template="llama3",
                 generation_kwargs={
-                    "temperature": 0.8,
                     "do_sample": True,
                     "max_new_tokens": 256 if is_sample else 1024,
                     "stop_sequences": _STOP_SEQUENCES,

             tokenizer_id=MODEL,
             magpie_pre_query_template="llama3",
             generation_kwargs={{
+                "temperature": 1,
                 "do_sample": True,
                 "max_new_tokens": 2048,
                 "stop_sequences": {_STOP_SEQUENCES}
                 api_key=_get_next_api_key(),
                 magpie_pre_query_template="llama3",
                 generation_kwargs={
+                    "temperature": 1,
                     "do_sample": True,
                     "max_new_tokens": 256 if is_sample else 512,
                     "stop_sequences": _STOP_SEQUENCES,
                 api_key=_get_next_api_key(),
                 magpie_pre_query_template="llama3",
                 generation_kwargs={
+                    "temperature": 1,
                     "do_sample": True,
                     "max_new_tokens": 256 if is_sample else 1024,
                     "stop_sequences": _STOP_SEQUENCES,

src/distilabel_dataset_generator/utils.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 import argilla as rg
 import gradio as gr
@@ -84,10 +85,13 @@ def swap_visibilty(oauth_token: OAuthToken = None):
         return gr.update(elem_classes=["main_ui_logged_out"])
-def get_argilla_client():
-    return rg.Argilla(
-        api_url=os.getenv("ARGILLA_API_URL_SDG_REVIEWER")
-        or os.getenv("ARGILLA_API_URL"),
-        api_key=os.getenv("ARGILLA_API_KEY_SDG_REVIEWER")
-        or os.getenv("ARGILLA_API_KEY"),
-    )

 import os
+from typing import Union
 import argilla as rg
 import gradio as gr
         return gr.update(elem_classes=["main_ui_logged_out"])
+def get_argilla_client() -> Union[rg.Argilla, None]:
+    try:
+        return rg.Argilla(
+            api_url=os.getenv("ARGILLA_API_URL_SDG_REVIEWER")
+            or os.getenv("ARGILLA_API_URL"),
+            api_key=os.getenv("ARGILLA_API_KEY_SDG_REVIEWER")
+            or os.getenv("ARGILLA_API_KEY"),
+        )
+    except Exception:
+        return None