synthetic-data-generator

Build error

App Files Files Community

davidberenstein1957 committed on Jan 10

Commit

371c76b

1 Parent(s): 3c6a88c

Update version to 0.1.6, remove requirements.txt, and enhance dataset handling in pipelines. Added Gradio support and improved LLM class retrieval. Commented out HF_TOKEN in example deployment script.

Browse files

Files changed (6) hide show

pyproject.toml +1 -1
requirements.txt +0 -1
src/synthetic_dataset_generator/_distiset.py +16 -6
src/synthetic_dataset_generator/pipelines/base.py +18 -4
src/synthetic_dataset_generator/pipelines/chat.py +3 -20
src/synthetic_dataset_generator/pipelines/textcat.py +4 -32

pyproject.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "synthetic-dataset-generator"
-version = "0.1.5"
 description = "Build datasets using natural language"
 authors = [
     {name = "davidberenstein1957", email = "[email protected]"},

 [project]
 name = "synthetic-dataset-generator"
+version = "0.1.6"
 description = "Build datasets using natural language"
 authors = [
     {name = "davidberenstein1957", email = "[email protected]"},

requirements.txt DELETED Viewed

	@@ -1 +0,0 @@
1	- -e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator

src/synthetic_dataset_generator/_distiset.py CHANGED Viewed

@@ -2,6 +2,7 @@ from typing import Optional
 import distilabel
 import distilabel.distiset
 from distilabel.utils.card.dataset_card import (
     DistilabelDatasetCard,
     size_categories_parser,
@@ -81,14 +82,23 @@ class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
                 dataset[0] if not isinstance(dataset, dict) else dataset["train"][0]
             )
-        keys = list(sample_records.keys())
-        if len(keys) != 2 or not (
-            ("label" in keys and "text" in keys)
-            or ("labels" in keys and "text" in keys)
         ):
             task_categories = ["text-classification"]
-        elif "prompt" in keys or "messages" in keys:
-            task_categories = ["text-generation", "text2text-generation"]
         readme_metadata = {}
         if repo_id and token:

 import distilabel
 import distilabel.distiset
+import gradio as gr
 from distilabel.utils.card.dataset_card import (
     DistilabelDatasetCard,
     size_categories_parser,
                 dataset[0] if not isinstance(dataset, dict) else dataset["train"][0]
             )
+        # Derive the task categories from the column names of self["default"].
+        columns = self["default"].column_names
+        if ("label" in columns and "text" in columns) or (
+            "labels" in columns and "text" in columns
         ):
             task_categories = ["text-classification"]
+        elif ("prompt" in columns and "completion" in columns) or (
+            "messages" in columns
+        ):
+            task_categories = ["text-generation", "text2text-generation"]
+        else:
+            # Unrecognised column layout: keep the categories empty and
+            # notify the user through Gradio instead of failing.
+            task_categories = []
+            gr.Info(
+                f"No task categories found for dataset with columns: {columns}. "
+                "Please notify the distilabel team if you think this is an error."
+            )
         readme_metadata = {}
         if repo_id and token:

src/synthetic_dataset_generator/pipelines/base.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import math
 import random
-import gradio as gr
 from distilabel.llms import ClientvLLM, InferenceEndpointsLLM, OllamaLLM, OpenAILLM
 from distilabel.steps.tasks import TextGeneration
@@ -9,7 +8,6 @@ from synthetic_dataset_generator.constants import (
     API_KEYS,
     DEFAULT_BATCH_SIZE,
     HUGGINGFACE_BASE_URL,
-    MAGPIE_PRE_QUERY_TEMPLATE,
     MODEL,
     OLLAMA_BASE_URL,
     OPENAI_BASE_URL,
@@ -62,6 +60,19 @@ def get_rewriten_prompts(prompt: str, num_rows: int):
     return prompt_rewrites
 def _get_llm(use_magpie_template=False, **kwargs):
     if OPENAI_BASE_URL:
         llm = OpenAILLM(
@@ -100,6 +111,7 @@ def _get_llm(use_magpie_template=False, **kwargs):
             model=MODEL,
             host=OLLAMA_BASE_URL,
             tokenizer_id=TOKENIZER_ID or MODEL,
             **kwargs,
         )
     elif HUGGINGFACE_BASE_URL:
@@ -108,6 +120,7 @@ def _get_llm(use_magpie_template=False, **kwargs):
             api_key=_get_next_api_key(),
             base_url=HUGGINGFACE_BASE_URL,
             tokenizer_id=TOKENIZER_ID or MODEL,
             **kwargs,
         )
     elif VLLM_BASE_URL:
@@ -119,6 +132,7 @@ def _get_llm(use_magpie_template=False, **kwargs):
             model=MODEL,
             tokenizer=TOKENIZER_ID or MODEL,
             api_key=_get_next_api_key(),
             **kwargs,
         )
     else:
@@ -126,7 +140,7 @@ def _get_llm(use_magpie_template=False, **kwargs):
             api_key=_get_next_api_key(),
             tokenizer_id=TOKENIZER_ID or MODEL,
             model_id=MODEL,
-            magpie_pre_query_template=MAGPIE_PRE_QUERY_TEMPLATE,
             **kwargs,
         )
@@ -138,4 +152,4 @@ try:
     llm.load()
     llm.generate([[{"content": "Hello, world!", "role": "user"}]])
 except Exception as e:
-    gr.Error(f"Error loading {llm.__class__.__name__}: {e}")

 import math
 import random
 from distilabel.llms import ClientvLLM, InferenceEndpointsLLM, OllamaLLM, OpenAILLM
 from distilabel.steps.tasks import TextGeneration
     API_KEYS,
     DEFAULT_BATCH_SIZE,
     HUGGINGFACE_BASE_URL,
     MODEL,
     OLLAMA_BASE_URL,
     OPENAI_BASE_URL,
     return prompt_rewrites
+def _get_llm_class() -> str:
+    """Return the name of the LLM class selected by the configured base URLs.
+
+    The base-URL constants are checked in a fixed priority order and the
+    class name paired with the first truthy one is returned. When none of
+    them is set, the result is "InferenceEndpointsLLM".
+    """
+    # Order matters: the first configured base URL wins.
+    priority = (
+        (OPENAI_BASE_URL, "OpenAILLM"),
+        (OLLAMA_BASE_URL, "OllamaLLM"),
+        (HUGGINGFACE_BASE_URL, "InferenceEndpointsLLM"),
+        (VLLM_BASE_URL, "ClientvLLM"),
+    )
+    for base_url, class_name in priority:
+        if base_url:
+            return class_name
+    return "InferenceEndpointsLLM"
 def _get_llm(use_magpie_template=False, **kwargs):
     if OPENAI_BASE_URL:
         llm = OpenAILLM(
             model=MODEL,
             host=OLLAMA_BASE_URL,
             tokenizer_id=TOKENIZER_ID or MODEL,
+            use_magpie_template=use_magpie_template,
             **kwargs,
         )
     elif HUGGINGFACE_BASE_URL:
             api_key=_get_next_api_key(),
             base_url=HUGGINGFACE_BASE_URL,
             tokenizer_id=TOKENIZER_ID or MODEL,
+            use_magpie_template=use_magpie_template,
             **kwargs,
         )
     elif VLLM_BASE_URL:
             model=MODEL,
             tokenizer=TOKENIZER_ID or MODEL,
             api_key=_get_next_api_key(),
+            use_magpie_template=use_magpie_template,
             **kwargs,
         )
     else:
             api_key=_get_next_api_key(),
             tokenizer_id=TOKENIZER_ID or MODEL,
             model_id=MODEL,
+            use_magpie_template=use_magpie_template,
             **kwargs,
         )
     llm.load()
     llm.generate([[{"content": "Hello, world!", "role": "user"}]])
 except Exception as e:
+    raise Exception(f"Error loading {llm.__class__.__name__}: {e}")

src/synthetic_dataset_generator/pipelines/chat.py CHANGED Viewed

@@ -1,12 +1,10 @@
 from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
 from synthetic_dataset_generator.constants import (
-    BASE_URL,
     MAGPIE_PRE_QUERY_TEMPLATE,
     MAX_NUM_TOKENS,
-    MODEL,
 )
-from synthetic_dataset_generator.pipelines.base import _get_llm
 INFORMATION_SEEKING_PROMPT = (
     "You are an AI assistant designed to provide accurate and concise information on a wide"
@@ -237,28 +235,13 @@ import os
 from distilabel.pipeline import Pipeline
 from distilabel.steps import KeepColumns
 from distilabel.steps.tasks import MagpieGenerator
-from distilabel.llms import InferenceEndpointsLLM
-MODEL = "{MODEL}"
-BASE_URL = "{BASE_URL}"
 SYSTEM_PROMPT = "{system_prompt}"
-os.environ["API_KEY"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
 with Pipeline(name="sft") as pipeline:
     magpie = MagpieGenerator(
-        llm=InferenceEndpointsLLM(
-            model_id=MODEL,
-            tokenizer_id=MODEL,
-            base_url=BASE_URL,
-            magpie_pre_query_template="llama3",
-            generation_kwargs={{
-                "temperature": {temperature},
-                "do_sample": True,
-                "max_new_tokens": {MAX_NUM_TOKENS},
-                "stop_sequences": {_STOP_SEQUENCES}
-            }},
-            api_key=os.environ["API_KEY"],
-        ),
         n_turns={num_turns},
         num_rows={num_rows},
         batch_size=1,

 from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
 from synthetic_dataset_generator.constants import (
     MAGPIE_PRE_QUERY_TEMPLATE,
     MAX_NUM_TOKENS,
 )
+from synthetic_dataset_generator.pipelines.base import _get_llm, _get_llm_class
 INFORMATION_SEEKING_PROMPT = (
     "You are an AI assistant designed to provide accurate and concise information on a wide"
 from distilabel.pipeline import Pipeline
 from distilabel.steps import KeepColumns
 from distilabel.steps.tasks import MagpieGenerator
+from distilabel.llms import {_get_llm_class()}
 SYSTEM_PROMPT = "{system_prompt}"
 with Pipeline(name="sft") as pipeline:
     magpie = MagpieGenerator(
+        llm={_get_llm_class()}.from_json({_get_llm().model_dump_json()}),
         n_turns={num_turns},
         num_rows={num_rows},
         batch_size=1,

src/synthetic_dataset_generator/pipelines/textcat.py CHANGED Viewed

@@ -9,11 +9,9 @@ from distilabel.steps.tasks import (
 from pydantic import BaseModel, Field
 from synthetic_dataset_generator.constants import (
-    BASE_URL,
     MAX_NUM_TOKENS,
-    MODEL,
 )
-from synthetic_dataset_generator.pipelines.base import _get_llm
 from synthetic_dataset_generator.utils import get_preprocess_labels
 PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
@@ -131,39 +129,21 @@ def generate_pipeline_code(
     temperature: float = 0.9,
 ) -> str:
     labels = get_preprocess_labels(labels)
-    MODEL_ARG = "model_id" if BASE_URL else "model"
-    MODEL_CLASS = "InferenceEndpointsLLM" if BASE_URL else "OpenAILLM"
     base_code = f"""
 # Requirements: `pip install distilabel[hf-inference-endpoints]`
 import os
 import random
-from distilabel.llms import InferenceEndpointsLLM
 from distilabel.pipeline import Pipeline
 from distilabel.steps import LoadDataFromDicts, KeepColumns
 from distilabel.steps.tasks import {"GenerateTextClassificationData" if num_labels == 1 else "GenerateTextClassificationData, TextClassification"}
-MODEL = "{MODEL}"
-BASE_URL = "{BASE_URL}"
-TEXT_CLASSIFICATION_TASK = "{system_prompt}"
-os.environ["API_KEY"] = (
-    "hf_xxx"  # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
-)
 with Pipeline(name="textcat") as pipeline:
     task_generator = LoadDataFromDicts(data=[{{"task": TEXT_CLASSIFICATION_TASK}}])
     textcat_generation = GenerateTextClassificationData(
-        llm={MODEL_CLASS}(
-            {MODEL_ARG}=MODEL,
-            base_url=BASE_URL,
-            api_key=os.environ["API_KEY"],
-            generation_kwargs={{
-                "temperature": {temperature},
-                "max_new_tokens": {MAX_NUM_TOKENS},
-                "top_p": 0.95,
-            }},
-        ),
         seed=random.randint(0, 2**32 - 1),
         difficulty={None if difficulty == "mixed" else repr(difficulty)},
         clarity={None if clarity == "mixed" else repr(clarity)},
@@ -196,15 +176,7 @@ with Pipeline(name="textcat") as pipeline:
     )
     textcat_labeller = TextClassification(
-        llm={MODEL_CLASS}(
-            {MODEL_ARG}=MODEL,
-            base_url=BASE_URL,
-            api_key=os.environ["API_KEY"],
-            generation_kwargs={{
-                "temperature": 0.8,
-                "max_new_tokens": {MAX_NUM_TOKENS},
-            }},
-        ),
         n={num_labels},
         available_labels={labels},
         context=TEXT_CLASSIFICATION_TASK,

 from pydantic import BaseModel, Field
 from synthetic_dataset_generator.constants import (
     MAX_NUM_TOKENS,
 )
+from synthetic_dataset_generator.pipelines.base import _get_llm, _get_llm_class
 from synthetic_dataset_generator.utils import get_preprocess_labels
 PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
     temperature: float = 0.9,
 ) -> str:
     labels = get_preprocess_labels(labels)
     base_code = f"""
 # Requirements: `pip install distilabel[hf-inference-endpoints]`
 import os
 import random
+from distilabel.llms import {_get_llm_class()}
 from distilabel.pipeline import Pipeline
 from distilabel.steps import LoadDataFromDicts, KeepColumns
 from distilabel.steps.tasks import {"GenerateTextClassificationData" if num_labels == 1 else "GenerateTextClassificationData, TextClassification"}
+TEXT_CLASSIFICATION_TASK = "{system_prompt}"
 with Pipeline(name="textcat") as pipeline:
     task_generator = LoadDataFromDicts(data=[{{"task": TEXT_CLASSIFICATION_TASK}}])
     textcat_generation = GenerateTextClassificationData(
+        llm={_get_llm_class()}.from_json({_get_llm().model_dump_json()}),
         seed=random.randint(0, 2**32 - 1),
         difficulty={None if difficulty == "mixed" else repr(difficulty)},
         clarity={None if clarity == "mixed" else repr(clarity)},
     )
     textcat_labeller = TextClassification(
+        llm={_get_llm_class()}.from_json({_get_llm().model_dump_json()}),
         n={num_labels},
         available_labels={labels},
         context=TEXT_CLASSIFICATION_TASK,