synthetic-data-generator

Build error

App Files Files Community

davidberenstein1957 committed on Dec 17, 2024

Commit

4106f96

1 Parent(s): b6646ba

fix openai compatibility

Browse files

Files changed (5) hide show

README.md +6 -4
app.py +5 -0
src/synthetic_dataset_generator/apps/chat.py +3 -2
src/synthetic_dataset_generator/constants.py +9 -8
src/synthetic_dataset_generator/pipelines/textcat.py +66 -27

README.md CHANGED Viewed

@@ -82,13 +82,15 @@ Optionally, you can set the following environment variables to customize the gen
 - `MAX_NUM_ROWS`: The maximum number of rows to generate, defaults to `1000`.
 - `DEFAULT_BATCH_SIZE`: The default batch size to use for generating the dataset, defaults to `5`.
-Optionally, you can use different models and APIs.
-- `BASE_URL`: The base URL for any OpenAI compatible API, e.g. `https://api-inference.huggingface.co/v1/`, `https://api.openai.com/v1/`.
-- `MODEL`: The model to use for generating the dataset, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`, `gpt-4o`.
 - `API_KEY`: The API key to use for the generation API, e.g. `hf_...`, `sk-...`. If not provided, it will default to the provided `HF_TOKEN` environment variable.
-- `MAGPIE_PRE_QUERY_TEMPLATE`: Enforce setting the pre-query template for Magpie. Llama3 and Qwen2 are supported out of the box and will use `"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"` and `"<|im_start|>user\n"` respectively. For other models, you can pass a custom pre-query template string.
 Optionally, you can also push your datasets to Argilla for further curation by setting the following environment variables:

 - `MAX_NUM_ROWS`: The maximum number of rows to generate, defaults to `1000`.
 - `DEFAULT_BATCH_SIZE`: The default batch size to use for generating the dataset, defaults to `5`.
+Optionally, you can use different models and APIs. For providers outside of Hugging Face, we provide an integration through [LiteLLM](https://docs.litellm.ai/docs/providers).
+- `BASE_URL`: The base URL for any OpenAI compatible API, e.g. `https://api-inference.huggingface.co/v1/`, `https://api.openai.com/v1/`, `http://127.0.0.1:11434/v1/`.
+- `MODEL`: The model to use for generating the dataset, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`, `openai/gpt-4o`, `ollama/llama3.1`.
 - `API_KEY`: The API key to use for the generation API, e.g. `hf_...`, `sk-...`. If not provided, it will default to the provided `HF_TOKEN` environment variable.
+SFT and Chat Data generation is only supported with Hugging Face Inference Endpoints, and you can set the following environment variable to use it with models other than Llama3 and Qwen2.
+- `MAGPIE_PRE_QUERY_TEMPLATE`: Enforce setting the pre-query template for Magpie, which is only supported with Hugging Face Inference Endpoints. Llama3 and Qwen2 are supported out of the box and will use `"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"` and `"<|im_start|>user\n"` respectively. For other models, you can pass a custom pre-query template string.
 Optionally, you can also push your datasets to Argilla for further curation by setting the following environment variables:

app.py CHANGED Viewed

@@ -1,3 +1,8 @@
 from synthetic_dataset_generator import launch
 launch()

+import os
 from synthetic_dataset_generator import launch
+os.environ["BASE_URL"] = "http://localhost:11434/v1"
+os.environ["MODEL"] = "llama3.1"
 launch()

src/synthetic_dataset_generator/apps/chat.py CHANGED Viewed

@@ -20,6 +20,7 @@ from synthetic_dataset_generator.apps.base import (
     validate_push_to_hub,
 )
 from synthetic_dataset_generator.constants import (
     DEFAULT_BATCH_SIZE,
     MODEL,
     SFT_AVAILABLE,
@@ -413,8 +414,8 @@ with gr.Blocks() as app:
                     [
                         "## Supervised Fine-Tuning not available",
                         "",
-                        f"This tool relies on the [Magpie](https://arxiv.org/abs/2406.08464) prequery template, which is not implemented for the {MODEL} model.",
-                        "Use Llama3 or Qwen2 models or [implement another magpie prequery template](https://github.com/argilla-io/distilabel/pull/778/files).",
                     ]
                 )
             )

     validate_push_to_hub,
 )
 from synthetic_dataset_generator.constants import (
+    BASE_URL,
     DEFAULT_BATCH_SIZE,
     MODEL,
     SFT_AVAILABLE,
                     [
                         "## Supervised Fine-Tuning not available",
                         "",
+                        f"This tool relies on the [Magpie](https://arxiv.org/abs/2406.08464) prequery template, which is not implemented for the {MODEL} with {BASE_URL}.",
+                        "Use Llama3 or Qwen2 models with Hugging Face Inference Endpoints.",
                     ]
                 )
             )

src/synthetic_dataset_generator/constants.py CHANGED Viewed

@@ -19,6 +19,8 @@ MAX_NUM_TOKENS = int(os.getenv("MAX_NUM_TOKENS", 2048))
 MAX_NUM_ROWS: str | int = int(os.getenv("MAX_NUM_ROWS", 1000))
 DEFAULT_BATCH_SIZE = int(os.getenv("DEFAULT_BATCH_SIZE", 5))
 MODEL = os.getenv("MODEL", "meta-llama/Meta-Llama-3.1-8B-Instruct")
 _API_KEY = os.getenv("API_KEY")
 if _API_KEY:
     API_KEYS = [_API_KEY]
@@ -27,12 +29,9 @@ else:
         os.getenv(f"HF_TOKEN_{i}") for i in range(1, 10)
     ]
 API_KEYS = [token for token in API_KEYS if token]
-BASE_URL = os.getenv("BASE_URL", "https://api-inference.huggingface.co/v1/")
-if BASE_URL != "https://api-inference.huggingface.co/v1/" and len(API_KEYS) == 0:
-    raise ValueError(
-        "API_KEY is not set. Ensure you have set the API_KEY environment variable that has access to the Hugging Face Inference Endpoints."
-    )
 llama_options = ["llama3", "llama-3", "llama 3"]
 qwen_options = ["qwen2", "qwen-2", "qwen 2"]
 if os.getenv("MAGPIE_PRE_QUERY_TEMPLATE"):
@@ -54,14 +53,16 @@ elif MODEL.lower() in qwen_options or any(
 ):
     SFT_AVAILABLE = True
     MAGPIE_PRE_QUERY_TEMPLATE = "qwen2"
-else:
     SFT_AVAILABLE = False
     warnings.warn(
-        "`SFT_AVAILABLE` is set to `False` because the model is not a Qwen or Llama model."
     )
     MAGPIE_PRE_QUERY_TEMPLATE = None
 # Embeddings
 STATIC_EMBEDDING_MODEL = "minishlab/potion-base-8M"

 MAX_NUM_ROWS: str | int = int(os.getenv("MAX_NUM_ROWS", 1000))
 DEFAULT_BATCH_SIZE = int(os.getenv("DEFAULT_BATCH_SIZE", 5))
 MODEL = os.getenv("MODEL", "meta-llama/Meta-Llama-3.1-8B-Instruct")
+BASE_URL = os.getenv("BASE_URL", default=None)
 _API_KEY = os.getenv("API_KEY")
 if _API_KEY:
     API_KEYS = [_API_KEY]
         os.getenv(f"HF_TOKEN_{i}") for i in range(1, 10)
     ]
 API_KEYS = [token for token in API_KEYS if token]
+# Determine if SFT is available
+SFT_AVAILABLE = False
 llama_options = ["llama3", "llama-3", "llama 3"]
 qwen_options = ["qwen2", "qwen-2", "qwen 2"]
 if os.getenv("MAGPIE_PRE_QUERY_TEMPLATE"):
 ):
     SFT_AVAILABLE = True
     MAGPIE_PRE_QUERY_TEMPLATE = "qwen2"
+if BASE_URL:
     SFT_AVAILABLE = False
+if not SFT_AVAILABLE:
     warnings.warn(
+        message="`SFT_AVAILABLE` is set to `False`. Use Hugging Face Inference Endpoints to generate chat data."
     )
     MAGPIE_PRE_QUERY_TEMPLATE = None
 # Embeddings
 STATIC_EMBEDDING_MODEL = "minishlab/potion-base-8M"

src/synthetic_dataset_generator/pipelines/textcat.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import random
 from typing import List
-from distilabel.llms import InferenceEndpointsLLM
 from distilabel.steps.tasks import (
     GenerateTextClassificationData,
     TextClassification,
@@ -61,39 +61,66 @@ class TextClassificationTask(BaseModel):
 def get_prompt_generator():
-    prompt_generator = TextGeneration(
-        llm=InferenceEndpointsLLM(
             api_key=_get_next_api_key(),
             model_id=MODEL,
             base_url=BASE_URL,
-            structured_output={"format": "json", "schema": TextClassificationTask},
-            generation_kwargs={
-                "temperature": 0.8,
-                "max_new_tokens": MAX_NUM_TOKENS,
-                "do_sample": True,
-            },
-        ),
         system_prompt=PROMPT_CREATION_PROMPT,
         use_system_prompt=True,
     )
     prompt_generator.load()
     return prompt_generator
 def get_textcat_generator(difficulty, clarity, temperature, is_sample):
-    textcat_generator = GenerateTextClassificationData(
-        llm=InferenceEndpointsLLM(
             model_id=MODEL,
             base_url=BASE_URL,
             api_key=_get_next_api_key(),
-            generation_kwargs={
-                "temperature": temperature,
-                "max_new_tokens": 256 if is_sample else MAX_NUM_TOKENS,
-                "do_sample": True,
-                "top_k": 50,
-                "top_p": 0.95,
-            },
-        ),
         difficulty=None if difficulty == "mixed" else difficulty,
         clarity=None if clarity == "mixed" else clarity,
         seed=random.randint(0, 2**32 - 1),
@@ -103,16 +130,28 @@ def get_textcat_generator(difficulty, clarity, temperature, is_sample):
 def get_labeller_generator(system_prompt, labels, multi_label):
-    labeller_generator = TextClassification(
-        llm=InferenceEndpointsLLM(
             model_id=MODEL,
             base_url=BASE_URL,
             api_key=_get_next_api_key(),
-            generation_kwargs={
-                "temperature": 0.7,
-                "max_new_tokens": MAX_NUM_TOKENS,
-            },
-        ),
         context=system_prompt,
         available_labels=labels,
         n=len(labels) if multi_label else 1,

 import random
 from typing import List
+from distilabel.llms import InferenceEndpointsLLM, OpenAILLM
 from distilabel.steps.tasks import (
     GenerateTextClassificationData,
     TextClassification,
 def get_prompt_generator():
+    structured_output = {
+        "format": "json",
+        "schema": TextClassificationTask,
+    }
+    generation_kwargs = {
+        "temperature": 0.8,
+        "max_new_tokens": MAX_NUM_TOKENS,
+    }
+    if BASE_URL:
+        llm = OpenAILLM(
+            model=MODEL,
+            base_url=BASE_URL,
+            api_key=_get_next_api_key(),
+            structured_output=structured_output,
+            generation_kwargs=generation_kwargs,
+        )
+    else:
+        generation_kwargs["do_sample"] = True
+        llm = InferenceEndpointsLLM(
             api_key=_get_next_api_key(),
             model_id=MODEL,
             base_url=BASE_URL,
+            structured_output=structured_output,
+            generation_kwargs=generation_kwargs,
+        )
+    prompt_generator = TextGeneration(
+        llm=llm,
         system_prompt=PROMPT_CREATION_PROMPT,
         use_system_prompt=True,
     )
     prompt_generator.load()
     return prompt_generator
 def get_textcat_generator(difficulty, clarity, temperature, is_sample):
+    generation_kwargs = {
+        "temperature": temperature,
+        "max_new_tokens": 256 if is_sample else MAX_NUM_TOKENS,
+        "top_p": 0.95,
+    }
+    if BASE_URL:
+        llm = OpenAILLM(
+            model=MODEL,
+            base_url=BASE_URL,
+            api_key=_get_next_api_key(),
+            generation_kwargs=generation_kwargs,
+        )
+    else:
+        generation_kwargs["do_sample"] = True
+        llm = InferenceEndpointsLLM(
             model_id=MODEL,
             base_url=BASE_URL,
             api_key=_get_next_api_key(),
+            generation_kwargs=generation_kwargs,
+        )
+    textcat_generator = GenerateTextClassificationData(
+        llm=llm,
         difficulty=None if difficulty == "mixed" else difficulty,
         clarity=None if clarity == "mixed" else clarity,
         seed=random.randint(0, 2**32 - 1),
 def get_labeller_generator(system_prompt, labels, multi_label):
+    generation_kwargs = {
+        "temperature": 0.01,
+        "max_new_tokens": MAX_NUM_TOKENS,
+    }
+    if BASE_URL:
+        llm = OpenAILLM(
+            model=MODEL,
+            base_url=BASE_URL,
+            api_key=_get_next_api_key(),
+            generation_kwargs=generation_kwargs,
+        )
+    else:
+        llm = InferenceEndpointsLLM(
             model_id=MODEL,
             base_url=BASE_URL,
             api_key=_get_next_api_key(),
+            generation_kwargs=generation_kwargs,
+        )
+    labeller_generator = TextClassification(
+        llm=llm,
         context=system_prompt,
         available_labels=labels,
         n=len(labels) if multi_label else 1,