Commit: e4b6cc5
Parent: bff4352

feat: replace pipeline with individual generator

Files changed:
- app.py (+1, -2)
- src/distilabel_dataset_generator/sft.py (+33, -12)
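In practice, the change replaces running a full distilabel Pipeline with driving the MagpieGenerator step directly: the step is loaded once with .load() and rows are pulled one at a time from .process() into a pandas DataFrame (see the sft.py diff below). A minimal sketch of that pattern, assuming a working Hugging Face Inference Endpoints setup; the n_turns argument and the magpie_pre_query_template setting are assumptions, since those lines are not visible in the hunks:

# Sketch of the "individual generator" pattern, not the exact app code.
import pandas as pd
from distilabel.llms import InferenceEndpointsLLM
from distilabel.steps.tasks import MagpieGenerator

MODEL = "meta-llama/Meta-Llama-3.1-70B-Instruct"

magpie_step = MagpieGenerator(
    llm=InferenceEndpointsLLM(
        model_id=MODEL,
        tokenizer_id=MODEL,
        magpie_pre_query_template="llama3",  # assumed; not shown in the hunks below
    ),
    n_turns=1,                               # assumed argument name
    num_rows=5,
    system_prompt="You are a helpful chemistry assistant.",
)

magpie_step.load()  # previously a Pipeline handled loading and running the step

# Each process() call yields batches; the diff takes the first row of the first batch.
rows = [next(magpie_step.process())[0][0] for _ in range(5)]
df = pd.DataFrame(rows)  # single-turn rows expose "instruction" and "response" fields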
app.py CHANGED

@@ -9,5 +9,4 @@ demo = gr.TabbedInterface(
     head="⚗️ Distilabel Dataset Generator",
 )
 
-
-demo.launch()
+demo.launch()
src/distilabel_dataset_generator/sft.py CHANGED

@@ -1,4 +1,5 @@
 import gradio as gr
+import pandas as pd
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.pipeline import Pipeline
 from distilabel.steps.tasks import MagpieGenerator, TextGeneration

@@ -111,13 +112,17 @@ The prompt you write should follow the same style and structure as the following
 User dataset description:
 """
 
-MODEL = "meta-llama/Meta-Llama-3.1-
+MODEL = "meta-llama/Meta-Llama-3.1-70B-Instruct"
 
 generate_description = TextGeneration(
     llm=InferenceEndpointsLLM(
         model_id=MODEL,
         tokenizer_id=MODEL,
-        generation_kwargs={
+        generation_kwargs={
+            "temperature": 0.8,
+            "max_new_tokens": 2048,
+            "do_sample": True,
+        },
     ),
     use_system_prompt=True,
 )

@@ -137,7 +142,7 @@ def _generate_system_prompt(_dataset_description):
     )[0]["generation"]
 
 
-def _generate_dataset(_system_prompt, _num_turns=1, _num_rows=1):
+def _generate_dataset(_system_prompt, _num_turns=1, _num_rows=5):
     with Pipeline(name="sft") as pipeline:
         magpie_step = MagpieGenerator(
             llm=InferenceEndpointsLLM(

@@ -152,16 +157,28 @@ def _generate_dataset(_system_prompt, _num_turns=1, _num_rows=1):
             num_rows=_num_rows,
             system_prompt=_system_prompt,
         )
-
-
-
+    magpie_step.load()
+    if _num_turns == 1:
+        outputs = {"instruction": [], "response": []}
+        for _ in range(_num_rows):
+            entry = next(magpie_step.process())[0][0]
+            outputs["instruction"].append(entry["instruction"])
+            outputs["response"].append(entry["response"])
+    else:
+        outputs = {"conversation": []}
+        for _ in range(_num_rows):
+            entry = next(magpie_step.process())[0][0]
+            outputs["conversation"].append(entry["conversation"])
+    return pd.DataFrame(outputs)
 
 
 with gr.Blocks(
-    title="⚗️ Distilabel Dataset Generator",
+    title="⚗️ Distilabel Dataset Generator",
+    head="⚗️ Distilabel Dataset Generator",
 ) as demo:
     dataset_description = gr.Textbox(
-        label="Provide a description of the dataset",
+        label="Provide a description of the dataset",
+        value="A chemistry dataset for an assistant that explains chemical reactions and formulas",
     )
 
     btn_generate_system_prompt = gr.Button(

@@ -177,10 +194,10 @@ with gr.Blocks(
     )
 
     btn_generate_sample_dataset = gr.Button(
-        value="🧪 Generate Sample Dataset of
+        value="🧪 Generate Sample Dataset of 5 rows and a single turn"
     )
 
-    table = gr.Dataframe(label="Generated Dataset")
+    table = gr.Dataframe(label="Generated Dataset", wrap=True)
 
     btn_generate_sample_dataset.click(
         fn=_generate_dataset,

@@ -190,9 +207,13 @@ with gr.Blocks(
 
     with gr.Row(variant="panel"):
         with gr.Column():
-            num_turns = gr.Number(
+            num_turns = gr.Number(
+                value=1, label="Number of turns in the conversation", minimum=1
+            )
         with gr.Column():
-            num_rows = gr.Number(
+            num_rows = gr.Number(
+                value=1, label="Number of rows in the dataset", minimum=1
+            )
 
     dataset_name_push_to_hub = gr.Textbox(label="Dataset Name to push to Hub")
 
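For reference, a sketch of how the DataFrame returned by _generate_dataset reaches the UI. Only fn=_generate_dataset and the component definitions are visible in the hunks above, so the inputs/outputs wiring and the stub below are assumptions rather than the app's actual code:

import gradio as gr
import pandas as pd


def _generate_dataset(_system_prompt, _num_turns=1, _num_rows=5):
    # Stub with the same signature and return type as the real function in sft.py.
    n = int(_num_rows)  # gr.Number may pass a float
    return pd.DataFrame({"instruction": ["..."] * n, "response": ["..."] * n})


with gr.Blocks(title="⚗️ Distilabel Dataset Generator") as demo:
    system_prompt = gr.Textbox(label="System prompt")
    with gr.Row(variant="panel"):
        with gr.Column():
            num_turns = gr.Number(
                value=1, label="Number of turns in the conversation", minimum=1
            )
        with gr.Column():
            num_rows = gr.Number(
                value=1, label="Number of rows in the dataset", minimum=1
            )
    btn_generate_sample_dataset = gr.Button(
        value="🧪 Generate Sample Dataset of 5 rows and a single turn"
    )
    table = gr.Dataframe(label="Generated Dataset", wrap=True)

    btn_generate_sample_dataset.click(
        fn=_generate_dataset,
        inputs=[system_prompt, num_turns, num_rows],  # assumed wiring
        outputs=table,  # the DataFrame renders directly in gr.Dataframe
    )

demo.launch()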