Spaces:
Runtime error
Runtime error
Commit
·
a69bbb8
1
Parent(s):
1df21c4
feat: add support for file uploads
Browse files
src/distilabel_dataset_generator/__init__.py
CHANGED
|
@@ -7,7 +7,7 @@ from distilabel.utils.card.dataset_card import (
|
|
| 7 |
DistilabelDatasetCard,
|
| 8 |
size_categories_parser,
|
| 9 |
)
|
| 10 |
-
from huggingface_hub import DatasetCardData, HfApi
|
| 11 |
|
| 12 |
|
| 13 |
class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
|
|
|
|
| 7 |
DistilabelDatasetCard,
|
| 8 |
size_categories_parser,
|
| 9 |
)
|
| 10 |
+
from huggingface_hub import DatasetCardData, HfApi, upload_file
|
| 11 |
|
| 12 |
|
| 13 |
class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
|
src/distilabel_dataset_generator/apps/sft.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
|
|
| 1 |
import multiprocessing
|
| 2 |
import time
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
import pandas as pd
|
| 6 |
from distilabel.distiset import Distiset
|
|
|
|
| 7 |
|
| 8 |
from src.distilabel_dataset_generator.pipelines.sft import (
|
| 9 |
DEFAULT_DATASET_DESCRIPTIONS,
|
|
@@ -140,7 +142,7 @@ def generate_dataset(
|
|
| 140 |
distiset.push_to_hub(
|
| 141 |
repo_id=repo_id,
|
| 142 |
private=private,
|
| 143 |
-
include_script=
|
| 144 |
token=oauth_token,
|
| 145 |
)
|
| 146 |
|
|
@@ -155,6 +157,18 @@ def generate_dataset(
|
|
| 155 |
return pd.DataFrame(outputs)
|
| 156 |
|
| 157 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
css = """
|
| 159 |
.main_ui_logged_out{opacity: 0.3; pointer-events: none}
|
| 160 |
"""
|
|
@@ -169,9 +183,9 @@ with gr.Blocks(
|
|
| 169 |
"To push the dataset to the Hugging Face Hub you need to sign in. This will only be used for pushing the dataset not for data generation."
|
| 170 |
)
|
| 171 |
with gr.Row():
|
| 172 |
-
gr.Column(
|
| 173 |
get_login_button()
|
| 174 |
-
gr.Column(
|
| 175 |
|
| 176 |
gr.Markdown("## Iterate on a sample dataset")
|
| 177 |
with gr.Column() as main_ui:
|
|
@@ -304,6 +318,17 @@ with gr.Blocks(
|
|
| 304 |
def hide_success_message():
|
| 305 |
return gr.Markdown(visible=False)
|
| 306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
sample_dataset.change(
|
| 308 |
fn=lambda x: x,
|
| 309 |
inputs=[sample_dataset],
|
|
@@ -326,23 +351,16 @@ with gr.Blocks(
|
|
| 326 |
],
|
| 327 |
outputs=[final_dataset],
|
| 328 |
show_progress=True,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
).success(
|
| 330 |
fn=show_success_message,
|
| 331 |
inputs=[org_name, repo_name],
|
| 332 |
outputs=[success_message],
|
| 333 |
)
|
| 334 |
|
| 335 |
-
gr.Markdown("## Or run this pipeline locally with distilabel")
|
| 336 |
-
|
| 337 |
-
with gr.Accordion("Run this pipeline using distilabel", open=False):
|
| 338 |
-
pipeline_code = gr.Code(
|
| 339 |
-
value=generate_pipeline_code(
|
| 340 |
-
system_prompt.value, num_turns.value, num_rows.value
|
| 341 |
-
),
|
| 342 |
-
language="python",
|
| 343 |
-
label="Distilabel Pipeline Code",
|
| 344 |
-
)
|
| 345 |
-
|
| 346 |
system_prompt.change(
|
| 347 |
fn=generate_pipeline_code,
|
| 348 |
inputs=[system_prompt, num_turns, num_rows],
|
|
|
|
| 1 |
+
import io
|
| 2 |
import multiprocessing
|
| 3 |
import time
|
| 4 |
|
| 5 |
import gradio as gr
|
| 6 |
import pandas as pd
|
| 7 |
from distilabel.distiset import Distiset
|
| 8 |
+
from huggingface_hub import upload_file
|
| 9 |
|
| 10 |
from src.distilabel_dataset_generator.pipelines.sft import (
|
| 11 |
DEFAULT_DATASET_DESCRIPTIONS,
|
|
|
|
| 142 |
distiset.push_to_hub(
|
| 143 |
repo_id=repo_id,
|
| 144 |
private=private,
|
| 145 |
+
include_script=True,
|
| 146 |
token=oauth_token,
|
| 147 |
)
|
| 148 |
|
|
|
|
| 157 |
return pd.DataFrame(outputs)
|
| 158 |
|
| 159 |
|
| 160 |
+
def upload_pipeline_code(pipeline_code, org_name, repo_name, oauth_token):
|
| 161 |
+
with io.BytesIO(pipeline_code.encode("utf-8")) as f:
|
| 162 |
+
upload_file(
|
| 163 |
+
path_or_fileobj=f,
|
| 164 |
+
path_in_repo="pipeline.py",
|
| 165 |
+
repo_id=f"{org_name}/{repo_name}",
|
| 166 |
+
repo_type="dataset",
|
| 167 |
+
token=oauth_token,
|
| 168 |
+
commit_message="Include pipeline script",
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
|
| 172 |
css = """
|
| 173 |
.main_ui_logged_out{opacity: 0.3; pointer-events: none}
|
| 174 |
"""
|
|
|
|
| 183 |
"To push the dataset to the Hugging Face Hub you need to sign in. This will only be used for pushing the dataset not for data generation."
|
| 184 |
)
|
| 185 |
with gr.Row():
|
| 186 |
+
gr.Column()
|
| 187 |
get_login_button()
|
| 188 |
+
gr.Column()
|
| 189 |
|
| 190 |
gr.Markdown("## Iterate on a sample dataset")
|
| 191 |
with gr.Column() as main_ui:
|
|
|
|
| 318 |
def hide_success_message():
|
| 319 |
return gr.Markdown(visible=False)
|
| 320 |
|
| 321 |
+
gr.Markdown("## Or run this pipeline locally with distilabel")
|
| 322 |
+
|
| 323 |
+
with gr.Accordion("Run this pipeline using distilabel", open=False):
|
| 324 |
+
pipeline_code = gr.Code(
|
| 325 |
+
value=generate_pipeline_code(
|
| 326 |
+
system_prompt.value, num_turns.value, num_rows.value
|
| 327 |
+
),
|
| 328 |
+
language="python",
|
| 329 |
+
label="Distilabel Pipeline Code",
|
| 330 |
+
)
|
| 331 |
+
|
| 332 |
sample_dataset.change(
|
| 333 |
fn=lambda x: x,
|
| 334 |
inputs=[sample_dataset],
|
|
|
|
| 351 |
],
|
| 352 |
outputs=[final_dataset],
|
| 353 |
show_progress=True,
|
| 354 |
+
).then(
|
| 355 |
+
fn=upload_pipeline_code,
|
| 356 |
+
inputs=[pipeline_code, org_name, repo_name, oauth_token],
|
| 357 |
+
outputs=[],
|
| 358 |
).success(
|
| 359 |
fn=show_success_message,
|
| 360 |
inputs=[org_name, repo_name],
|
| 361 |
outputs=[success_message],
|
| 362 |
)
|
| 363 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
system_prompt.change(
|
| 365 |
fn=generate_pipeline_code,
|
| 366 |
inputs=[system_prompt, num_turns, num_rows],
|
src/distilabel_dataset_generator/utils.py
CHANGED
|
@@ -33,8 +33,7 @@ else:
|
|
| 33 |
|
| 34 |
def get_login_button():
|
| 35 |
return gr.LoginButton(
|
| 36 |
-
value="Sign in with Hugging Face!",
|
| 37 |
-
size="lg",
|
| 38 |
).activate()
|
| 39 |
|
| 40 |
|
|
|
|
| 33 |
|
| 34 |
def get_login_button():
|
| 35 |
return gr.LoginButton(
|
| 36 |
+
value="Sign in with Hugging Face!", size="lg", scale=2
|
|
|
|
| 37 |
).activate()
|
| 38 |
|
| 39 |
|