Commit
·
a13f86c
1
Parent(s):
291ad35
fix: stop sequences for textgen and add examples to pipeline definition
Browse files
src/distilabel_dataset_generator/apps/sft.py
CHANGED
|
@@ -89,14 +89,18 @@ def generate_dataset(
|
|
| 89 |
"You can only generate a dataset with 5000 or fewer rows. Setting to 5000."
|
| 90 |
)
|
| 91 |
|
| 92 |
-
if num_rows <
|
| 93 |
duration = 60
|
| 94 |
-
elif num_rows <
|
| 95 |
-
duration =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
elif num_rows < 1000:
|
| 97 |
-
duration =
|
| 98 |
else:
|
| 99 |
-
duration =
|
| 100 |
|
| 101 |
result_queue = multiprocessing.Queue()
|
| 102 |
p = multiprocessing.Process(
|
|
@@ -127,7 +131,7 @@ def generate_dataset(
|
|
| 127 |
repo_id=repo_id,
|
| 128 |
private=private,
|
| 129 |
include_script=False,
|
| 130 |
-
token=oauth_token
|
| 131 |
)
|
| 132 |
|
| 133 |
# If not pushing to hub generate the dataset directly
|
|
|
|
| 89 |
"You can only generate a dataset with 5000 or fewer rows. Setting to 5000."
|
| 90 |
)
|
| 91 |
|
| 92 |
+
if num_rows < 10:
|
| 93 |
duration = 60
|
| 94 |
+
elif num_rows < 30:
|
| 95 |
+
duration = 120
|
| 96 |
+
elif num_rows < 100:
|
| 97 |
+
duration = 240
|
| 98 |
+
elif num_rows < 300:
|
| 99 |
+
duration = 600
|
| 100 |
elif num_rows < 1000:
|
| 101 |
+
duration = 1200
|
| 102 |
else:
|
| 103 |
+
duration = 2400
|
| 104 |
|
| 105 |
result_queue = multiprocessing.Queue()
|
| 106 |
p = multiprocessing.Process(
|
|
|
|
| 131 |
repo_id=repo_id,
|
| 132 |
private=private,
|
| 133 |
include_script=False,
|
| 134 |
+
token=oauth_token,
|
| 135 |
)
|
| 136 |
|
| 137 |
# If not pushing to hub generate the dataset directly
|
src/distilabel_dataset_generator/pipelines/sft.py
CHANGED
|
@@ -227,7 +227,6 @@ def get_prompt_generation_step():
|
|
| 227 |
"temperature": 0.8,
|
| 228 |
"max_new_tokens": 2048,
|
| 229 |
"do_sample": True,
|
| 230 |
-
"stop_sequences": _STOP_SEQUENCES,
|
| 231 |
},
|
| 232 |
),
|
| 233 |
use_system_prompt=True,
|
|
@@ -243,7 +242,7 @@ if __name__ == "__main__":
|
|
| 243 |
[
|
| 244 |
{
|
| 245 |
"system_prompt": PROMPT_CREATION_PROMPT,
|
| 246 |
-
"instruction":
|
| 247 |
}
|
| 248 |
]
|
| 249 |
)
|
|
|
|
| 227 |
"temperature": 0.8,
|
| 228 |
"max_new_tokens": 2048,
|
| 229 |
"do_sample": True,
|
|
|
|
| 230 |
},
|
| 231 |
),
|
| 232 |
use_system_prompt=True,
|
|
|
|
| 242 |
[
|
| 243 |
{
|
| 244 |
"system_prompt": PROMPT_CREATION_PROMPT,
|
| 245 |
+
"instruction": DEFAULT_DATASET_DESCRIPTIONS[0],
|
| 246 |
}
|
| 247 |
]
|
| 248 |
)
|