Spaces:
Runtime error
Runtime error
Update main.py
Browse files
main.py
CHANGED
|
@@ -183,43 +183,6 @@ def main():
|
|
| 183 |
)
|
| 184 |
|
| 185 |
|
| 186 |
-
intro_text = P(
|
| 187 |
-
"Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects ",
|
| 188 |
-
A("Amber-7B", href="https://huggingface.co/LLM360/Amber"),
|
| 189 |
-
", ",
|
| 190 |
-
A("Crystal-7B", href="https://huggingface.co/LLM360/CrystalCoder"),
|
| 191 |
-
", ",
|
| 192 |
-
A("K2-65B", href="https://huggingface.co/LLM360/K2"),
|
| 193 |
-
" have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.",
|
| 194 |
-
)
|
| 195 |
-
|
| 196 |
-
intro_list = P(
|
| 197 |
-
"We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:"
|
| 198 |
-
)
|
| 199 |
-
|
| 200 |
-
intro_list1 = Ol(
|
| 201 |
-
Li(
|
| 202 |
-
"Curates commonly used pretraining datasets, including all CommonCrawl",
|
| 203 |
-
style="margin-bottom: 5px",
|
| 204 |
-
),
|
| 205 |
-
Li(
|
| 206 |
-
"Employs carefully selected filters designed for each data source",
|
| 207 |
-
style="margin-bottom: 5px",
|
| 208 |
-
),
|
| 209 |
-
Li(
|
| 210 |
-
"Provides only unique data elements via globally deduplicated across all datasets",
|
| 211 |
-
style="margin-bottom: 5px",
|
| 212 |
-
),
|
| 213 |
-
Li(
|
| 214 |
-
"Retains all deduplication metadata for custom upweighting",
|
| 215 |
-
style="margin-bottom: 5px",
|
| 216 |
-
),
|
| 217 |
-
Li(
|
| 218 |
-
"Is Production ready! Download here [link to HF repo]",
|
| 219 |
-
style="margin-bottom: 5px",
|
| 220 |
-
),
|
| 221 |
-
)
|
| 222 |
-
|
| 223 |
|
| 224 |
dataset_comparison1 = pd.DataFrame(
|
| 225 |
{
|
|
|
|
| 183 |
)
|
| 184 |
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
dataset_comparison1 = pd.DataFrame(
|
| 188 |
{
|