Update curated.py
Browse files- curated.py +15 -23
curated.py
CHANGED
|
@@ -78,7 +78,7 @@ wikipedia_filter = pd.DataFrame(
|
|
| 78 |
"",
|
| 79 |
],
|
| 80 |
"Total Percentage Remaining": [
|
| 81 |
-
"
|
| 82 |
],
|
| 83 |
}
|
| 84 |
)
|
|
@@ -107,7 +107,7 @@ freelaw_filter = pd.DataFrame(
|
|
| 107 |
"",
|
| 108 |
],
|
| 109 |
"Total Percentage Remaining": [
|
| 110 |
-
"
|
| 111 |
],
|
| 112 |
}
|
| 113 |
)
|
|
@@ -136,7 +136,7 @@ dmm_filter = pd.DataFrame(
|
|
| 136 |
"",
|
| 137 |
],
|
| 138 |
"Total Percentage Remaining": [
|
| 139 |
-
"
|
| 140 |
],
|
| 141 |
}
|
| 142 |
)
|
|
@@ -166,7 +166,7 @@ uspto_filter = pd.DataFrame(
|
|
| 166 |
"",
|
| 167 |
],
|
| 168 |
"Total Percentage Remaining": [
|
| 169 |
-
"
|
| 170 |
],
|
| 171 |
}
|
| 172 |
)
|
|
@@ -195,7 +195,7 @@ pg19_filter = pd.DataFrame(
|
|
| 195 |
"",
|
| 196 |
],
|
| 197 |
"Total Percentage Remaining": [
|
| 198 |
-
"
|
| 199 |
],
|
| 200 |
}
|
| 201 |
)
|
|
@@ -225,7 +225,7 @@ hn_filter = pd.DataFrame(
|
|
| 225 |
"",
|
| 226 |
],
|
| 227 |
"Total Percentage Remaining": [
|
| 228 |
-
"
|
| 229 |
],
|
| 230 |
}
|
| 231 |
)
|
|
@@ -255,7 +255,7 @@ uirc_filter = pd.DataFrame(
|
|
| 255 |
"",
|
| 256 |
],
|
| 257 |
"Total Percentage Remaining": [
|
| 258 |
-
"
|
| 259 |
],
|
| 260 |
}
|
| 261 |
)
|
|
@@ -284,7 +284,7 @@ up_filter = pd.DataFrame(
|
|
| 284 |
"",
|
| 285 |
],
|
| 286 |
"Total Percentage Remaining": [
|
| 287 |
-
"
|
| 288 |
],
|
| 289 |
}
|
| 290 |
)
|
|
@@ -313,7 +313,7 @@ se_filter = pd.DataFrame(
|
|
| 313 |
"",
|
| 314 |
],
|
| 315 |
"Total Percentage Remaining": [
|
| 316 |
-
"
|
| 317 |
],
|
| 318 |
}
|
| 319 |
)
|
|
@@ -342,7 +342,7 @@ arx_filter = pd.DataFrame(
|
|
| 342 |
"",
|
| 343 |
],
|
| 344 |
"Total Percentage Remaining": [
|
| 345 |
-
"
|
| 346 |
],
|
| 347 |
}
|
| 348 |
)
|
|
@@ -371,7 +371,7 @@ s2o_filter = pd.DataFrame(
|
|
| 371 |
"",
|
| 372 |
],
|
| 373 |
"Total Percentage Remaining": [
|
| 374 |
-
"
|
| 375 |
],
|
| 376 |
}
|
| 377 |
)
|
|
@@ -400,7 +400,7 @@ med_filter = pd.DataFrame(
|
|
| 400 |
"",
|
| 401 |
],
|
| 402 |
"Total Percentage Remaining": [
|
| 403 |
-
"
|
| 404 |
],
|
| 405 |
}
|
| 406 |
)
|
|
@@ -429,7 +429,7 @@ phil_filter = pd.DataFrame(
|
|
| 429 |
"",
|
| 430 |
],
|
| 431 |
"Total Percentage Remaining": [
|
| 432 |
-
"
|
| 433 |
],
|
| 434 |
}
|
| 435 |
)
|
|
@@ -445,8 +445,8 @@ filtering_process = Div(
|
|
| 445 |
H3("Wikipedia"),
|
| 446 |
H4("Download and Extraction"),
|
| 447 |
Ol(
|
| 448 |
-
Li("
|
| 449 |
-
Li("Data is originally in parqet format so we
|
| 450 |
),
|
| 451 |
H4("Filtering"),
|
| 452 |
Ol(
|
|
@@ -456,10 +456,6 @@ filtering_process = Div(
|
|
| 456 |
Ol(
|
| 457 |
Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
|
| 458 |
),
|
| 459 |
-
H4("Global Deduplication Process"),
|
| 460 |
-
Ol(
|
| 461 |
-
Li("After local dedup, remaining wikipedia was deduped again with all the datasets combined"),
|
| 462 |
-
),
|
| 463 |
table_div_wikipedia,
|
| 464 |
|
| 465 |
),
|
|
@@ -485,10 +481,6 @@ filtering_process = Div(
|
|
| 485 |
Ol(
|
| 486 |
Li("Local dedup was done with all papers combined."),
|
| 487 |
),
|
| 488 |
-
H4("Global Deduplication Process"),
|
| 489 |
-
Ol(
|
| 490 |
-
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
|
| 491 |
-
),
|
| 492 |
table_div_arx,
|
| 493 |
),
|
| 494 |
Section(
|
|
|
|
| 78 |
"",
|
| 79 |
],
|
| 80 |
"Total Percentage Remaining": [
|
| 81 |
+
"",
|
| 82 |
],
|
| 83 |
}
|
| 84 |
)
|
|
|
|
| 107 |
"",
|
| 108 |
],
|
| 109 |
"Total Percentage Remaining": [
|
| 110 |
+
"%",
|
| 111 |
],
|
| 112 |
}
|
| 113 |
)
|
|
|
|
| 136 |
"",
|
| 137 |
],
|
| 138 |
"Total Percentage Remaining": [
|
| 139 |
+
"%",
|
| 140 |
],
|
| 141 |
}
|
| 142 |
)
|
|
|
|
| 166 |
"",
|
| 167 |
],
|
| 168 |
"Total Percentage Remaining": [
|
| 169 |
+
"%",
|
| 170 |
],
|
| 171 |
}
|
| 172 |
)
|
|
|
|
| 195 |
"",
|
| 196 |
],
|
| 197 |
"Total Percentage Remaining": [
|
| 198 |
+
"%",
|
| 199 |
],
|
| 200 |
}
|
| 201 |
)
|
|
|
|
| 225 |
"",
|
| 226 |
],
|
| 227 |
"Total Percentage Remaining": [
|
| 228 |
+
"%",
|
| 229 |
],
|
| 230 |
}
|
| 231 |
)
|
|
|
|
| 255 |
"",
|
| 256 |
],
|
| 257 |
"Total Percentage Remaining": [
|
| 258 |
+
"%",
|
| 259 |
],
|
| 260 |
}
|
| 261 |
)
|
|
|
|
| 284 |
"",
|
| 285 |
],
|
| 286 |
"Total Percentage Remaining": [
|
| 287 |
+
"%",
|
| 288 |
],
|
| 289 |
}
|
| 290 |
)
|
|
|
|
| 313 |
"",
|
| 314 |
],
|
| 315 |
"Total Percentage Remaining": [
|
| 316 |
+
"%",
|
| 317 |
],
|
| 318 |
}
|
| 319 |
)
|
|
|
|
| 342 |
"",
|
| 343 |
],
|
| 344 |
"Total Percentage Remaining": [
|
| 345 |
+
"%",
|
| 346 |
],
|
| 347 |
}
|
| 348 |
)
|
|
|
|
| 371 |
"",
|
| 372 |
],
|
| 373 |
"Total Percentage Remaining": [
|
| 374 |
+
"%",
|
| 375 |
],
|
| 376 |
}
|
| 377 |
)
|
|
|
|
| 400 |
"",
|
| 401 |
],
|
| 402 |
"Total Percentage Remaining": [
|
| 403 |
+
"%",
|
| 404 |
],
|
| 405 |
}
|
| 406 |
)
|
|
|
|
| 429 |
"",
|
| 430 |
],
|
| 431 |
"Total Percentage Remaining": [
|
| 432 |
+
"%",
|
| 433 |
],
|
| 434 |
}
|
| 435 |
)
|
|
|
|
| 445 |
H3("Wikipedia"),
|
| 446 |
H4("Download and Extraction"),
|
| 447 |
Ol(
|
| 448 |
+
Li("The Wikimedia dataset was downloaded from the official snapshot on Huggingface", A("https://huggingface.co/datasets/wikimedia/wikipedia/tree/main", href="https://huggingface.co/datasets/wikimedia/wikipedia/tree/main")),
|
| 449 |
+
Li("Data is originally in parqet format so we used the", D_code("huggingface dataset.to_json"), " function to convert the data to the jsonl format"),
|
| 450 |
),
|
| 451 |
H4("Filtering"),
|
| 452 |
Ol(
|
|
|
|
| 456 |
Ol(
|
| 457 |
Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
|
| 458 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
table_div_wikipedia,
|
| 460 |
|
| 461 |
),
|
|
|
|
| 481 |
Ol(
|
| 482 |
Li("Local dedup was done with all papers combined."),
|
| 483 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 484 |
table_div_arx,
|
| 485 |
),
|
| 486 |
Section(
|