Update curated.py
Browse files- curated.py +65 -8
curated.py
CHANGED
|
@@ -514,6 +514,63 @@ freelaw_examples = Div(
|
|
| 514 |
),
|
| 515 |
)
|
| 516 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
filtering_process = Div(
|
| 518 |
Section(
|
| 519 |
H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
|
|
@@ -643,10 +700,10 @@ filtering_process = Div(
|
|
| 643 |
Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
|
| 644 |
),
|
| 645 |
table_div_phil,
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
),
|
| 651 |
),
|
| 652 |
Section(
|
|
@@ -751,10 +808,10 @@ filtering_process = Div(
|
|
| 751 |
Li("Minimum Word Count Filter: 10"),
|
| 752 |
),
|
| 753 |
table_div_se,
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
| 758 |
),
|
| 759 |
),
|
| 760 |
Section(
|
|
|
|
| 514 |
),
|
| 515 |
)
|
| 516 |
|
| 517 |
+
def get_se_data(data_source: str = "StackExchange", doc_id: int = 3, target: str = "foo"):
    """Build the raw-vs-extracted comparison view for one StackExchange sample.

    Args:
        data_source: Dataset selector. Only ``"StackExchange"`` loads the
            sample files from ``data/curated_samples``; any other value
            falls back to ten empty documents.
        doc_id: Index of the sample to display. It is coerced with ``int``
            and clamped to the range 0-9.
        target: Forwarded unchanged to ``view_data``.

    Returns:
        The result of ``view_data`` for the selected raw/extracted pair.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "StackExchange":
        # Use context managers so the sample files are closed right after
        # parsing, and read them as UTF-8 rather than the locale default.
        with open(
            "data/curated_samples/stackexchange_raw.json", encoding="utf-8"
        ) as raw_file:
            raw_sample_doc = json.load(raw_file)
        with open(
            "data/curated_samples/stackexchange_extract.json", encoding="utf-8"
        ) as extracted_file:
            extracted_sample_doc = json.load(extracted_file)
    else:
        # Unknown source: both names share one list of empty placeholders.
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    # The viewer is always labelled "StackExchange", whatever data_source was.
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        data_source="StackExchange",
        data_sources="StackExchange",
        target=target,
    )
|
| 538 |
+
|
| 539 |
+
# Bordered container holding the StackExchange sample viewer, built once at
# import time with the default sample and a freshly generated target id.
se_examples = Div(
    Div(
        get_se_data(target=gen_random_id()),
        style="border: 1px solid #ccc; padding: 20px;",
    ),
)
|
| 545 |
+
|
| 546 |
+
def get_phil_data(data_source: str = "PhilPapers", doc_id: int = 3, target: str = "foo"):
    """Build the raw-vs-extracted comparison view for one PhilPapers sample.

    Args:
        data_source: Dataset selector. Only ``"PhilPapers"`` loads the sample
            file from ``data/curated_samples``; any other value falls back
            to ten empty documents.
        doc_id: Index of the sample to display. It is coerced with ``int``
            and clamped to the range 0-9.
        target: Forwarded unchanged to ``view_data``.

    Returns:
        The result of ``view_data`` for the selected raw/extracted pair.
    """
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "PhilPapers":
        # Use a context manager so the sample file is closed right after
        # parsing, and read it as UTF-8 rather than the locale default.
        with open(
            "data/curated_samples/philpapers_raw.json", encoding="utf-8"
        ) as sample_file:
            # NOTE(review): raw and extracted views share the same parsed
            # file here; confirm whether a separate extract file was intended.
            raw_sample_doc = extracted_sample_doc = json.load(sample_file)
    else:
        # Unknown source: both names share one list of empty placeholders.
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    # The viewer is always labelled "PhilPapers", whatever data_source was.
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        data_source="PhilPapers",
        data_sources="PhilPapers",
        target=target,
    )
|
| 566 |
+
|
| 567 |
+
# Bordered container holding the PhilPapers sample viewer, built once at
# import time with the default sample and a freshly generated target id.
phil_examples = Div(
    Div(
        get_phil_data(target=gen_random_id()),
        style="border: 1px solid #ccc; padding: 20px;",
    ),
)
|
| 573 |
+
|
| 574 |
filtering_process = Div(
|
| 575 |
Section(
|
| 576 |
H3("This section contains the specific filtering steps taken for all 14 curated datasets."),
|
|
|
|
| 700 |
Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
|
| 701 |
),
|
| 702 |
table_div_phil,
|
| 703 |
+
Details(
|
| 704 |
+
Summary("Phil Papers Filtering Examples"),
|
| 705 |
+
phil_examples,
|
| 706 |
+
),
|
| 707 |
),
|
| 708 |
),
|
| 709 |
Section(
|
|
|
|
| 808 |
Li("Minimum Word Count Filter: 10"),
|
| 809 |
),
|
| 810 |
table_div_se,
|
| 811 |
+
Details(
|
| 812 |
+
Summary("StackExchange Filtering Examples"),
|
| 813 |
+
se_examples,
|
| 814 |
+
),
|
| 815 |
),
|
| 816 |
),
|
| 817 |
Section(
|