Update curated.py
Browse files- curated.py +4 -206
curated.py
CHANGED
|
@@ -645,101 +645,6 @@ def get_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo")
|
|
| 645 |
)
|
| 646 |
|
| 647 |
|
| 648 |
-
def get_chart_28168342():
    """Build a horizontal funnel figure with one trace per curated source.

    Each trace plots nine values against the nine stage labels below,
    annotated inside the bars with the value and its percent of total.

    NOTE(review): the numbers look like counts still present after each
    stage (they never increase from left to right) -- confirm with the
    data owner. Every source lists 20 for "Local dedup", which looks like
    a placeholder -- confirm.

    Returns:
        The populated ``go.Figure``.
    """
    stage_labels = [
        "Download",
        "Language",
        "Min word count",
        "Title Abstract",
        "Majority language",
        "Paragraph count",
        "Frequency",
        "Unigram log probability",
        "Local dedup",
    ]

    # Insertion order of this mapping is the order traces are added.
    counts_by_source = {
        "Wikipedia": [61614907, 61614907, 60468491, 60468491, 60468491, 60468491, 60468491, 60468491, 20],
        "Freelaw": [75971288, 73690766, 68171834, 68171834, 68171834, 68171834, 68171834, 68123174, 20],
        "DM Maths": [112559888, 112559888, 112559888, 112559888, 112559888, 112559888, 112559888, 112559888, 20],
        "USPTO": [6880276, 6878964, 6749922, 6749922, 6749922, 6749922, 6749922, 6749389, 20],
        "PG19": [28752, 28683, 28682, 28682, 28682, 28682, 28682, 28632, 20],
        "Hackernews": [2064931, 2010802, 2010488, 2010488, 2010488, 2010488, 2010488, 2003636, 20],
        "Ubuntu IRC": [37966, 23501, 23468, 23468, 23468, 23468, 23468, 23205, 20],
        "Europarl": [69814, 69814, 69814, 69814, 69814, 69814, 69814, 69814, 20],
        "StackExchange": [23246548, 23246548, 23246352, 23246352, 23246352, 23246352, 23246352, 23246352, 20],
        "Arxiv": [1911867, 1869441, 1763840, 1763840, 1763840, 1763840, 1763840, 1762661, 20],
        "S2ORC": [12963563, 12963563, 12963563, 10731113, 9455620, 9306816, 8055147, 8055147, 20],
        "S2ORC Abstract": [102324176, 83867601, 82889293, 82889293, 82889293, 82889293, 82889293, 82777912, 20],
        "PubMed Central": [5230932, 4830486, 4768310, 4768310, 4768310, 4768310, 4768310, 4767474, 20],
        "PubMed Central Abstract": [25787474, 25784374, 25747955, 25747955, 25747955, 25747955, 25747955, 25746724, 20],
        "PhilPapers": [49389, 39175, 39175, 39175, 39175, 39175, 39175, 39128, 20],
    }

    funnel_fig = go.Figure()
    for source_name, stage_counts in counts_by_source.items():
        source_trace = go.Funnel(
            name=source_name,
            orientation="h",
            y=stage_labels,
            x=stage_counts,
            textinfo="value+percent total",
            textposition="inside",
        )
        funnel_fig.add_trace(source_trace)

    # Fixed height and a fully transparent plot background.
    funnel_fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
    return funnel_fig
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
def get_chart_new():
    """Build a horizontal funnel figure with one trace per curated source.

    Each trace plots nine values against the nine stage labels below,
    annotated inside the bars with the value and its percent of total.

    NOTE(review): apart from the first ("Download") entry, the numbers
    look like per-stage counts rather than running totals -- presumably
    items removed at each stage; confirm with the data owner. Every
    source lists 20 for "Local dedup", which looks like a placeholder --
    confirm.

    Returns:
        The populated ``go.Figure``.
    """
    stage_labels = [
        "Download",
        "Language",
        "Min word count",
        "Title Abstract",
        "Majority language",
        "Paragraph count",
        "Frequency",
        "Unigram log probability",
        "Local dedup",
    ]

    # Insertion order of this mapping is the order traces are added.
    counts_by_source = {
        "Wikipedia": [61614907, 0, 1146416, 0, 0, 0, 0, 0, 20],
        "Freelaw": [75971288, 2280522, 5518932, 0, 0, 0, 0, 48660, 20],
        "DM Maths": [112559888, 0, 0, 0, 0, 0, 0, 0, 20],
        "USPTO": [6880276, 1312, 129042, 0, 0, 0, 0, 533, 20],
        "PG19": [28752, 69, 1, 0, 0, 0, 0, 50, 20],
        "Hackernews": [2064931, 54129, 314, 0, 0, 0, 0, 6852, 20],
        "Ubuntu IRC": [37966, 14465, 33, 0, 0, 0, 0, 263, 20],
        "Europarl": [69814, 0, 0, 0, 0, 0, 0, 0, 20],
        "StackExchange": [23246548, 0, 196, 0, 0, 0, 0, 0, 20],
        "Arxiv": [1911867, 42426, 105601, 0, 0, 0, 0, 1179, 20],
        "S2ORC": [12963563, 0, 0, 2232450, 1275493, 148804, 1251669, 0, 20],
        "S2ORC Abstract": [102324176, 18456575, 978308, 0, 0, 0, 0, 111381, 20],
        "PubMed Central": [5230932, 400446, 62176, 0, 0, 0, 0, 836, 20],
        "PubMed Central Abstract": [25787474, 3100, 36419, 0, 0, 0, 0, 1231, 20],
        "PhilPapers": [49389, 10214, 0, 0, 0, 0, 0, 47, 20],
    }

    funnel_fig = go.Figure()
    for source_name, stage_counts in counts_by_source.items():
        source_trace = go.Funnel(
            name=source_name,
            orientation="h",
            y=stage_labels,
            x=stage_counts,
            textinfo="value+percent total",
            textposition="inside",
        )
        funnel_fig.add_trace(source_trace)

    # Fixed height and a fully transparent plot background.
    funnel_fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
    return funnel_fig
|
| 742 |
-
|
| 743 |
def update(target: str, request):
|
| 744 |
params = request.query_params
|
| 745 |
if data_source := params.get(f"data_source_{target}"):
|
|
@@ -749,113 +654,6 @@ def update(target: str, request):
|
|
| 749 |
return get_data(
|
| 750 |
params.get(f"data_source_{target}"), doc_id, target)
|
| 751 |
|
| 752 |
-
|
| 753 |
-
# Per-dataset table: one row per dataset, one column per measurement.
# NOTE(review): 'Language Filter' and 'Min Word Count' look like counts of
# removed lines, while 'Unigram log probability' looks like lines remaining
# -- confirm before reading the stacked heights as a single quantity.
data = {
    'Dataset': [
        'Wikipedia', 'Freelaw', 'DM Maths', 'USPTO', 'PG19',
        'Hackernews', 'Ubuntu IRC', 'Europarl', 'StackExchange', 'Arxiv',
        'S2ORC', 'S2ORC Abstract', 'Pubmed Central', 'Pubmed Abstract', 'Phil Papers',
    ],
    'Downloaded Lines': [
        61614907, 75971288, 112559888, 6880276, 28752,
        2064931, 37966, 69814, 23246548, 1911867,
        12963563, 102324176, 5230932, 25787474, 49389,
    ],
    'Language Filter': [
        0, 2280522, 0, 1312, 69,
        54129, 14465, 0, 0, 42426,
        0, 18456575, 400446, 3100, 10214,
    ],
    'Min Word Count': [
        1146416, 5518932, 0, 129042, 1,
        314, 33, 0, 196, 105601,
        0, 978308, 62176, 36419, 0,
    ],
    'Unigram log probability': [
        60468491, 68171834, 112559888, 6749922, 28682,
        2010488, 23468, 69814, 23246352, 1763840,
        12963563, 82889293, 4768310, 25747955, 39175,
    ],
    'Total Lines Remaining': [
        60468491, 68123174, 112559888, 6749389, 28632,
        2003636, 23205, 69814, 23246352, 1762661,
        12963563, 82777912, 4767474, 25746724, 39128,
    ],
}
df = pd.DataFrame(data)

# One bar trace per (legend label, column) pair, added in this order.
# 'Downloaded Lines' is part of the table but is not plotted.
fig = go.Figure()
fig.add_traces([
    go.Bar(name=legend_label, x=df['Dataset'], y=df[column_name])
    for legend_label, column_name in (
        ('Language Filter', 'Language Filter'),
        ('Min Word Count Filter', 'Min Word Count'),
        ('Unigram log probability Filter', 'Unigram log probability'),
        ('Total Lines Remaining', 'Total Lines Remaining'),
    )
])

# Stack the traces on top of each other and label the chart.
fig.update_layout(
    title='Stacked Bar Chart of Line Reductions by Dataset',
    xaxis_title='Dataset',
    yaxis_title='Number of Lines',
    legend_title='Filters',
    barmode='stack',
    width=1000,
    height=600,
)

# Keep a handle on this figure; `fig` is rebound by later code.
stacked_bar = fig
|
| 810 |
-
|
| 811 |
-
# Transposed view of the per-dataset numbers: one row per entry in 'Filter',
# one column per dataset.
filter_data = {
    'Filter': ['Language Filter', 'Min Word Count', 'Unigram log probability', 'Total Lines Remaining'],
    'Wikipedia': [0, 1146416, 60468491, 60468491],
    'Freelaw': [2280522, 5518932, 68171834, 68123174],
    'DM Maths': [0, 0, 112559888, 112559888],
    'USPTO': [1312, 129042, 6749922, 6749389],
    'PG19': [69, 1, 28682, 28632],
    'Hackernews': [54129, 314, 2010488, 2003636],
    'Ubuntu IRC': [14465, 33, 23468, 23205],
    'Europarl': [0, 0, 69814, 69814],
    'StackExchange': [0, 196, 23246352, 23246352],
    'Arxiv': [42426, 105601, 1763840, 1762661],
    'S2ORC': [0, 0, 12963563, 12963563],
    'S2ORC Abstract': [18456575, 978308, 82889293, 82777912],
    'Pubmed Central': [400446, 62176, 4768310, 4767474],
    'Pubmed Abstract': [3100, 36419, 25747955, 25746724],
    'Phil Papers': [10214, 0, 39175, 39128],
}
filter_df = pd.DataFrame(filter_data)

# One bar trace per dataset column (every column except 'Filter'), with the
# 'Filter' labels along the x axis.
fig = go.Figure()
for dataset in filter_df.columns.drop('Filter'):
    fig.add_trace(
        go.Bar(
            x=filter_df['Filter'],
            y=filter_df[dataset],
            name=dataset,
        )
    )

# Stack the dataset traces within each filter bar and label the chart.
fig.update_layout(
    title='Stacked Bar Chart of Filters for Each Dataset',
    xaxis_title='Filter',
    yaxis_title='Number of Lines',
    legend_title='Dataset',
    barmode='stack',
    width=1000,
    height=600,
)

# Keep a handle on this figure; `fig` is rebound by later code.
diff_stacked_bar = fig
|
| 858 |
-
|
| 859 |
# Data for the stacked bar chart
|
| 860 |
data = {
|
| 861 |
'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
|
|
@@ -1037,10 +835,10 @@ def curated(request):
|
|
| 1037 |
H2("Curated Sources Defined"),
|
| 1038 |
table_desc,
|
| 1039 |
data_preprocessing_div,
|
| 1040 |
-
|
| 1041 |
-
|
| 1042 |
-
|
| 1043 |
-
|
| 1044 |
plotly2fasthtml(diff2_stacked_bar),
|
| 1045 |
H2("Curated Sources Processing"),
|
| 1046 |
filtering_process,
|
|
|
|
| 645 |
)
|
| 646 |
|
| 647 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 648 |
def update(target: str, request):
|
| 649 |
params = request.query_params
|
| 650 |
if data_source := params.get(f"data_source_{target}"):
|
|
|
|
| 654 |
return get_data(
|
| 655 |
params.get(f"data_source_{target}"), doc_id, target)
|
| 656 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 657 |
# Data for the stacked bar chart
|
| 658 |
data = {
|
| 659 |
'Filter': ['Downloaded Lines', 'Language Filter', 'Min Word Count', 'Unigram Log Probability'],
|
|
|
|
| 835 |
H2("Curated Sources Defined"),
|
| 836 |
table_desc,
|
| 837 |
data_preprocessing_div,
|
| 838 |
+
# plotly2fasthtml(get_chart_28168342()),
|
| 839 |
+
# plotly2fasthtml(get_chart_new()),
|
| 840 |
+
# plotly2fasthtml(stacked_bar),
|
| 841 |
+
# plotly2fasthtml(diff_stacked_bar),
|
| 842 |
plotly2fasthtml(diff2_stacked_bar),
|
| 843 |
H2("Curated Sources Processing"),
|
| 844 |
filtering_process,
|