Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	Update curated.py
Browse files- curated.py +60 -0
 
    	
        curated.py
    CHANGED
    
    | 
         @@ -436,6 +436,35 @@ s2o_filter = pd.DataFrame( 
     | 
|
| 436 | 
         
             
            table_html_s2o = s2o_filter.to_html(index=False, border=0)
         
     | 
| 437 | 
         
             
            table_div_s2o = Div(NotStr(table_html_s2o))
         
     | 
| 438 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 439 | 
         
             
            med_filter = pd.DataFrame(
         
     | 
| 440 | 
         
             
                {
         
     | 
| 441 | 
         
             
                    "Dataset": [
         
     | 
| 
         @@ -465,6 +494,35 @@ med_filter = pd.DataFrame( 
     | 
|
| 465 | 
         
             
            table_html_med = med_filter.to_html(index=False, border=0)
         
     | 
| 466 | 
         
             
            table_div_med = Div(NotStr(table_html_med))
         
     | 
| 467 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 468 | 
         
             
            phil_filter = pd.DataFrame(
         
     | 
| 469 | 
         
             
                {
         
     | 
| 470 | 
         
             
                    "Dataset": [
         
     | 
| 
         @@ -855,6 +913,7 @@ filtering_process = Div( 
     | 
|
| 855 | 
         
             
                                style="margin-bottom: -3px",
         
     | 
| 856 | 
         
             
                            ),
         
     | 
| 857 | 
         
             
                        ),
         
     | 
| 
         | 
|
| 858 | 
         
             
                        #Details(
         
     | 
| 859 | 
         
             
                        #    Summary("S2ORC Abstract Filtering Examples "),
         
     | 
| 860 | 
         
             
                       #     Div(
         
     | 
| 
         @@ -914,6 +973,7 @@ filtering_process = Div( 
     | 
|
| 914 | 
         
             
                            ),
         
     | 
| 915 | 
         
             
                        ),
         
     | 
| 916 | 
         
             
                        table_div_med,
         
     | 
| 
         | 
|
| 917 | 
         
             
                        Details(
         
     | 
| 918 | 
         
             
                            Summary("PubMed Filtering Examples"),
         
     | 
| 919 | 
         
             
                            Div(
         
     | 
| 
         | 
|
| 436 | 
         
             
            table_html_s2o = s2o_filter.to_html(index=False, border=0)
         
     | 
| 437 | 
         
             
            table_div_s2o = Div(NotStr(table_html_s2o))
         
     | 
| 438 | 
         | 
| 439 | 
         
            +
            s2oa_filter = pd.DataFrame(
         
     | 
| 440 | 
         
            +
                {
         
     | 
| 441 | 
         
            +
                    "Dataset": [
         
     | 
| 442 | 
         
            +
                        "S2ORC Abstract",
         
     | 
| 443 | 
         
            +
                    ],
         
     | 
| 444 | 
         
            +
                    "Lines Downloaded": [
         
     | 
| 445 | 
         
            +
                        "102324176",
         
     | 
| 446 | 
         
            +
                    ],
         
     | 
| 447 | 
         
            +
                    "Percent Removed After Language Filter": [
         
     | 
| 448 | 
         
            +
                        "18.04%",
         
     | 
| 449 | 
         
            +
                    ],
         
     | 
| 450 | 
         
            +
                    "Percent Removed After Min Word Count Filter": [
         
     | 
| 451 | 
         
            +
                        "1.17%",
         
     | 
| 452 | 
         
            +
                    ],
         
     | 
| 453 | 
         
            +
                    "Percent Removed After Unigram Probability Filter": [
         
     | 
| 454 | 
         
            +
                        "0.00%",
         
     | 
| 455 | 
         
            +
                    ],
         
     | 
| 456 | 
         
            +
                    "Percent Removed After Local Dedup": [
         
     | 
| 457 | 
         
            +
                        "0.13%",
         
     | 
| 458 | 
         
            +
                    ],
         
     | 
| 459 | 
         
            +
                    "Total Percentage Remaining": [
         
     | 
| 460 | 
         
            +
                        "80.66%",
         
     | 
| 461 | 
         
            +
                    ],
         
     | 
| 462 | 
         
            +
                }
         
     | 
| 463 | 
         
            +
            )
         
     | 
| 464 | 
         
            +
             
     | 
| 465 | 
         
            +
            table_html_s2oa = s2oa_filter.to_html(index=False, border=0)
         
     | 
| 466 | 
         
            +
            table_div_s2oa = Div(NotStr(table_html_s2oa))
         
     | 
| 467 | 
         
            +
             
     | 
| 468 | 
         
             
            med_filter = pd.DataFrame(
         
     | 
| 469 | 
         
             
                {
         
     | 
| 470 | 
         
             
                    "Dataset": [
         
     | 
| 
         | 
|
| 494 | 
         
             
            table_html_med = med_filter.to_html(index=False, border=0)
         
     | 
| 495 | 
         
             
            table_div_med = Div(NotStr(table_html_med))
         
     | 
| 496 | 
         | 
| 497 | 
         
            +
            pma_filter = pd.DataFrame(
         
     | 
| 498 | 
         
            +
                {
         
     | 
| 499 | 
         
            +
                    "Dataset": [
         
     | 
| 500 | 
         
            +
                        "PubMed - Abstract",
         
     | 
| 501 | 
         
            +
                    ],
         
     | 
| 502 | 
         
            +
                    "Lines Downloaded": [
         
     | 
| 503 | 
         
            +
                        "25787474",
         
     | 
| 504 | 
         
            +
                    ],
         
     | 
| 505 | 
         
            +
                    "Percent Removed After Language Filter": [
         
     | 
| 506 | 
         
            +
                        "0.01%",
         
     | 
| 507 | 
         
            +
                    ],
         
     | 
| 508 | 
         
            +
                    "Percent Removed After Min Word Count Filter": [
         
     | 
| 509 | 
         
            +
                        "0.14%",
         
     | 
| 510 | 
         
            +
                    ],
         
     | 
| 511 | 
         
            +
                    "Percent Removed After Unigram Probability Filter": [
         
     | 
| 512 | 
         
            +
                        "0.00%",
         
     | 
| 513 | 
         
            +
                    ],
         
     | 
| 514 | 
         
            +
                    "Percent Removed After Local Dedup": [
         
     | 
| 515 | 
         
            +
                        "0.00%",
         
     | 
| 516 | 
         
            +
                    ],
         
     | 
| 517 | 
         
            +
                    "Total Percentage Remaining": [
         
     | 
| 518 | 
         
            +
                        "98.85%",
         
     | 
| 519 | 
         
            +
                    ],
         
     | 
| 520 | 
         
            +
                }
         
     | 
| 521 | 
         
            +
            )
         
     | 
| 522 | 
         
            +
             
     | 
| 523 | 
         
            +
            table_html_pma = pma_filter.to_html(index=False, border=0)
         
     | 
| 524 | 
         
            +
            table_div_pma = Div(NotStr(table_html_pma))
         
     | 
| 525 | 
         
            +
             
     | 
| 526 | 
         
             
            phil_filter = pd.DataFrame(
         
     | 
| 527 | 
         
             
                {
         
     | 
| 528 | 
         
             
                    "Dataset": [
         
     | 
| 
         | 
|
| 913 | 
         
             
                                style="margin-bottom: -3px",
         
     | 
| 914 | 
         
             
                            ),
         
     | 
| 915 | 
         
             
                        ),
         
     | 
| 916 | 
         
            +
                        table_div_s2oa,
         
     | 
| 917 | 
         
             
                        #Details(
         
     | 
| 918 | 
         
             
                        #    Summary("S2ORC Abstract Filtering Examples "),
         
     | 
| 919 | 
         
             
                       #     Div(
         
     | 
| 
         | 
|
| 973 | 
         
             
                            ),
         
     | 
| 974 | 
         
             
                        ),
         
     | 
| 975 | 
         
             
                        table_div_med,
         
     | 
| 976 | 
         
            +
                        table_div_pma,
         
     | 
| 977 | 
         
             
                        Details(
         
     | 
| 978 | 
         
             
                            Summary("PubMed Filtering Examples"),
         
     | 
| 979 | 
         
             
                            Div(
         
     |