Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	Update web.py
Browse files
    	
        web.py
    CHANGED
    
    | @@ -450,10 +450,11 @@ def web_data(): | |
| 450 | 
             
                    ),
         | 
| 451 |  | 
| 452 | 
             
                    #DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
         | 
| 453 | 
            -
             | 
| 454 | 
            -
                    
         | 
| 455 | 
            -
             | 
| 456 | 
            -
             | 
|  | |
| 457 |  | 
| 458 | 
             
                    H4("1.3 URL Filtering"),
         | 
| 459 | 
             
                    P("""
         | 
| @@ -466,12 +467,18 @@ def web_data(): | |
| 466 | 
             
                    articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
         | 
| 467 | 
             
                    4.6M domain names in the UT1 blocklist. 24 URL domains were detected with more than 4k matches, which are shown below.
         | 
| 468 | 
             
                    """),
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 469 |  | 
| 470 | 
            -
                    DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
         | 
| 471 | 
             
                    P("""
         | 
| 472 | 
             
                    We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
         | 
| 473 | 
             
                    """),
         | 
|  | |
| 474 | 
             
                    DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
         | 
|  | |
| 475 | 
             
                    DV(
         | 
| 476 | 
             
                        "data/bad_url_doc.jsonl",
         | 
| 477 | 
             
                        3,
         | 
|  | |
| 450 | 
             
                    ),
         | 
| 451 |  | 
| 452 | 
             
                    #DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
         | 
| 453 | 
            +
             | 
| 454 | 
            +
                    Details(
         | 
| 455 | 
            +
                        Summary("English Documents Scoring Lower than 0.65"),
         | 
| 456 | 
            +
                        DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
         | 
| 457 | 
            +
                    ),
         | 
| 458 |  | 
| 459 | 
             
                    H4("1.3 URL Filtering"),
         | 
| 460 | 
             
                    P("""
         | 
|  | |
| 467 | 
             
                    articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
         | 
| 468 | 
             
                    4.6M domain names in the UT1 blocklist. 24 URL domains were detected with more than 4k matches, which are shown below.
         | 
| 469 | 
             
                    """),
         | 
| 470 | 
            +
             | 
| 471 | 
            +
                    Details(
         | 
| 472 | 
            +
                        Summary("24 URL domains with more than 4k matches"),
         | 
| 473 | 
            +
                        DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
         | 
| 474 | 
            +
                    ),
         | 
| 475 |  | 
|  | |
| 476 | 
             
                    P("""
         | 
| 477 | 
             
                    We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
         | 
| 478 | 
             
                    """),
         | 
| 479 | 
            +
                    
         | 
| 480 | 
             
                    DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
         | 
| 481 | 
            +
                    
         | 
| 482 | 
             
                    DV(
         | 
| 483 | 
             
                        "data/bad_url_doc.jsonl",
         | 
| 484 | 
             
                        3,
         | 
