Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Update overview.py
Browse files- overview.py +25 -4
 
    	
        overview.py
    CHANGED
    
    | 
         @@ -11,7 +11,7 @@ import web 
     | 
|
| 11 | 
         
             
            import common
         
     | 
| 12 | 
         
             
            import results
         
     | 
| 13 | 
         | 
| 14 | 
         
            -
             
     | 
| 15 | 
         
             
                    {
         
     | 
| 16 | 
         
             
                        "Dataset": [
         
     | 
| 17 | 
         
             
                            "TxT360",
         
     | 
| 
         @@ -83,6 +83,26 @@ dataset_comparison = pd.DataFrame( 
     | 
|
| 83 | 
         
             
                            "-",
         
     | 
| 84 | 
         
             
                            "Included",
         
     | 
| 85 | 
         
             
                        ],
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 86 | 
         
             
                        "PG-19": [
         
     | 
| 87 | 
         
             
                            "Included",
         
     | 
| 88 | 
         
             
                            "-",
         
     | 
| 
         @@ -146,8 +166,8 @@ dataset_comparison = pd.DataFrame( 
     | 
|
| 146 | 
         
             
                    }
         
     | 
| 147 | 
         
             
                )
         
     | 
| 148 | 
         | 
| 149 | 
         
            -
             
     | 
| 150 | 
         
            -
             
     | 
| 151 | 
         | 
| 152 | 
         
             
            dataset_sources = pd.DataFrame(
         
     | 
| 153 | 
         
             
                    {
         
     | 
| 
         @@ -259,7 +279,8 @@ both critical for effective LLM pre-training."""), 
     | 
|
| 259 | 
         
             
                        P("By integrating the extensive reach of web data with the exceptional quality of curated sources, TxT360 is crafted to meet and surpass the rigorous standards required for state-of-the-art LLM pre-training."),
         
     | 
| 260 | 
         
             
                        H3("TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered."),
         
     | 
| 261 | 
         
             
                        P("Table 1: The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
         
     | 
| 262 | 
         
            -
                         
     | 
| 
         | 
|
| 263 | 
         
             
                        P("Table 2: Basic TxT360 Statistics."),
         
     | 
| 264 | 
         
             
                        table_div1,
         
     | 
| 265 | 
         
             
                    ),
         
     | 
| 
         | 
|
| 11 | 
         
             
            import common
         
     | 
| 12 | 
         
             
            import results
         
     | 
| 13 | 
         | 
| 14 | 
         
            +
            dataset_comparison1 = pd.DataFrame(
         
     | 
| 15 | 
         
             
                    {
         
     | 
| 16 | 
         
             
                        "Dataset": [
         
     | 
| 17 | 
         
             
                            "TxT360",
         
     | 
| 
         | 
|
| 83 | 
         
             
                            "-",
         
     | 
| 84 | 
         
             
                            "Included",
         
     | 
| 85 | 
         
             
                        ],
         
     | 
| 86 | 
         
            +
                        
         
     | 
| 87 | 
         
            +
                    }
         
     | 
| 88 | 
         
            +
                )
         
     | 
| 89 | 
         
            +
             
     | 
| 90 | 
         
            +
            table_html = dataset_comparison1.to_html(index=False, border=0)
         
     | 
| 91 | 
         
            +
            table_div_1 = Div(NotStr(table_html), style="margin: 40px;")
         
     | 
| 92 | 
         
            +
             
     | 
| 93 | 
         
            +
            dataset_comparison2 = pd.DataFrame(
         
     | 
| 94 | 
         
            +
                    {
         
     | 
| 95 | 
         
            +
                        "Dataset": [
         
     | 
| 96 | 
         
            +
                            "TxT360",
         
     | 
| 97 | 
         
            +
                            "FineWeb",
         
     | 
| 98 | 
         
            +
                            "RefinedWeb",
         
     | 
| 99 | 
         
            +
                            "RedPajama-v2",
         
     | 
| 100 | 
         
            +
                            "C4",
         
     | 
| 101 | 
         
            +
                            "Dolma",
         
     | 
| 102 | 
         
            +
                            "RedPajama-v1",
         
     | 
| 103 | 
         
            +
                            "The Pile",
         
     | 
| 104 | 
         
            +
                        ],        
         
     | 
| 105 | 
         
            +
                        
         
     | 
| 106 | 
         
             
                        "PG-19": [
         
     | 
| 107 | 
         
             
                            "Included",
         
     | 
| 108 | 
         
             
                            "-",
         
     | 
| 
         | 
|
| 166 | 
         
             
                    }
         
     | 
| 167 | 
         
             
                )
         
     | 
| 168 | 
         | 
| 169 | 
         
            +
            table_html2 = dataset_comparison2.to_html(index=False, border=0)
         
     | 
| 170 | 
         
            +
            table_div2 = Div(NotStr(table_html2), style="margin: 40px;")
         
     | 
| 171 | 
         | 
| 172 | 
         
             
            dataset_sources = pd.DataFrame(
         
     | 
| 173 | 
         
             
                    {
         
     | 
| 
         | 
|
| 279 | 
         
             
                        P("By integrating the extensive reach of web data with the exceptional quality of curated sources, TxT360 is crafted to meet and surpass the rigorous standards required for state-of-the-art LLM pre-training."),
         
     | 
| 280 | 
         
             
                        H3("TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered."),
         
     | 
| 281 | 
         
             
                        P("Table 1: The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
         
     | 
| 282 | 
         
            +
                        table_div1,
         
     | 
| 283 | 
         
            +
                        table_div2,
         
     | 
| 284 | 
         
             
                        P("Table 2: Basic TxT360 Statistics."),
         
     | 
| 285 | 
         
             
                        table_div1,
         
     | 
| 286 | 
         
             
                    ),
         
     |