omwdataset

Runtime error

App Files Files Community

victormiller committed on Oct 3, 2024

Commit

8061116

verified ·

1 Parent(s): ec2b3ce

Update main.py

Browse files

Files changed (1) hide show

main.py +162 -0

main.py CHANGED Viewed

@@ -175,6 +175,167 @@ def main():
     )
 dataset_comparison1 = pd.DataFrame(
     {
         "Dataset": [
@@ -474,6 +635,7 @@ def intro():
             H3(
                 "TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
             ),
             table_div_1,
             table_div_2,
             P(

     )
+# Source-by-source comparison of TxT360 against other pretraining datasets:
+# one row per data source, one column per dataset. "-" appears to mark a
+# source that a dataset does not cover.
+# NOTE(review): "checkmark" looks like a placeholder for a tick symbol;
+# confirm the intended display text before shipping.
+new_dataset_comparison1 = pd.DataFrame(
+    {
+        "Data Source": [
+            "CommonCrawl",
+            "Papers",
+            "Wikipedia",
+            "FreeLaw",
+            "DM Math",
+            "USPTO",
+            "PG-19",
+            "HackerNews",
+            "Ubuntu IRC",
+            "EuroParl",
+            "StackExchange",
+            "Code",
+        ],
+        "TxT360": [
+            "99 Snapshots",
+            "5 Sources",
+            "310+ Languages",
+            "Included",
+            "Included",
+            "Included",
+            "Included",
+            "Included",
+            "Included",
+            "Included",
+            "Included",
+            "**",
+        ],
+        "FineWeb": [
+            "96 Snapshots",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+        ],
+        "RefinedWeb": [
+            "90 Snapshots",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+        ],
+        # Column name fixed: the original read "PedPajama-V-2".
+        "RedPajama-V-2": [
+            "84 Snapshots",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+        ],
+        "C4": [
+            "1 Snapshot",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+            "-",
+        ],
+        "Dolma": [
+            "24 Snapshots",
+            "1 Source",
+            "checkmark",
+            "-",
+            "-",
+            "-",
+            "Included",
+            "-",
+            "-",
+            "-",
+            "-",
+            "Included",
+        ],
+        "RedPajama-V-1": [
+            "5 Snapshots",
+            "1 Source",
+            "checkmark",
+            # Blank / whitespace-only cells normalised to the "-" marker used
+            # by every other column so the rendered table is consistent.
+            "-",
+            "-",
+            "-",
+            "Included",
+            "-",
+            "-",
+            "-",
+            "Included",
+            "Included",
+        ],
+        "The Pile": [
+            "0.6% of 74 Snapshots",
+            "4 Sources",
+            "English Only",
+            "Included",
+            "Included",
+            "Included",
+            "Included",
+            "Included",
+            "Included",
+            "Included",
+            "Included",
+            "Included",
+        ],
+    }
+)
+# Row striping for the comparison table: the first data row is light green,
+# and the remaining rows alternate between white (odd positions) and a pale
+# blue-grey (even positions). The column header row is not affected by this
+# styling; only data cells are coloured.
+def _row_background(position):
+    """Return the CSS background declaration for the data row at *position*."""
+    if position == 0:
+        return "background-color: #E1EEDB"
+    if position % 2 == 0:
+        return "background-color: rgb(237, 242, 251)"
+    return "background-color: white"
+
+
+styled_table = new_dataset_comparison1.style.apply(
+    # With axis=0 the function receives one column at a time and must return
+    # one CSS string per cell, so every column gets the same row colours.
+    lambda column: [_row_background(position) for position in range(len(column))],
+    axis=0,
+).hide(axis="index")  # Hide the row index
+# Render through the public Styler.to_html() API. The private _repr_html_()
+# hook returns None when pandas' "styler.render.repr" option is not "html",
+# which would hand NotStr a None instead of markup.
+table_html = styled_table.to_html()
+new_table_div_1 = Div(NotStr(table_html), style="margin: 40px;")
 dataset_comparison1 = pd.DataFrame(
     {
         "Dataset": [
             H3(
                 "TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
             ),
+            new_table_div_1,
             table_div_1,
             table_div_2,
             P(