Update app.py
app.py CHANGED
@@ -66,12 +66,47 @@ def _(df_taxi):
     return
 
 
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""Let's now take this pyarrow dataframe and prepare it for insertion. We want to extract the right schema and also add a partition.""")
+    return
+
+
+@app.cell
+def _(df_taxi):
+    import pyarrow as pa
+    from pyiceberg.schema import Schema
+    from pyiceberg.types import (
+        NestedField, IntegerType, StringType, DoubleType, TimestampType
+    )
+    from pyiceberg.table.name_mapping import NameMapping, MappedField
+    from pyiceberg.io.pyarrow import pyarrow_to_schema
+
+    # Create a mapping from column names to field IDs
+    name_mapping_fields = []
+    for idx, field_name in enumerate(df_taxi.column_names, start=1):
+        name_mapping_fields.append(MappedField(field_id=idx, names=[field_name]))
+
+    # Create a name mapping
+    name_mapping = NameMapping(name_mapping_fields)
+
+    # Convert PyArrow schema to Iceberg schema
+    iceberg_schema = pyarrow_to_schema(df_taxi.schema, name_mapping)
+
+    # Now find the field ID for 'passenger_count'
+    passenger_count_field = iceberg_schema.find_field("passenger_count")
+    source_id = passenger_count_field.field_id
+
+    print(f"The source_id for 'passenger_count' is: {source_id}")
+    return
+
+
 @app.cell
 def _(IdentityTransform, PartitionField, PartitionSpec):
     spec = PartitionSpec(
         PartitionField(source_id=3, field_id=1000, name="passenger_count", transform=IdentityTransform())
     )
-    return
+    return (spec,)
 
 
 @app.cell
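A note on the hunk above: the new schema cell derives the field ID for `passenger_count` via a name mapping but only prints it, while the `PartitionSpec` cell below it still hardcodes `source_id=3`. A minimal sketch of how the two cells could be wired together instead, assuming the schema cell returns `source_id` so marimo passes it along (a suggestion, not what the commit does):

```python
@app.cell
def _(df_taxi):
    from pyiceberg.io.pyarrow import pyarrow_to_schema
    from pyiceberg.table.name_mapping import MappedField, NameMapping

    # Assign sequential field IDs to the PyArrow columns via a name mapping
    name_mapping = NameMapping(
        [
            MappedField(field_id=idx, names=[name])
            for idx, name in enumerate(df_taxi.column_names, start=1)
        ]
    )
    iceberg_schema = pyarrow_to_schema(df_taxi.schema, name_mapping)

    # Expose the looked-up ID to other cells instead of just printing it
    source_id = iceberg_schema.find_field("passenger_count").field_id
    return (source_id,)


@app.cell
def _(IdentityTransform, PartitionField, PartitionSpec, source_id):
    spec = PartitionSpec(
        PartitionField(
            source_id=source_id,  # derived above, not hardcoded to 3
            field_id=1000,
            name="passenger_count",
            transform=IdentityTransform(),
        )
    )
    return (spec,)
```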
@@ -81,12 +116,13 @@ def _(df_taxi):
 
 
 @app.cell
-def _(catalog, df_taxi):
+def _(catalog, df_taxi, spec):
     catalog.create_namespace_if_not_exists("default")
 
     table = catalog.create_table_if_not_exists(
         "default.taxi",
         schema=df_taxi.schema,
+        partition_spec=spec
     )
     return (table,)
 
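With this change the table carries an identity partition on `passenger_count`. The diff doesn't show the write step, but with recent pyiceberg releases (which support appends to identity-partitioned tables) it would presumably be a cell along these lines, assuming `df_taxi` is a PyArrow table:

```python
@app.cell
def _(df_taxi, table):
    # pyiceberg fans the rows out into separate data files,
    # one set per passenger_count value
    table.append(df_taxi)
    return
```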
@@ -142,7 +178,7 @@ def _(mo):
         r"""
         That's a bunch slower!
 
-        A part of the reason is that iceberg had partitions in it, which is great, but the comparison with `read_csv` is a bit unfair. Let's convert the `.csv` file to `.parquet` and also add a partition in polars with statistics. You will now see that we get a similar performance.
+        Part of the reason is that the Iceberg table has partitions in it, which is great, but the comparison with `read_csv` is a bit unfair. Let's convert the `.csv` file to `.parquet` and also add a partition in Polars, with statistics. You will now see that we get similar performance.
         """
     )
     return
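For the fairer comparison this text describes, the CSV-to-parquet conversion with a matching partition could look roughly like this in Polars (a sketch; `taxi.csv` and the output path are stand-ins, since the actual conversion cell isn't part of this diff):

```python
import polars as pl

df = pl.read_csv("taxi.csv", try_parse_dates=True)  # hypothetical input file

# Hive-partitioned parquet with column statistics, mirroring the
# Iceberg partition on passenger_count
df.write_parquet(
    "taxi_parquet/",
    statistics=True,
    use_pyarrow=True,
    pyarrow_options={"partition_cols": ["passenger_count"]},
)

# A filtered scan can now prune whole partitions and row groups
lazy = pl.scan_parquet("taxi_parquet/**/*.parquet", hive_partitioning=True)
print(lazy.filter(pl.col("passenger_count") == 2).collect().height)
```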