Mustehson committed · Commit 5f9d608
1 Parent(s): ded67a4

DLT Pipeline

Files changed:
- app.py +53 -5
- requirements.txt +2 -1
app.py
CHANGED
@@ -10,6 +10,7 @@ from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
 from langsmith import traceable
 from langchain import hub
 import warnings
+import dlt
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 
 # Height of the Tabs Text Area
@@ -63,9 +64,44 @@ def get_tables_names(schema_name):
 def update_table_names(schema_name):
     tables = get_tables_names(schema_name)
     return gr.update(choices=tables)
-def get_data_df(schema):
-    print('Getting Dataframe from the Database')
-    return conn.sql(f"SELECT * FROM {schema} LIMIT 1000")
+# def get_data_df(schema):
+#     print('Getting Dataframe from the Database')
+#     return conn.sql(f"SELECT * FROM {schema} LIMIT 1000")
+
+@dlt.resource
+def fetch_data(schema):
+    result = conn.sql(f"SELECT * FROM {schema} LIMIT 1000")
+
+    while True:
+        chunk_df = result.fetch_df_chunk(2)
+
+        if chunk_df is None or len(chunk_df) == 0:
+            break
+        else:
+            yield chunk_df
+
+def create_pipeline(schema):
+    dataset_name = schema.split('.')[1]
+    print("Dataset Name: ", dataset_name)
+
+    table_name = schema.split('.')[2]
+    print("Table Name: ", table_name)
+
+    pipeline = dlt.pipeline(
+        pipeline_name='duckdb_pipeline',
+        destination='duckdb',
+        dataset_name=dataset_name,
+    )
+
+    load_info = pipeline.run(fetch_data(schema), table_name=table_name,
+                             write_disposition="replace")
+
+    print(load_info)
+    return dataset_name + "." + table_name
+
+def load_pipeline(table_name):
+    _conn = duckdb.connect("duckdb_pipeline.duckdb")
+    return _conn.sql(f"SELECT * FROM {table_name} LIMIT 1000").df()
 
 def df_summary(df):
     summary = []
@@ -205,7 +241,14 @@ def statistics(df):
 # Main Function
 def main(table):
     schema = get_table_schema(table)
-    df = get_data_df(schema)
+
+    # Create dlt pipeline
+    table_name = create_pipeline(schema)
+
+    # Load dlt pipeline
+    df = load_pipeline(table_name)
+
+    # df = get_data_df(schema)
     df_statistics, df_alerts = statistics(df)
     describe_num, describe_cat = describe(df)
 
@@ -226,7 +269,12 @@ def main(table):
 def user_results(table, text_query):
 
     schema = get_table_schema(table)
-    df = get_data_df(schema)
+
+    # Create dlt pipeline
+    table_name = create_pipeline(schema)
+
+    # Load dlt pipeline
+    df = load_pipeline(table_name)
 
     messages = format_user_prompt(df=df, user_description=text_query)
     tests = run_llm(messages)
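Taken together, the new code replaces the direct get_data_df() query with a three-step flow: fetch_data() streams the source query in DataFrame chunks as a dlt resource, create_pipeline() loads those chunks into a local DuckDB file via dlt, and load_pipeline() reads the loaded table back for profiling. The sketch below walks that same flow end to end under stated assumptions: the demo table, its columns, and the dataset name demo_dataset are made up for illustration, while the pipeline name duckdb_pipeline, the chunked fetch_df_chunk loop, and the duckdb_pipeline.duckdb file name mirror the diff above.

# Minimal, self-contained sketch of the pattern this commit introduces (toy data).
import dlt
import duckdb

# Source connection with a small demo table standing in for the app's `conn`.
conn = duckdb.connect()
conn.execute("CREATE TABLE demo AS SELECT range AS id, range * 2 AS value FROM range(10)")

@dlt.resource
def fetch_data(query):
    # Stream the query result in pandas chunks instead of one big DataFrame.
    result = conn.execute(query)
    while True:
        chunk_df = result.fetch_df_chunk(2)   # 2 DuckDB vectors per pandas chunk
        if chunk_df is None or len(chunk_df) == 0:
            break
        yield chunk_df

# Same pipeline and destination names as the diff; dlt's duckdb destination
# defaults to a local <pipeline_name>.duckdb file in the working directory.
pipeline = dlt.pipeline(
    pipeline_name="duckdb_pipeline",
    destination="duckdb",
    dataset_name="demo_dataset",
)
load_info = pipeline.run(fetch_data("SELECT * FROM demo LIMIT 1000"),
                         table_name="demo", write_disposition="replace")
print(load_info)

# Read the loaded table back, as load_pipeline() does in app.py.
_conn = duckdb.connect("duckdb_pipeline.duckdb")
print(_conn.sql("SELECT * FROM demo_dataset.demo LIMIT 5").df())

Yielding chunk-sized DataFrames keeps memory bounded on large source tables, and write_disposition="replace" rebuilds the destination table on every run, so repeated profiling requests do not accumulate duplicate rows.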
requirements.txt
CHANGED
@@ -7,4 +7,5 @@ pandera==0.20.4
 ydata-profiling==v4.11.0
 langchain-core==0.3.12
 langchain-huggingface
-langchain==0.3.4
+langchain==0.3.4
+dlt==1.3.0
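The functional addition here is the dlt==1.3.0 pin backing the new import in app.py (langchain==0.3.4 is removed and re-added unchanged, which typically just reflects a trailing-newline fix). A quick sanity check after pip install -r requirements.txt, assuming the standard dlt package layout:

# Sketch: confirm the pinned dlt release and the duckdb destination used by app.py.
import dlt
from dlt.destinations import duckdb as duckdb_destination

print(dlt.__version__)        # expected: 1.3.0 per requirements.txt
print(duckdb_destination)     # destination factory selected by destination='duckdb'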