Mustehson committed
Commit 0e13b2c · 1 Parent(s): 7c2e7ac

Files changed (3):
  1. app.py +73 -83
  2. requirements.txt +2 -1
  3. utils.py +162 -0
app.py CHANGED
@@ -5,8 +5,13 @@ import gradio as gr
 import pandas as pd
 import pandera as pa
 from pandera import Column
-import ydata_profiling as pp
+import random
+from dataprep.eda import compute
 from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
+from .utils import (
+    format_num_stats, format_cat_stats,
+    format_ov_stats, format_insights
+)
 from langsmith import traceable
 from langchain import hub
 import warnings
@@ -38,7 +43,7 @@ for model in models:
         print(f"Error for model {model}: {e}")
         continue
 
-llm = ChatHuggingFace(llm=endpoint).bind_tools(tools=[], max_tokens=8192)
+llm = ChatHuggingFace(llm=endpoint).bind(max_tokens=4096)
 #---------------------------------------
 
 #-----LOAD PROMPT FROM LANCHAIN HUB-----
@@ -65,37 +70,80 @@ def update_table_names(schema_name):
     tables = get_tables_names(schema_name)
     return gr.update(choices=tables)
 
+# Get Schema
+def get_table_schema(table):
+    result = conn.sql(f"SELECT sql, database_name, schema_name FROM duckdb_tables() where table_name ='{table}';").df()
+    ddl_create = result.iloc[0,0]
+    parent_database = result.iloc[0,1]
+    schema_name = result.iloc[0,2]
+    full_path = f"{parent_database}.{schema_name}.{table}"
+    if schema_name != "main":
+        old_path = f"{schema_name}.{table}"
+    else:
+        old_path = table
+    ddl_create = ddl_create.replace(old_path, full_path)
+    return full_path
+
 def get_data_df(schema):
     print('Getting Dataframe from the Database')
     return conn.sql(f"SELECT * FROM {schema} LIMIT 1000").df()
 
+def calcualte_stats(df):
+    indev_stats = []
+    cols = []
+
+    _df = df.copy()
+
+    num_cols = _df.select_dtypes(include=['number'], exclude=['datetime']).columns
+    cat_cols = _df.select_dtypes(include=['object'], exclude=['datetime']).columns
+
+
+    _all_stats = compute(_df)
+    all_stats = format_ov_stats(_all_stats['stats'])
+    insights = format_insights(_all_stats['overview_insights'])
+
+    for i, col in enumerate(random.sample(num_cols.tolist()+cat_cols.tolist(), 2)):
+        _indv_data = compute(_df, col)
+
+        if col in cat_cols:
+            indev_data_cat = format_cat_stats(_indv_data["data"])
+
+            indev_stats.append(pd.DataFrame([indev_data_cat['Overview']], index=[f'{col}_stats']).T)
+
+        elif col in num_cols:
+            try:
+                indev_data_num = format_num_stats(_indv_data["data"])
+            except:
+                indev_data_num = format_cat_stats(_indv_data["data"])
+
+            indev_stats.append(pd.DataFrame([indev_data_num['Overview']], index=[f'{col}_stats']).T)
+
+    return {
+        "overall_stats": pd.DataFrame(all_stats[0], index=['Dataset Statistics']).T,
+        "insights": insights,
+        "stats_1": indev_stats[0],
+        "stats_2": indev_stats[1]
+    }
+
 def df_summary(df):
     summary = []
 
     for column in df.columns:
         if pd.api.types.is_numeric_dtype(df[column]):
             summary.append({
-                "column": column,
-                "max": df[column].max(),
-                "min": df[column].min(),
-                "count": df[column].count(),
-                "nunique": df[column].nunique(),
-                "dtype": str(df[column].dtype),
-                "top": None
+                "column": column, "max": df[column].max(), "min": df[column].min(),
+                "count": df[column].count(), "nunique": df[column].nunique(),
+                "dtype": str(df[column].dtype), "top": None
             })
 
         elif pd.api.types.is_categorical_dtype(df[column]) or pd.api.types.is_object_dtype(df[column]):
             top_value = df[column].mode().iloc[0] if not df[column].mode().empty else None
 
             summary.append({
-                "column": column,
-                "max": None,
-                "min": None,
-                "count": df[column].count(),
-                "nunique": df[column].nunique(),
-                "dtype": str(df[column].dtype),
-                "top": top_value
+                "column": column, "max": None, "min": None, "count": df[column].count(),
+                "nunique": df[column].nunique(), "dtype": str(df[column].dtype), "top": top_value
             })
+
     summary_df = pd.DataFrame(summary)
     return summary_df.reset_index(drop=True)
 
@@ -119,33 +167,6 @@ def run_llm(messages):
     return tests
 
 
-# Get Schema
-def get_table_schema(table):
-    result = conn.sql(f"SELECT sql, database_name, schema_name FROM duckdb_tables() where table_name ='{table}';").df()
-    ddl_create = result.iloc[0,0]
-    parent_database = result.iloc[0,1]
-    schema_name = result.iloc[0,2]
-    full_path = f"{parent_database}.{schema_name}.{table}"
-    if schema_name != "main":
-        old_path = f"{schema_name}.{table}"
-    else:
-        old_path = table
-    ddl_create = ddl_create.replace(old_path, full_path)
-    return full_path
-
-def describe(df):
-
-    numerical_info = pd.DataFrame()
-    categorical_info = pd.DataFrame()
-    if len(df.select_dtypes(include=['number']).columns) >= 1:
-        numerical_info = df.select_dtypes(include=['number']).describe().T.reset_index()
-        numerical_info.rename(columns={'index': 'column'}, inplace=True)
-    if len(df.select_dtypes(include=['object']).columns) >= 1:
-        categorical_info = df.select_dtypes(include=['object']).describe().T.reset_index()
-        categorical_info.rename(columns={'index': 'column'}, inplace=True)
-
-    return numerical_info, categorical_info
-
 def validate_pandera(tests, df):
     validation_results = []
 
@@ -165,41 +186,6 @@ def validate_pandera(tests, df):
         })
     return pd.DataFrame(validation_results)
 
-def statistics(df):
-    profile = pp.ProfileReport(df)
-    report_dict = profile.get_description()
-    description, alerts = report_dict.table, report_dict.alerts
-    # Statistics
-    mapping = {
-        'n': 'Number of observations',
-        'n_var': 'Number of variables',
-        'n_cells_missing': 'Number of cells missing',
-        'n_vars_with_missing': 'Number of columns with missing data',
-        'n_vars_all_missing': 'Columns with all missing data',
-        'p_cells_missing': 'Missing cells (%)',
-        'n_duplicates': 'Duplicated rows',
-        'p_duplicates': 'Duplicated rows (%)',
-    }
-
-    updated_data = {mapping.get(k, k): v for k, v in description.items() if k != 'types'}
-    # Add flattened types information
-    if 'Text' in description.get('types', {}):
-        updated_data['Number of text columns'] = description['types']['Text']
-    if 'Categorical' in description.get('types', {}):
-        updated_data['Number of categorical columns'] = description['types']['Categorical']
-    if 'Numeric' in description.get('types', {}):
-        updated_data['Number of numeric columns'] = description['types']['Numeric']
-    if 'DateTime' in description.get('types', {}):
-        updated_data['Number of datetime columns'] = description['types']['DateTime']
-
-    df_statistics = pd.DataFrame(list(updated_data.items()), columns=['Statistic Description', 'Value'])
-    df_statistics['Value'] = df_statistics['Value'].astype(int)
-
-    # Alerts
-    alerts_list = [(str(alert).replace('[', '').replace(']', ''), alert.alert_type_name) for alert in alerts]
-    df_alerts = pd.DataFrame(alerts_list, columns=['Data Quality Issue', 'Category'])
-
-    return df_statistics, df_alerts
 #---------------------------------------
 
 
@@ -208,22 +194,26 @@ def statistics(df):
 def main(table):
     schema = get_table_schema(table)
     df = get_data_df(schema)
-    df_statistics, df_alerts = statistics(df)
-    describe_num, describe_cat = describe(df)
-
+
     messages = format_prompt(df=df)
     tests = run_llm(messages)
     print(tests)
 
+    stats = calcualte_stats(df)
+    df_insights = stats['insights']
+    df_statistics = stats['overall_stats']
+    df_stat_1 = stats['stats_1']
+    df_stat_2 = stats['stats_2']
+
     if isinstance(tests, Exception):
         tests = pd.DataFrame([{"error": f"❌ Unable to generate tests. {tests}"}])
-        return df.head(10), df_statistics, df_alerts, describe_cat, describe_num, tests, pd.DataFrame([])
+        return df.head(10), df_statistics, df_insights, df_stat_1, df_stat_2, tests, pd.DataFrame([])
 
     tests_df = pd.DataFrame(tests)
     tests_df.rename(columns={tests_df.columns[0]: 'Column', tests_df.columns[1]: 'Rule Name', tests_df.columns[2]: 'Rules' }, inplace=True)
     pandera_results = validate_pandera(tests, df)
 
-    return df.head(10), df_statistics, df_alerts, describe_cat, describe_num, tests_df, pandera_results
+    return df.head(10), df_statistics, df_insights, df_stat_1, df_stat_2, tests_df, pandera_results
 
 def user_results(table, text_query):
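
For reference, a minimal sketch (not part of the commit) of the overview half of the new statistics path. It assumes dataprep==0.4.4 is installed and that this commit's utils.py is importable as a plain module; the 'stats' and 'overview_insights' keys simply mirror how app.py indexes the compute() result above.

import pandas as pd
from dataprep.eda import compute
from utils import format_ov_stats, format_insights  # helpers added in this commit

# Toy frame with one numeric and one categorical column (illustrative only).
df = pd.DataFrame({"age": [23, 35, 41, 29], "city": ["NY", "LA", "NY", "SF"]})

overview = compute(df)                                     # dataset-level EDA result
stats, dtype_counts = format_ov_stats(overview["stats"])   # formatted overview stats + dtype counts
insights = format_insights(overview["overview_insights"])  # data-quality insights as a DataFrame

print(pd.DataFrame(stats, index=["Dataset Statistics"]).T)
print(insights)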
 
requirements.txt CHANGED
@@ -8,4 +8,5 @@ langsmith==0.1.135
 pandera==0.20.4
 ydata-profiling==v4.11.0
 langchain-core==0.3.12
-langchain==0.3.4
+langchain==0.3.4
+dataprep==0.4.4
utils.py ADDED
@@ -0,0 +1,162 @@
+
+import numpy as np
+import pandas as pd
+# -----------------Numerical Statistics-----------------
+def format_values(key, value):
+
+    if not isinstance(value, (int, float)):
+        # if value is a time
+        return str(value)
+
+    if "Memory" in key:
+        # for memory usage
+        ind = 0
+        unit = dict(enumerate(["B", "KB", "MB", "GB", "TB"], 0))
+        while value > 1024:
+            value /= 1024
+            ind += 1
+        return f"{value:.1f} {unit[ind]}"
+
+    if (value * 10) % 10 == 0:
+        # if value is int but in a float form with 0 at last digit
+        value = int(value)
+    if abs(value) >= 1000000:
+        return f"{value:.5g}"
+    elif abs(value) >= 1000000 or abs(value) < 0.001:
+        value = f"{value:.5g}"
+    elif abs(value) >= 1:
+        # eliminate trailing zeros
+        pre_value = float(f"{value:.4f}")
+        value = int(pre_value) if (pre_value * 10) % 10 == 0 else pre_value
+    elif 0.001 <= abs(value) < 1:
+        value = f"{value:.4g}"
+    else:
+        value = str(value)
+
+    if "%" in key:
+        # for percentage, only use digits before notation sign for extreme small number
+        value = f"{float(value):.1%}"
+    return str(value)
+
+def format_num_stats(data):
+    """
+    Format numerical statistics
+    """
+    overview = {
+        "Approximate Distinct Count": data["nuniq"],
+        "Approximate Unique (%)": data["nuniq"] / data["npres"],
+        "Missing": data["nrows"] - data["npres"],
+        "Missing (%)": 1 - (data["npres"] / data["nrows"]),
+        "Infinite": (data["npres"] - data["nreals"]),
+        "Infinite (%)": (data["npres"] - data["nreals"]) / data["nrows"],
+        "Memory Size": data["mem_use"],
+        "Mean": data["mean"],
+        "Minimum": data["min"],
+        "Maximum": data["max"],
+        "Zeros": data["nzero"],
+        "Zeros (%)": data["nzero"] / data["nrows"],
+        "Negatives": data["nneg"],
+        "Negatives (%)": data["nneg"] / data["nrows"],
+    }
+    data["qntls"].index = np.round(data["qntls"].index, 2)
+    quantile = {
+        "Minimum": data["min"],
+        "5-th Percentile": data["qntls"].loc[0.05],
+        "Q1": data["qntls"].loc[0.25],
+        "Median": data["qntls"].loc[0.50],
+        "Q3": data["qntls"].loc[0.75],
+        "95-th Percentile": data["qntls"].loc[0.95],
+        "Maximum": data["max"],
+        "Range": data["max"] - data["min"],
+        "IQR": data["qntls"].loc[0.75] - data["qntls"].loc[0.25],
+    }
+    descriptive = {
+        "Mean": data["mean"],
+        "Standard Deviation": data["std"],
+        "Variance": data["std"] ** 2,
+        "Sum": data["mean"] * data["npres"],
+        "Skewness": float(data["skew"]),
+        "Kurtosis": float(data["kurt"]),
+        "Coefficient of Variation": data["std"] / data["mean"] if data["mean"] != 0 else np.nan,
+    }
+
+    # return {
+    #     "Overview": {k: _format_values(k, v) for k, v in overview.items()},
+    #     # "Quantile Statistics": {k: _format_values(k, v) for k, v in quantile.items()},
+    #     # "Descriptive Statistics": {k: _format_values(k, v) for k, v in descriptive.items()},
+    # }
+
+    return {
+        "Overview": {**{k: format_values(k, v) for k, v in overview.items()},
+                     **{k: format_values(k, v) for k, v in quantile.items()},
+                     **{k: format_values(k, v) for k, v in descriptive.items()}}
+    }
+# -----------------------------------------------------
+
+
+# -----------------Categorical Statistics-----------------
+
+def format_cat_stats(
+    data
+):
+    """
+    Format categorical statistics
+    """
+    stats = data['stats']
+    len_stats = data['len_stats']
+    letter_stats = data["letter_stats"]
+    ov_stats = {
+        "Approximate Distinct Count": stats["nuniq"],
+        "Approximate Unique (%)": stats["nuniq"] / stats["npres"],
+        "Missing": stats["nrows"] - stats["npres"],
+        "Missing (%)": 1 - stats["npres"] / stats["nrows"],
+        "Memory Size": stats["mem_use"],
+    }
+    sampled_rows = ("1st row", "2nd row", "3rd row", "4th row", "5th row")
+    smpl = dict(zip(sampled_rows, stats["first_rows"]))
+
+    # return {
+    #     "Overview": {k: _format_values(k, v) for k, v in ov_stats.items()},
+    #     "Length": {k: _format_values(k, v) for k, v in len_stats.items()},
+    #     "Sample": {k: f"{v[:18]}..." if len(v) > 18 else v for k, v in smpl.items()},
+    #     "Letter": {k: _format_values(k, v) for k, v in letter_stats.items()},
+    # }
+    return {
+        "Overview": {**{k: format_values(k, v) for k, v in ov_stats.items()},
+                     **{k: format_values(k, v) for k, v in len_stats.items()},
+                     }
+    }
+# -----------------------------------------------------
+
+
+def format_ov_stats(stats) :
+
+    nrows, ncols, npresent_cells, nrows_wo_dups, mem_use, dtypes_cnt = stats.values()
+    ncells = nrows * ncols
+
+    data = {
+        "Number of Variables": ncols,
+        "Number of Rows": nrows,
+        "Missing Cells": float(ncells - npresent_cells),
+        "Missing Cells (%)": 1 - (npresent_cells / ncells),
+        "Duplicate Rows": nrows - nrows_wo_dups,
+        "Duplicate Rows (%)": 1 - (nrows_wo_dups / nrows),
+        "Total Size in Memory": float(mem_use),
+        "Average Row Size in Memory": mem_use / nrows,
+    }
+    return {k: format_values(k, v) for k, v in data.items()}, dtypes_cnt
+
+
+def format_insights(data):
+    data_list = []
+    for key, value_list in data.items():
+        for item in value_list:
+            for category, description in item.items():
+                data_list.append({'Category': category, 'Description': description})
+
+    insights_df = pd.DataFrame(data_list)
+
+    insights_df['Description'] = insights_df['Description'].str.replace(r'/\*start\*/', '', regex=True)
+    insights_df['Description'] = insights_df['Description'].str.replace(r'/\*end\*/', '', regex=True)
+
+    return insights_df
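
As a quick sanity check on format_values above (an editor sketch, not part of the commit; it assumes utils.py is importable), these calls exercise the memory-unit, percentage, and integer-rounding branches, with expected outputs traced from the code:

from utils import format_values

print(format_values("Memory Size", 2048))       # -> "2.0 KB"  (memory-unit branch)
print(format_values("Missing (%)", 0.25))       # -> "25.0%"   (percentage branch)
print(format_values("Number of Rows", 1000.0))  # -> "1000"    (trailing .0 stripped)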