import numpy as np
import pandas as pd

# Units used when rendering memory sizes, smallest first.
_MEMORY_UNITS = ("B", "KB", "MB", "GB", "TB")


def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator != 0 else np.nan


# -----------------Numerical Statistics-----------------
def format_values(key: str, value) -> str:
    """
    Format a single statistic for display.

    Parameters
    ----------
    key
        Label of the statistic. A label containing "Memory" is rendered as a
        memory size; a label containing "%" is rendered as a percentage.
    value
        The statistic. Anything that is not an ``int`` or ``float`` (for
        example a time) is returned via ``str``.

    Returns
    -------
    str
        The formatted value.
    """
    if not isinstance(value, (int, float)):
        # if value is a time
        return str(value)

    if "Memory" in key:
        # Scale down by 1024 per unit, stopping at the largest known unit so
        # that very large (or infinite) sizes cannot run past "TB".
        unit_index = 0
        while value >= 1024 and unit_index < len(_MEMORY_UNITS) - 1:
            value /= 1024
            unit_index += 1
        return f"{value:.1f} {_MEMORY_UNITS[unit_index]}"

    if (value * 10) % 10 == 0:
        # value is integral, possibly stored as a float with a zero fraction
        value = int(value)
        if abs(value) >= 1000000:
            return f"{value:.5g}"
    elif abs(value) >= 1000000 or abs(value) < 0.001:
        value = f"{value:.5g}"
    elif abs(value) >= 1:
        # round to 4 decimals and eliminate trailing zeros
        pre_value = float(f"{value:.4f}")
        value = int(pre_value) if (pre_value * 10) % 10 == 0 else pre_value
    elif 0.001 <= abs(value) < 1:
        value = f"{value:.4g}"
    else:
        # only reached when every comparison above is False (e.g. NaN)
        value = str(value)

    if "%" in key:
        # for percentage, only use digits before notation sign for extreme small number
        value = f"{float(value):.1%}"
    return str(value)


def format_num_stats(data: dict) -> dict:
    """
    Format numerical statistics.

    Reads the keys "nuniq", "npres", "nrows", "nreals", "mem_use", "mean",
    "min", "max", "nzero", "nneg", "qntls", "std", "skew" and "kurt" from
    ``data``. Ratios whose denominator is zero are reported as NaN instead of
    raising.

    Side effect: the index of ``data["qntls"]`` is rounded to 2 decimals in
    place so the quantiles can be looked up by label (0.05, 0.25, ...).

    Returns
    -------
    dict
        ``{"Overview": {label: formatted value}}`` containing the overview,
        quantile and descriptive statistics merged into one mapping.
    """
    nrows = data["nrows"]
    npres = data["npres"]
    n_infinite = npres - data["nreals"]

    overview = {
        "Approximate Distinct Count": data["nuniq"],
        "Approximate Unique (%)": _ratio(data["nuniq"], npres),
        "Missing": nrows - npres,
        "Missing (%)": 1 - _ratio(npres, nrows),
        "Infinite": n_infinite,
        "Infinite (%)": _ratio(n_infinite, nrows),
        "Memory Size": data["mem_use"],
        "Mean": data["mean"],
        "Minimum": data["min"],
        "Maximum": data["max"],
        "Zeros": data["nzero"],
        "Zeros (%)": _ratio(data["nzero"], nrows),
        "Negatives": data["nneg"],
        "Negatives (%)": _ratio(data["nneg"], nrows),
    }

    data["qntls"].index = np.round(data["qntls"].index, 2)
    qntls = data["qntls"]
    quantile = {
        "Minimum": data["min"],
        "5-th Percentile": qntls.loc[0.05],
        "Q1": qntls.loc[0.25],
        "Median": qntls.loc[0.50],
        "Q3": qntls.loc[0.75],
        "95-th Percentile": qntls.loc[0.95],
        "Maximum": data["max"],
        "Range": data["max"] - data["min"],
        "IQR": qntls.loc[0.75] - qntls.loc[0.25],
    }

    descriptive = {
        "Mean": data["mean"],
        "Standard Deviation": data["std"],
        "Variance": data["std"] ** 2,
        "Sum": data["mean"] * npres,
        "Skewness": float(data["skew"]),
        "Kurtosis": float(data["kurt"]),
        "Coefficient of Variation": _ratio(data["std"], data["mean"]),
    }

    # All three groups are flattened under "Overview". Labels that occur in
    # more than one group ("Mean", "Minimum", "Maximum") carry the same value,
    # so the merge order does not change the result.
    merged = {}
    for section in (overview, quantile, descriptive):
        merged.update({k: format_values(k, v) for k, v in section.items()})
    return {"Overview": merged}


# -----------------------------------------------------


# -----------------Categorical Statistics-----------------
def format_cat_stats(data: dict) -> dict:
    """
    Format categorical statistics.

    Reads ``data["stats"]`` (keys "nuniq", "npres", "nrows", "mem_use") and
    ``data["len_stats"]``. Ratios whose denominator is zero are reported as
    NaN instead of raising.

    Returns
    -------
    dict
        ``{"Overview": {label: formatted value}}`` containing the overview
        statistics followed by the length statistics.
    """
    stats = data["stats"]
    len_stats = data["len_stats"]

    ov_stats = {
        "Approximate Distinct Count": stats["nuniq"],
        "Approximate Unique (%)": _ratio(stats["nuniq"], stats["npres"]),
        "Missing": stats["nrows"] - stats["npres"],
        "Missing (%)": 1 - _ratio(stats["npres"], stats["nrows"]),
        "Memory Size": stats["mem_use"],
    }

    merged = {k: format_values(k, v) for k, v in ov_stats.items()}
    merged.update({k: format_values(k, v) for k, v in len_stats.items()})
    return {"Overview": merged}


# -----------------------------------------------------


def format_ov_stats(stats: dict) -> tuple:
    """
    Format dataset-level overview statistics.

    Parameters
    ----------
    stats
        Mapping whose values, in iteration order, are: number of rows, number
        of columns, number of present cells, number of rows without
        duplicates, memory usage, and the dtype counts. The values are
        unpacked positionally, so the order matters.

    Returns
    -------
    tuple
        ``(formatted, dtypes_cnt)`` where ``formatted`` maps each label to its
        formatted value and ``dtypes_cnt`` is passed through unchanged.
        Ratios whose denominator is zero are reported as NaN instead of
        raising.
    """
    nrows, ncols, npresent_cells, nrows_wo_dups, mem_use, dtypes_cnt = stats.values()
    ncells = nrows * ncols

    data = {
        "Number of Variables": ncols,
        "Number of Rows": nrows,
        "Missing Cells": float(ncells - npresent_cells),
        "Missing Cells (%)": 1 - _ratio(npresent_cells, ncells),
        "Duplicate Rows": nrows - nrows_wo_dups,
        "Duplicate Rows (%)": 1 - _ratio(nrows_wo_dups, nrows),
        "Total Size in Memory": float(mem_use),
        "Average Row Size in Memory": _ratio(mem_use, nrows),
    }
    return {k: format_values(k, v) for k, v in data.items()}, dtypes_cnt


def format_insights(data: dict) -> pd.DataFrame:
    """
    Flatten insights into a two-column DataFrame.

    Parameters
    ----------
    data
        Mapping whose values are lists of ``{category: description}`` dicts.
        The mapping's keys are not used.

    Returns
    -------
    pandas.DataFrame
        One row per insight with columns "Category" and "Description"; the
        ``/*start*/`` and ``/*end*/`` markers are removed from the
        descriptions. An input without insights yields an empty frame with
        the same two columns.
    """
    rows = []
    for value_list in data.values():
        for item in value_list:
            rows.extend(
                {"Category": category, "Description": description}
                for category, description in item.items()
            )

    insights_df = pd.DataFrame(rows, columns=["Category", "Description"])
    if insights_df.empty:
        # Nothing to clean; skip the string accessor on an empty column.
        return insights_df

    # Strip the markers in two passes (start markers first, then end markers).
    for marker in (r"/\*start\*/", r"/\*end\*/"):
        insights_df["Description"] = insights_df["Description"].str.replace(
            marker, "", regex=True
        )
    return insights_df