Spaces:
Sleeping
Sleeping
File size: 5,830 Bytes
0e13b2c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
import numpy as np
import pandas as pd
# -----------------Numerical Statistics-----------------
def format_values(key, value):
    """
    Render a single statistic as a display string.

    Parameters
    ----------
    key
        Label of the statistic. A label containing "Memory" is rendered as a
        byte size; a label containing "%" is rendered as a percentage.
    value
        Raw statistic. Anything that is not an ``int`` or ``float`` (for
        example a timestamp) is passed through ``str`` untouched.

    Returns
    -------
    str
        The formatted value.
    """
    if not isinstance(value, (int, float)):
        # if value is a time
        return str(value)

    if "Memory" in key:
        # for memory usage
        units = ("B", "KB", "MB", "GB", "TB")
        ind = 0
        # Stop at the largest unit: without the bound, sizes beyond 1024 TB
        # index past the table and an infinite size never leaves the loop.
        while value >= 1024 and ind < len(units) - 1:
            value /= 1024
            ind += 1
        return f"{value:.1f} {units[ind]}"

    if (value * 10) % 10 == 0:
        # if value is int but in a float form with 0 at last digit
        value = int(value)
        if abs(value) >= 1000000:
            # large integers are returned in scientific notation right away,
            # so they never reach the percentage step below
            return f"{value:.5g}"
    elif abs(value) >= 1000000 or abs(value) < 0.001:
        value = f"{value:.5g}"
    elif abs(value) >= 1:
        # eliminate trailing zeros
        pre_value = float(f"{value:.4f}")
        value = int(pre_value) if (pre_value * 10) % 10 == 0 else pre_value
    elif 0.001 <= abs(value) < 1:
        value = f"{value:.4g}"
    else:
        # only values that fail every comparison above (NaN) land here
        value = str(value)

    if "%" in key:
        # for percentage, only use digits before notation sign for extreme small number
        value = f"{float(value):.1%}"
    return str(value)
def format_num_stats(data):
    """
    Format numerical statistics

    Parameters
    ----------
    data
        Mapping of precomputed statistics for one numerical column. The keys
        read here are "nuniq", "npres", "nrows", "nreals", "mem_use", "mean",
        "min", "max", "nzero", "nneg", "std", "skew", "kurt" and "qntls".
        "qntls" must support ``.copy()``, a settable ``index`` and ``.loc``
        lookups (presumably a pandas Series keyed by quantile level - confirm
        against the caller).

    Returns
    -------
    dict
        ``{"Overview": {label: formatted string}}``. The overview, quantile
        and descriptive groups are merged into that single mapping in this
        order, so labels shared between groups ("Mean", "Minimum", "Maximum")
        keep the value from the later group.
    """

    def _ratio(numerator, denominator):
        # A ratio over zero rows/values is undefined; report NaN instead of
        # raising, in line with the coefficient-of-variation guard below.
        return numerator / denominator if denominator != 0 else np.nan

    npres = data["npres"]
    nrows = data["nrows"]
    n_infinite = npres - data["nreals"]

    overview = {
        "Approximate Distinct Count": data["nuniq"],
        "Approximate Unique (%)": _ratio(data["nuniq"], npres),
        "Missing": nrows - npres,
        "Missing (%)": 1 - _ratio(npres, nrows),
        "Infinite": n_infinite,
        "Infinite (%)": _ratio(n_infinite, nrows),
        "Memory Size": data["mem_use"],
        "Mean": data["mean"],
        "Minimum": data["min"],
        "Maximum": data["max"],
        "Zeros": data["nzero"],
        "Zeros (%)": _ratio(data["nzero"], nrows),
        "Negatives": data["nneg"],
        "Negatives (%)": _ratio(data["nneg"], nrows),
    }

    # Round the quantile levels on a copy so the caller's object is not
    # modified; rounding makes the float labels match the literals below.
    qntls = data["qntls"].copy()
    qntls.index = np.round(qntls.index, 2)
    quantile = {
        "Minimum": data["min"],
        "5-th Percentile": qntls.loc[0.05],
        "Q1": qntls.loc[0.25],
        "Median": qntls.loc[0.50],
        "Q3": qntls.loc[0.75],
        "95-th Percentile": qntls.loc[0.95],
        "Maximum": data["max"],
        "Range": data["max"] - data["min"],
        "IQR": qntls.loc[0.75] - qntls.loc[0.25],
    }

    descriptive = {
        "Mean": data["mean"],
        "Standard Deviation": data["std"],
        "Variance": data["std"] ** 2,
        "Sum": data["mean"] * npres,
        "Skewness": float(data["skew"]),
        "Kurtosis": float(data["kurt"]),
        "Coefficient of Variation": data["std"] / data["mean"] if data["mean"] != 0 else np.nan,
    }

    merged = {**overview, **quantile, **descriptive}
    return {"Overview": {k: format_values(k, v) for k, v in merged.items()}}
# -----------------------------------------------------
# -----------------Categorical Statistics-----------------
def format_cat_stats(
    data
):
    """
    Format categorical statistics

    Parameters
    ----------
    data
        Mapping with at least the keys "stats" and "len_stats". "stats" must
        provide "nuniq", "npres", "nrows" and "mem_use"; "len_stats" is a
        mapping of label -> raw value that is formatted as-is.

    Returns
    -------
    dict
        ``{"Overview": {label: formatted string}}`` holding the overview
        entries followed by the length entries (a length label equal to an
        overview label replaces it).
    """

    def _ratio(numerator, denominator):
        # A ratio over zero rows/values is undefined; report NaN instead of raising.
        return numerator / denominator if denominator != 0 else float("nan")

    stats = data["stats"]
    len_stats = data["len_stats"]

    ov_stats = {
        "Approximate Distinct Count": stats["nuniq"],
        "Approximate Unique (%)": _ratio(stats["nuniq"], stats["npres"]),
        "Missing": stats["nrows"] - stats["npres"],
        "Missing (%)": 1 - _ratio(stats["npres"], stats["nrows"]),
        "Memory Size": stats["mem_use"],
    }

    merged = {**ov_stats, **len_stats}
    return {"Overview": {k: format_values(k, v) for k, v in merged.items()}}
# -----------------------------------------------------
def format_ov_stats(stats):
    """
    Format dataset-level overview statistics

    Parameters
    ----------
    stats
        Mapping with exactly six values. They are unpacked positionally, in
        iteration order, as: number of rows, number of columns, number of
        present cells, number of rows without duplicates, memory usage, and
        a dtype count object. The keys themselves are never read, so the
        ordering of the mapping matters.

    Returns
    -------
    tuple
        ``(formatted, dtypes_cnt)`` where ``formatted`` maps each label to its
        display string and ``dtypes_cnt`` is handed back untouched.
    """

    def _ratio(numerator, denominator):
        # An empty dataset has no cells/rows to divide by; report NaN instead of raising.
        return numerator / denominator if denominator != 0 else float("nan")

    nrows, ncols, npresent_cells, nrows_wo_dups, mem_use, dtypes_cnt = stats.values()
    ncells = nrows * ncols

    data = {
        "Number of Variables": ncols,
        "Number of Rows": nrows,
        "Missing Cells": float(ncells - npresent_cells),
        "Missing Cells (%)": 1 - _ratio(npresent_cells, ncells),
        "Duplicate Rows": nrows - nrows_wo_dups,
        "Duplicate Rows (%)": 1 - _ratio(nrows_wo_dups, nrows),
        "Total Size in Memory": float(mem_use),
        "Average Row Size in Memory": _ratio(mem_use, nrows),
    }
    return {k: format_values(k, v) for k, v in data.items()}, dtypes_cnt
def format_insights(data):
    """
    Flatten grouped insights into a two-column DataFrame

    Parameters
    ----------
    data
        Mapping whose values are lists of ``{category: description}`` dicts.
        The mapping's own keys are not used.

    Returns
    -------
    pandas.DataFrame
        One row per (category, description) pair, in iteration order, with
        columns "Category" and "Description". The literal markers
        ``/*start*/`` and ``/*end*/`` are removed from every description.
    """
    data_list = []
    for value_list in data.values():
        for item in value_list:
            data_list.extend(
                {'Category': category, 'Description': description}
                for category, description in item.items()
            )

    # Name the columns explicitly so that empty input still yields a frame
    # with a "Description" column instead of failing on the lookup below.
    insights_df = pd.DataFrame(data_list, columns=['Category', 'Description'])
    insights_df['Description'] = (
        insights_df['Description']
        .str.replace(r'/\*start\*/', '', regex=True)
        .str.replace(r'/\*end\*/', '', regex=True)
    )
    return insights_df