File size: 5,830 Bytes
0e13b2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162

import numpy as np
import pandas as pd
# -----------------Numerical Statistics-----------------
def format_values(key, value):

    if not isinstance(value, (int, float)):
        # if value is a time
        return str(value)

    if "Memory" in key:
        # for memory usage
        ind = 0
        unit = dict(enumerate(["B", "KB", "MB", "GB", "TB"], 0))
        while value > 1024:
            value /= 1024
            ind += 1
        return f"{value:.1f} {unit[ind]}"

    if (value * 10) % 10 == 0:
        # if value is int but in a float form with 0 at last digit
        value = int(value)
        if abs(value) >= 1000000:
            return f"{value:.5g}"
    elif abs(value) >= 1000000 or abs(value) < 0.001:
        value = f"{value:.5g}"
    elif abs(value) >= 1:
        # eliminate trailing zeros
        pre_value = float(f"{value:.4f}")
        value = int(pre_value) if (pre_value * 10) % 10 == 0 else pre_value
    elif 0.001 <= abs(value) < 1:
        value = f"{value:.4g}"
    else:
        value = str(value)

    if "%" in key:
        # for percentage, only use digits before notation sign for extreme small number
        value = f"{float(value):.1%}"
    return str(value)

def format_num_stats(data):
    """
    Format numerical statistics
    """
    overview = {
        "Approximate Distinct Count": data["nuniq"],
        "Approximate Unique (%)": data["nuniq"] / data["npres"],
        "Missing": data["nrows"] - data["npres"],
        "Missing (%)": 1 - (data["npres"] / data["nrows"]),
        "Infinite": (data["npres"] - data["nreals"]),
        "Infinite (%)": (data["npres"] - data["nreals"]) / data["nrows"],
        "Memory Size": data["mem_use"],
        "Mean": data["mean"],
        "Minimum": data["min"],
        "Maximum": data["max"],
        "Zeros": data["nzero"],
        "Zeros (%)": data["nzero"] / data["nrows"],
        "Negatives": data["nneg"],
        "Negatives (%)": data["nneg"] / data["nrows"],
    }
    data["qntls"].index = np.round(data["qntls"].index, 2)
    quantile = {
        "Minimum": data["min"],
        "5-th Percentile": data["qntls"].loc[0.05],
        "Q1": data["qntls"].loc[0.25],
        "Median": data["qntls"].loc[0.50],
        "Q3": data["qntls"].loc[0.75],
        "95-th Percentile": data["qntls"].loc[0.95],
        "Maximum": data["max"],
        "Range": data["max"] - data["min"],
        "IQR": data["qntls"].loc[0.75] - data["qntls"].loc[0.25],
    }
    descriptive = {
        "Mean": data["mean"],
        "Standard Deviation": data["std"],
        "Variance": data["std"] ** 2,
        "Sum": data["mean"] * data["npres"],
        "Skewness": float(data["skew"]),
        "Kurtosis": float(data["kurt"]),
        "Coefficient of Variation": data["std"] / data["mean"] if data["mean"] != 0 else np.nan,
    }

    # return {
    #     "Overview": {k: _format_values(k, v) for k, v in overview.items()},
    #     # "Quantile Statistics": {k: _format_values(k, v) for k, v in quantile.items()},
    #     # "Descriptive Statistics": {k: _format_values(k, v) for k, v in descriptive.items()},
    # }

    return {
    "Overview": {**{k: format_values(k, v) for k, v in overview.items()},
                 **{k: format_values(k, v) for k, v in quantile.items()},
                 **{k: format_values(k, v) for k, v in descriptive.items()}}
      }
# -----------------------------------------------------


# -----------------Categorical Statistics-----------------

def format_cat_stats(
    data
):
    """
    Format categorical statistics
    """
    stats = data['stats']
    len_stats = data['len_stats']
    letter_stats = data["letter_stats"]
    ov_stats = {
        "Approximate Distinct Count": stats["nuniq"],
        "Approximate Unique (%)": stats["nuniq"] / stats["npres"],
        "Missing": stats["nrows"] - stats["npres"],
        "Missing (%)": 1 - stats["npres"] / stats["nrows"],
        "Memory Size": stats["mem_use"],
    }
    sampled_rows = ("1st row", "2nd row", "3rd row", "4th row", "5th row")
    smpl = dict(zip(sampled_rows, stats["first_rows"]))

    # return {
    #     "Overview": {k: _format_values(k, v) for k, v in ov_stats.items()},
    #     "Length": {k: _format_values(k, v) for k, v in len_stats.items()},
    #     "Sample": {k: f"{v[:18]}..." if len(v) > 18 else v for k, v in smpl.items()},
    #     "Letter": {k: _format_values(k, v) for k, v in letter_stats.items()},
    # }
    return {
    "Overview": {**{k: format_values(k, v) for k, v in ov_stats.items()},
                 **{k: format_values(k, v) for k, v in len_stats.items()},
      }
    }
# -----------------------------------------------------


def format_ov_stats(stats) :

    nrows, ncols, npresent_cells, nrows_wo_dups, mem_use, dtypes_cnt = stats.values()
    ncells = nrows * ncols

    data = {
        "Number of Variables": ncols,
        "Number of Rows": nrows,
        "Missing Cells": float(ncells - npresent_cells),
        "Missing Cells (%)": 1 - (npresent_cells / ncells),
        "Duplicate Rows": nrows - nrows_wo_dups,
        "Duplicate Rows (%)": 1 - (nrows_wo_dups / nrows),
        "Total Size in Memory": float(mem_use),
        "Average Row Size in Memory": mem_use / nrows,
    }
    return {k: format_values(k, v) for k, v in data.items()}, dtypes_cnt


def format_insights(data):
    data_list = []
    for key, value_list in data.items():
        for item in value_list:
            for category, description in item.items():
                data_list.append({'Category': category, 'Description': description})

    insights_df = pd.DataFrame(data_list)

    insights_df['Description'] = insights_df['Description'].str.replace(r'/\*start\*/', '', regex=True)
    insights_df['Description'] = insights_df['Description'].str.replace(r'/\*end\*/', '', regex=True)

    return insights_df