# Source: v6Mastercardapp/pages/6_AI_Model_Results.py
# Uploaded by BlendMMM ("Upload 73 files", commit 3b48627, 25.6 kB).
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import streamlit as st
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_percentage_error
import sys
import os
from utilities import set_header, load_local_css, load_authenticator
import seaborn as sns
import matplotlib.pyplot as plt
import sweetviz as sv
import tempfile
from sklearn.preprocessing import MinMaxScaler
from st_aggrid import AgGrid
from st_aggrid import GridOptionsBuilder, GridUpdateMode
from st_aggrid import GridOptionsBuilder
import sys
import re
import pickle
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from Data_prep_functions import plot_actual_vs_predicted
import sqlite3
from utilities import update_db
# Raise the interpreter recursion limit to 10**6 for the rest of this script.
sys.setrecursionlimit(10**6)
# Temporarily point sys.stdout at a newly opened file, close that file right
# away and restore the original stream. Nothing is written in between, so the
# only visible effect is that "temp_stdout.txt" is created/truncated.
original_stdout = sys.stdout
sys.stdout = open("temp_stdout.txt", "w")
sys.stdout.close()
sys.stdout = original_stdout
# Page-level Streamlit setup: wide layout, project stylesheet, shared header.
st.set_page_config(layout="wide")
load_local_css("styles.css")
set_header()
# TODO :
## 1. Add non panel model support
## 2. EDA Function
# Re-assign every session-state entry to itself, skipping the "logout",
# "login" and "config" keys and anything whose key starts with
# "FormSubmitter".
# NOTE(review): presumably this keeps values alive across page switches -
# confirm against the other pages of the app.
for k, v in st.session_state.items():
if k not in ["logout", "login", "config"] and not k.startswith(
"FormSubmitter"
):
st.session_state[k] = v
# Reuse the authenticator stored in the session, or build one on first use.
authenticator = st.session_state.get("authenticator")
if authenticator is None:
authenticator = load_authenticator()
# Render the login form in the main area. The tuple returned here is not read
# again in the visible code; the status is taken from session state instead.
name, authentication_status, username = authenticator.login("Login", "main")
auth_status = st.session_state.get("authentication_status")
# Everything down to the matching `elif auth_status == False` runs only for an
# authenticated user.
# NOTE(review): leading indentation is missing in this copy of the file, so
# the exact nesting of the `if` statements below must be confirmed against the
# original source.
if auth_status == True:
is_state_initiaized = st.session_state.get("initialized", False)
if not is_state_initiaized:
if "session_name" not in st.session_state:
st.session_state["session_name"] = None
# A project must have been loaded before this page can be used.
if "project_dct" not in st.session_state:
st.error("Please load a project from Home page")
st.stop()
# `conn` and `c` are not used again in the visible part of this file.
conn = sqlite3.connect(
r"DB/User.db", check_same_thread=False
) # connection with sql db
c = conn.cursor()
# The page needs the tuned model pickle written to the project folder.
if not os.path.exists(
os.path.join(st.session_state["project_path"], "tuned_model.pkl")
):
st.error("Please save a tuned model")
st.stop()
# When the model-tuning step stored a non-empty session snapshot, restore the
# keys this page needs that are not already present in session state.
if (
"session_state_saved"
in st.session_state["project_dct"]["model_tuning"].keys()
and st.session_state["project_dct"]["model_tuning"][
"session_state_saved"
]
!= []
):
for key in ["used_response_metrics", "media_data", "bin_dict"]:
if key not in st.session_state:
st.session_state[key] = st.session_state["project_dct"][
"model_tuning"
]["session_state_saved"][key]
# "bin_dict" is then taken from the model-build snapshot.
st.session_state["bin_dict"] = st.session_state["project_dct"][
"model_build"
]["session_state_saved"]["bin_dict"]
media_data = st.session_state["media_data"]
# Normalise every "Panel Level 1" name (lower-case, separators replaced or
# removed) and keep the first one as the panel column.
# NOTE(review): the trailing `[0]` raises IndexError when the list is empty,
# and `len(panel_col)` below is the length of that first name (a string), not
# the number of panel columns.
panel_col = [
col.lower()
.replace(".", "_")
.replace("@", "_")
.replace(" ", "_")
.replace("-", "")
.replace(":", "")
.replace("__", "_")
for col in st.session_state["bin_dict"]["Panel Level 1"]
][
0
] # set the panel column
is_panel = True if len(panel_col) > 0 else False
date_col = "date"
def plot_residual_predicted(actual, predicted, df_):
    """Scatter standardized residuals against the predicted values.

    Two columns are written to ``df_`` as a side effect: ``"Residuals"``
    (``actual - predicted``) and ``"StdResidual"`` (the residuals centred on
    their mean and divided by their standard deviation).

    Args:
        actual: Observed values.
        predicted: Model predictions; wrapped in ``pd.Series`` before the
            subtraction, so the difference is index-aligned.
        df_: DataFrame that receives the residual columns and is handed to
            Plotly. It is modified in place.

    Returns:
        A 600x400 Plotly figure with a dashed line at 0 and solid red lines
        at +2 and -2.
    """
    df_["Residuals"] = actual - pd.Series(predicted)
    # Read the column back so the statistics use exactly what was stored.
    stored = df_["Residuals"]
    df_["StdResidual"] = (stored - stored.mean()) / stored.std()

    fig = px.scatter(
        df_,
        x=predicted,
        y="StdResidual",
        opacity=0.5,
        color_discrete_sequence=["#11B6BD"],
    )

    # Reference lines: zero residual plus the +/-2 band.
    reference_lines = (
        (0, {"line_dash": "dash", "line_color": "darkorange"}),
        (2, {"line_color": "red"}),
        (-2, {"line_color": "red"}),
    )
    for level, line_style in reference_lines:
        fig.add_hline(y=level, **line_style)

    fig.update_xaxes(title="Predicted")
    fig.update_yaxes(title="Standardized Residuals (Actual - Predicted)")

    # Fixed size so this figure lines up with the other residual plots.
    fig.update_layout(
        title="Residuals over Predicted Values",
        autosize=False,
        width=600,
        height=400,
    )
    return fig
def residual_distribution(actual, predicted):
    """Draw a histogram (with KDE overlay) of ``actual - predicted``.

    Args:
        actual: Observed values.
        predicted: Model predictions; wrapped in ``pd.Series`` before the
            subtraction.

    Returns:
        The ``matplotlib.pyplot`` module itself, whose current figure holds
        the plot (not a ``Figure`` object).
    """
    residual_values = actual - pd.Series(predicted)

    sns.set(style="whitegrid")
    plt.figure(figsize=(6, 4))
    # NOTE(review): no `stat` argument is passed to histplot, so confirm that
    # the "Probability Density" axis label matches what is actually drawn.
    sns.histplot(residual_values, kde=True, color="#11B6BD")

    plt.title(" Distribution of Residuals")
    plt.xlabel("Residuals")
    plt.ylabel("Probability Density")
    return plt
def qqplot(actual, predicted):
    """Build a QQ plot of the standardized residuals.

    Args:
        actual: Observed values.
        predicted: Model predictions; wrapped in ``pd.Series`` before the
            subtraction.

    Returns:
        A 600x400 Plotly figure with the sample quantiles plotted against the
        theoretical quantiles, plus a red reference line from (-2, -2) to
        (2, 2).
    """
    residuals = actual - pd.Series(predicted)
    standardized = (residuals - residuals.mean()) / residuals.std()

    # One ProbPlot instance supplies both quantile arrays.
    prob_plot = sm.ProbPlot(standardized)
    quantile_points = go.Scatter(
        x=prob_plot.theoretical_quantiles,
        y=prob_plot.sample_quantiles,
        mode="markers",
        marker=dict(size=5, color="#11B6BD"),
        name="QQ Plot",
    )

    # 45-degree reference line; its endpoints are fixed at +/-2 rather than
    # derived from the data range.
    reference_line = go.Scatter(
        x=[-2, 2],
        y=[-2, 2],
        mode="lines",
        line=dict(color="red"),
        name=" ",
    )

    fig = go.Figure()
    fig.add_trace(quantile_points)
    fig.add_trace(reference_line)
    fig.update_layout(
        title="QQ Plot of Residuals",
        title_x=0.5,
        autosize=False,
        width=600,
        height=400,
        xaxis_title="Theoretical Quantiles",
        yaxis_title="Sample Quantiles",
    )
    return fig
def get_random_effects(media_data, panel_col, mdf):
    """Collect the first random-effect value of every panel into a table.

    Args:
        media_data: DataFrame containing the ``panel_col`` column; its unique
            values, in order of first appearance, define the rows.
        panel_col: Name of the panel column.
        mdf: Fitted model exposing ``random_effects``, a mapping from panel
            value to an object whose ``.values[0]`` is read as that panel's
            random effect.

    Returns:
        DataFrame with the columns ``[panel_col, "random_effect"]`` and one
        row per unique panel value. The table is built in a single step so
        ``random_effect`` gets a numeric dtype instead of being filled cell
        by cell into an empty frame.

    Raises:
        KeyError: if a panel value is missing from ``mdf.random_effects``.
    """
    records = [
        (market, mdf.random_effects[market].values[0])
        for market in media_data[panel_col].unique()
    ]
    return pd.DataFrame(records, columns=[panel_col, "random_effect"])
def mdf_predict(X_df, mdf, random_eff_df):
    """Add a ``"pred"`` column combining ``mdf.predict`` and the random effect.

    Args:
        X_df: Feature DataFrame containing the module-level ``panel_col``
            column. It is not modified.
        mdf: Fitted model; ``mdf.predict`` is called on the merged frame.
        random_eff_df: DataFrame with the ``panel_col`` and
            ``"random_effect"`` columns.

    Returns:
        A new DataFrame: ``X_df`` left-joined with the random effects on
        ``panel_col``, with ``"pred"`` added and the helper columns removed.
        Rows without a matching panel get a missing ``"pred"``.
    """
    panel_effects = random_eff_df[[panel_col, "random_effect"]]
    # The merge already returns a new frame, so X_df itself stays untouched.
    scored = X_df.merge(panel_effects, on=panel_col, how="left")
    scored["pred_fixed_effect"] = mdf.predict(scored)
    scored["pred"] = scored["pred_fixed_effect"] + scored["random_effect"]
    return scored.drop(columns=["pred_fixed_effect", "random_effect"])
def metrics_df_panel(model_dict):
    """Build one row of fit metrics per tuned panel model.

    Args:
        model_dict: Mapping whose keys contain the target name after the
            first ``"__"`` and whose values are dicts with the entries
            ``"Model_object"``, ``"X_train_tuned"``, ``"X_test_tuned"`` and
            ``"feature_set"``.

    Returns:
        DataFrame with the columns ``Model``, ``R2``, ``ADJR2``,
        ``Train Mape``, ``Test Mape``, ``Summary`` and ``Model_object``.
        The four numeric metrics are rounded to two decimals.

    Notes:
        Reads the module-level ``media_data`` and ``panel_col``. The numeric
        metrics are rounded before they are stored, because rounding the
        finished table leaves object-dtype columns untouched.
    """
    column_order = [
        "Model",
        "R2",
        "ADJR2",
        "Train Mape",
        "Test Mape",
        "Summary",
        "Model_object",
    ]
    rows = []
    for key, entry in model_dict.items():
        target = key.split("__")[1]
        fitted = entry["Model_object"]
        train_df = entry["X_train_tuned"]
        test_df = entry["X_test_tuned"]

        random_df = get_random_effects(media_data, panel_col, fitted)
        y = train_df[target]
        pred = mdf_predict(train_df, fitted, random_df)["pred"]
        ytest = test_df[target]
        predtest = mdf_predict(test_df, fitted, random_df)["pred"]

        r2 = r2_score(y, pred)
        n_obs = len(y)
        n_features = len(entry["feature_set"])
        # Adjusted R2 is derived from the unrounded R2.
        adj_r2 = 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_features - 1)

        rows.append(
            {
                "Model": target,
                "R2": round(float(r2), 2),
                "ADJR2": round(float(adj_r2), 2),
                "Train Mape": round(
                    float(mean_absolute_percentage_error(y, pred)), 2
                ),
                "Test Mape": round(
                    float(mean_absolute_percentage_error(ytest, predtest)), 2
                ),
                "Summary": fitted.summary(),
                "Model_object": fitted,
            }
        )
    return pd.DataFrame(rows, columns=column_order)
# Load the transformed modelling dataset saved in the project folder.
# NOTE(review): pickle.load executes whatever the file encodes - confirm these
# project files only ever come from trusted sources.
with open(
os.path.join(
st.session_state["project_path"], "final_df_transformed.pkl"
),
"rb",
) as f:
data = pickle.load(f)
# `transformed_data` is only referenced by the commented-out EDA code further
# down in the visible part of this file.
transformed_data = data["final_df_transformed"]
# Refresh "bin_dict" from the data-import pickle; this replaces the value set
# from project_dct earlier in the script.
with open(
os.path.join(st.session_state["project_path"], "data_import.pkl"), "rb"
) as f:
data = pickle.load(f)
st.session_state["bin_dict"] = data["bin_dict"]
# Load the tuned models; keys contain the target name after the first "__".
with open(
os.path.join(st.session_state["project_path"], "tuned_model.pkl"), "rb"
) as file:
tuned_model_dict = pickle.load(file)
# Map each target name to the feature set stored with its tuned model.
feature_set_dct = {
key.split("__")[1]: key_dict["feature_set"]
for key, key_dict in tuned_model_dict.items()
}
# TODO: the part above should be modified so that the feature sets are fetched from the saved model.
def contributions(X, model, target):
    """Return each feature's percentage share of the weighted feature total.

    Column ``j`` of ``X`` is multiplied by ``model.params.values[j]`` (the
    pairing is positional), summed over the rows, and expressed as a
    percentage of the sum over all columns.

    Args:
        X: Feature DataFrame. It is not modified.
        model: Object exposing ``params`` with a ``.values`` array holding at
            least one coefficient per column of ``X``.
        target: Name for the share column, either a string or a one-element
            list of names.

    Returns:
        DataFrame with a ``"Channel"`` column (the feature name cut at the
        first ``"_imp"`` or ``"_cli"``) and the share column, rounded to two
        decimals and sorted in descending order.

    Raises:
        IndexError: if the model has fewer parameters than ``X`` has columns.
    """
    n_features = X.shape[1]
    weights = np.asarray(model.params.values)[:n_features]
    if len(weights) < n_features:
        raise IndexError("model has fewer parameters than X has columns")

    weighted_totals = X.mul(weights, axis="columns").sum()
    share = np.round(
        (weighted_totals / weighted_totals.sum() * 100).sort_values(
            ascending=False
        ),
        2,
    )

    # Accept a plain string as well as a list of column names.
    share_columns = [target] if isinstance(target, str) else target
    result = (
        pd.DataFrame(share, columns=share_columns)
        .reset_index()
        .rename(columns={"index": "Channel"})
    )
    result["Channel"] = [
        re.split(r"_imp|_cli", col)[0] for col in result["Channel"]
    ]
    return result
# Make sure the "contribution_df" session key exists before it is filled.
if "contribution_df" not in st.session_state:
st.session_state["contribution_df"] = None
def contributions_panel(model_dict):
    """Compute percentage contributions per channel for every tuned model.

    For each model the fixed-effect coefficients (all but the first, which is
    taken to be the intercept) are multiplied with the training features.
    The panel effect (random effect plus fixed intercept) and the
    trend/seasonality terms are pooled into a single ``"base"`` entry, and
    every entry is expressed as a percentage of the model's total.

    Args:
        model_dict: Mapping whose keys contain the target name after the
            first ``"__"`` and whose values are dicts with the entries
            ``"Model_object"`` and ``"X_train_tuned"``.

    Returns:
        DataFrame with a ``"Channel"`` column and one percentage column per
        target, outer-joined on channel, so a channel missing from a model
        is NaN in that model's column.

    Notes:
        Reads ``st.session_state["media_data"]`` and the module-level
        ``panel_col``.
    """
    media_data = st.session_state["media_data"]
    # Contribution columns that count towards the base rather than a channel.
    base_feature_cols = (
        "Week_number_contr",
        "Trend_contr",
        "sine_wave_contr",
        "cosine_wave_contr",
    )
    contribution_df = pd.DataFrame(columns=["Channel"])

    for key, entry in model_dict.items():
        model = entry["Model_object"]
        target = key.split("__")[1]
        X_train = entry["X_train_tuned"]

        random_eff_df = get_random_effects(media_data, panel_col, model)
        random_eff_df["fixed_effect"] = model.fe_params["Intercept"]
        random_eff_df["panel_effect"] = (
            random_eff_df["random_effect"] + random_eff_df["fixed_effect"]
        )

        x_train_contribution = mdf_predict(X_train, model, random_eff_df)
        x_train_contribution = pd.merge(
            x_train_contribution,
            random_eff_df[[panel_col, "panel_effect"]],
            on=panel_col,
            how="left",
        )

        # Skip the first fixed-effect parameter (the intercept); it is
        # already part of panel_effect.
        for feature, coef in model.fe_params.iloc[1:].items():
            x_train_contribution[str(feature) + "_contr"] = (
                coef * x_train_contribution[feature]
            )

        # Anchor on the "_contr" suffix so a raw feature whose name merely
        # contains "contr" is never mistaken for a contribution column.
        contr_cols = x_train_contribution.filter(regex=r"_contr$").columns
        base_cols = ["panel_effect"] + [
            c for c in contr_cols if c in base_feature_cols
        ]
        x_train_contribution["base_contr"] = x_train_contribution[
            base_cols
        ].sum(axis=1)
        x_train_contribution.drop(columns=base_cols, inplace=True)

        contri_df = pd.DataFrame(
            x_train_contribution.filter(regex=r"_contr$").sum(axis=0)
        )
        contri_df.reset_index(inplace=True)
        contri_df.columns = ["Channel", target]
        # Keep only the part of the name before "_impres" / "_clicks".
        contri_df["Channel"] = (
            contri_df["Channel"].str.split("(_impres|_clicks)").str[0]
        )
        contri_df[target] = 100 * contri_df[target] / contri_df[target].sum()
        # Assign the result back instead of replacing in place on a column
        # selection, which is not guaranteed to update the frame.
        contri_df["Channel"] = contri_df["Channel"].replace(
            "base_contr", "base"
        )

        contribution_df = pd.merge(
            contribution_df, contri_df, on="Channel", how="outer"
        )
    return contribution_df
# Fit metrics for every tuned model; shown in the selection grid further down.
metrics_table = metrics_df_panel(tuned_model_dict)
eda_columns = st.columns(2)
# NOTE(review): the value of `eda` is only read by commented-out code in the
# visible part of this file, so pressing the button currently has no effect
# here.
with eda_columns[1]:
eda = st.button(
"Generate EDA Report",
help="Click to generate a bivariate report for the selected response metric from the table below.",
)
# st.markdown('Model Metrics')
st.title("Contribution Overview")
# Normalise the response-metric names (lower-case, spaces to underscores,
# "-" and ":" removed, double underscores collapsed once).
options = st.session_state["used_response_metrics"]
options = [
opt.lower()
.replace(" ", "_")
.replace("-", "")
.replace(":", "")
.replace("__", "_")
for opt in options
]
# Start from the selection saved with the project, falling back to the last
# available response metric when nothing has been saved yet.
saved_options = st.session_state["project_dct"]["saved_model_results"].get(
    "selected_options"
)
if saved_options is None:
    saved_options = [options[-1]]

# Saved entries that are no longer available are written to the page and left
# out of the defaults. A new list is built instead of removing items from the
# list being iterated, which skipped entries and altered the saved selection.
default_options = []
for saved_option in saved_options:
    if saved_option in options:
        default_options.append(saved_option)
    else:
        st.write(saved_option)

contribution_selections = st.multiselect(
    "Select the Response Metrics to compare contributions",
    options,
    default=default_options,
)
# Grouped bar chart: one bar series per selected response metric.
trace_data = []
st.session_state["contribution_df"] = contributions_panel(tuned_model_dict)
contribution_table = st.session_state["contribution_df"]
for selection in contribution_selections:
    selection_values = contribution_table[selection]
    # A channel that a model does not use is NaN after the outer merge and
    # cannot be cast to int, so such bars get an empty label.
    bar_labels = [
        "" if pd.isna(value) else f"{int(value)}%"
        for value in np.round(selection_values, 0)
    ]
    trace = go.Bar(
        x=contribution_table["Channel"],
        y=selection_values,
        name=selection,
        text=bar_labels,
        textposition="outside",
    )
    trace_data.append(trace)
layout = go.Layout(
    title="Metrics Contribution by Channel",
    xaxis=dict(title="Channel Name"),
    yaxis=dict(title="Metrics Contribution"),
    barmode="group",
)
fig = go.Figure(data=trace_data, layout=layout)
st.plotly_chart(fig, use_container_width=True)
############################################ Waterfall Chart ############################################
# import plotly.graph_objects as go
# # Initialize a Plotly figure
# fig = go.Figure()
# for selection in contribution_selections:
# # Ensure y_values are numeric
# y_values = st.session_state["contribution_df"][selection].values.astype(float)
# # Generating text labels for each bar, ensuring operations are compatible with string formats
# text_values = [f"{val}%" for val in np.round(y_values, 0).astype(int)]
# fig.add_trace(
# go.Waterfall(
# name=selection,
# orientation="v",
# measure=["relative"]
# * len(y_values), # Adjust if you have absolute values at certain points
# x=st.session_state["contribution_df"]["Channel"].tolist(),
# text=text_values,
# textposition="outside",
# y=y_values,
# increasing={"marker": {"color": "green"}},
# decreasing={"marker": {"color": "red"}},
# totals={"marker": {"color": "blue"}},
# )
# )
# fig.update_layout(
# title="Metrics Contribution by Channel",
# xaxis={"title": "Channel Name"},
# yaxis={"title": "Metrics Contribution"},
# height=600,
# )
# # Displaying the waterfall chart in Streamlit
# st.plotly_chart(fig, use_container_width=True)
# Waterfall chart: one trace per selected response metric, with the base
# contribution shown first as "Base Sales". `go` is already imported at the
# top of the file, so it is not imported again here.
fig = go.Figure()
contribution_table = st.session_state["contribution_df"]
channel_names = contribution_table["Channel"].tolist()
for selection in contribution_selections:
    # Named so that it does not overwrite the `contributions` function.
    selection_values = (
        contribution_table[selection].values.astype(float).tolist()
    )
    display_name, display_contribution, base_contribution = [], [], 0
    for channel_name, contribution in zip(channel_names, selection_values):
        if channel_name in ("const", "base"):
            base_contribution = contribution
        else:
            display_name.append(channel_name)
            display_contribution.append(contribution)
    display_name = ["Base Sales"] + display_name
    display_contribution = [base_contribution] + display_contribution
    # NaN (a channel missing from this model) cannot be cast to int, so those
    # bars get an empty label instead of a meaningless number.
    text_values = [
        "" if np.isnan(val) else f"{int(val)}%"
        for val in np.round(display_contribution, 0)
    ]
    fig.add_trace(
        go.Waterfall(
            orientation="v",
            # Every bar is a relative step; there are no absolute totals.
            measure=["relative"] * len(display_contribution),
            x=display_name,
            text=text_values,
            textposition="outside",
            y=display_contribution,
            increasing={"marker": {"color": "green"}},
            decreasing={"marker": {"color": "red"}},
            totals={"marker": {"color": "blue"}},
        )
    )
fig.update_layout(
    title="Metrics Contribution by Channel",
    xaxis={"title": "Channel Name"},
    yaxis={"title": "Metrics Contribution"},
    height=600,
)
# Displaying the waterfall chart in Streamlit
st.plotly_chart(fig, use_container_width=True)
############################################ Waterfall Chart ############################################
st.title("Analysis of Models Result")
saved_model_results = st.session_state["project_dct"]["saved_model_results"]
# Row pre-selected in the grid. The value is stored as a list, but a bare row
# index saved by an earlier run is still accepted and wrapped here.
previous_selection = saved_model_results.get("model_grid_sel", [1])
if previous_selection is not None and not isinstance(
    previous_selection, (list, tuple)
):
    previous_selection = [previous_selection]

st.write(np.round(metrics_table, 2))
# The grid shows every column except the last two (Summary, Model_object).
gd_table = metrics_table.iloc[:, :-2]
gd = GridOptionsBuilder.from_dataframe(gd_table)
gd.configure_selection(
    use_checkbox=True,
    selection_mode="single",
    pre_select_all_rows=False,
    pre_selected_rows=previous_selection,
)
gridoptions = gd.build()
table = AgGrid(
    gd_table,
    gridOptions=gridoptions,
    fit_columns_on_grid_load=True,
    height=200,
)

# Nothing selected: ask for a selection and end this script run.
if len(table.selected_rows) == 0:
    st.warning(
        "Click on the checkbox to view comprehensive results of the selected model."
    )
    st.stop()

selected_row = table.selected_rows[0]
# Remember the selected row as a one-element list, the same shape that is
# passed to `pre_selected_rows` on the next run.
saved_model_results["model_grid_sel"] = [
    selected_row["_selectedRowNodeInfo"]["nodeRowIndex"]
]
target_column = selected_row["Model"]
feature_set = feature_set_dct[target_column]
# with eda_columns[1]:
# if eda:
# def generate_report_with_target(channel_data, target_feature):
# report = sv.analyze(
# [channel_data, "Dataset"], target_feat=target_feature, verbose=False
# )
# temp_dir = tempfile.mkdtemp()
# report_path = os.path.join(temp_dir, "report.html")
# report.show_html(
# filepath=report_path, open_browser=False
# ) # Generate the report as an HTML file
# return report_path
#
# report_data = transformed_data[feature_set]
# report_data[target_column] = transformed_data[target_column]
# report_file = generate_report_with_target(report_data, target_column)
#
# if os.path.exists(report_file):
# with open(report_file, "rb") as f:
# st.download_button(
# label="Download EDA Report",
# data=f.read(),
# file_name="report.html",
# mime="text/html",
# )
# else:
# st.warning("Report generation failed. Unable to find the report file.")
# Look up the fitted model object and the target name of the selected row.
model = metrics_table[metrics_table["Model"] == target_column][
"Model_object"
].iloc[0]
target = metrics_table[metrics_table["Model"] == target_column][
"Model"
].iloc[0]
st.header("Model Summary")
st.write(model.summary())
# First tuned-model entry whose key names the selected target after "__".
sel_dict = tuned_model_dict[
[k for k in tuned_model_dict.keys() if k.split("__")[1] == target][0]
]
# Train/test predictions that include the per-panel random effect.
X_train = sel_dict["X_train_tuned"]
y_train = X_train[target]
random_effects = get_random_effects(media_data, panel_col, model)
pred = mdf_predict(X_train, model, random_effects)["pred"]
X_test = sel_dict["X_test_tuned"]
y_test = X_test[target]
predtest = mdf_predict(X_test, model, random_effects)["pred"]
# Each call returns a metrics table, an unused middle value and a figure.
metrics_table_train, _, fig_train = plot_actual_vs_predicted(
X_train[date_col],
y_train,
pred,
model,
target_column=target_column,
flag=None,
repeat_all_years=False,
is_panel=is_panel,
)
metrics_table_test, _, fig_test = plot_actual_vs_predicted(
X_test[date_col],
y_test,
predtest,
model,
target_column=target_column,
flag=None,
repeat_all_years=False,
is_panel=is_panel,
)
# Turn each metrics table into a single row labelled "Train" / "test".
metrics_table_train = metrics_table_train.set_index("Metric").transpose()
metrics_table_train.index = ["Train"]
metrics_table_test = metrics_table_test.set_index("Metric").transpose()
metrics_table_test.index = ["test"]
# From here on `metrics_table` holds the two-row overview instead of the
# per-model table built earlier.
metrics_table = np.round(
pd.concat([metrics_table_train, metrics_table_test]), 2
)
st.markdown("Result Overview")
st.dataframe(np.round(metrics_table, 2), use_container_width=True)
st.subheader("Actual vs Predicted Plot Train")
st.plotly_chart(fig_train, use_container_width=True)
st.subheader("Actual vs Predicted Plot Test")
st.plotly_chart(fig_test, use_container_width=True)
st.markdown("## Residual Analysis")
columns = st.columns(2)
# Copy because plot_residual_predicted adds columns to the frame it is given.
Xtrain1 = X_train.copy()
# NOTE(review): the residual plots below use model.predict(...) directly,
# while the metrics above use `pred`, which also adds the random effect -
# confirm that this difference is intended.
with columns[0]:
fig = plot_residual_predicted(y_train, model.predict(Xtrain1), Xtrain1)
st.plotly_chart(fig)
with columns[1]:
st.empty()
fig = qqplot(y_train, model.predict(X_train))
st.plotly_chart(fig)
with columns[0]:
fig = residual_distribution(y_train, model.predict(X_train))
st.pyplot(fig)
# NOTE(review): the page name passed here ("6_AI_Model_Result.py") has no
# trailing "s", unlike this file's name - confirm what update_db expects.
update_db("6_AI_Model_Result.py")
# Failed login: show the error and offer the forgot-password form.
elif auth_status == False:
st.error("Username/Password is incorrect")
try:
username_forgot_pw, email_forgot_password, random_password = (
authenticator.forgot_password("Forgot password")
)
if username_forgot_pw:
# NOTE(review): nothing in the visible code sends `random_password` to the
# user, although the success message says it was sent - confirm.
st.success("New password sent securely")
# Random password to be transferred to the user securely
elif username_forgot_pw == False:
st.error("Username not found")
# Any exception is shown on the page instead of being raised.
except Exception as e:
st.error(e)