Mastercard / pages /4_Saved_Model_Results.py
BlendMMM's picture
Upload 10 files
bd80083 verified
raw
history blame
21 kB
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import streamlit as st
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_percentage_error
import sys
import os
from utilities import set_header, load_local_css, load_authenticator
import seaborn as sns
import matplotlib.pyplot as plt
import sweetviz as sv
import tempfile
from sklearn.preprocessing import MinMaxScaler
from st_aggrid import AgGrid
from st_aggrid import GridOptionsBuilder, GridUpdateMode
from st_aggrid import GridOptionsBuilder
import sys
import re
# Raise the interpreter's recursion ceiling before any page logic runs.
sys.setrecursionlimit(10**6)

# Swap stdout for a scratch file and restore it straight away.  Nothing is
# printed in between, so the visible effect is creating/truncating
# "temp_stdout.txt" in the working directory.
original_stdout = sys.stdout
sys.stdout = open("temp_stdout.txt", "w")
sys.stdout.close()
sys.stdout = original_stdout

# Wide layout first, then the project's stylesheet and header helpers.
st.set_page_config(layout="wide")
load_local_css("styles.css")
set_header()

# Write every ordinary session value back onto itself, skipping the
# authentication keys and form-submitter entries.
# NOTE(review): presumably this keeps values alive across page switches - confirm.
for k, v in st.session_state.items():
    if k in ["logout", "login", "config"] or k.startswith("FormSubmitter"):
        continue
    st.session_state[k] = v

# Reuse the authenticator stored in the session, or build one if none is there.
authenticator = st.session_state.get("authenticator")
if authenticator is None:
    authenticator = load_authenticator()

# Render the login widget, then read the resulting status back from the session.
name, authentication_status, username = authenticator.login("Login", "main")
auth_status = st.session_state.get("authentication_status")
# Authenticated sessions only: the page content that follows this check runs
# when the stored authentication status is true (the matching `elif` for a
# failed login appears at the end of the file).
if auth_status == True:
is_state_initiaized = st.session_state.get("initialized", False)
# NOTE(review): placeholder branch - `a` is not read anywhere else in the
# visible file.
if not is_state_initiaized:
a = 1
def plot_residual_predicted(actual, predicted, df_):
    """Scatter standardized residuals against the predicted values.

    Adds two columns to ``df_`` in place - ``Residuals`` (actual minus
    predicted) and ``StdResidual`` (residuals centred and divided by their
    standard deviation) - and returns a 600x400 Plotly scatter with a dashed
    line at 0 and solid lines at +2 and -2.

    Args:
        actual: Observed values.
        predicted: Model predictions for the same rows.
        df_: DataFrame that receives the residual columns.  It is mutated, so
            pass a copy when the caller's frame must stay untouched.

    Returns:
        The Plotly figure.
    """
    predicted_series = pd.Series(predicted)
    if (
        not isinstance(predicted, pd.Series)
        and isinstance(actual, pd.Series)
        and len(predicted_series) == len(actual)
    ):
        # Array-like predictions carry no labels; without this they would get
        # a fresh 0..n-1 index and mis-align (all NaN) against an `actual`
        # that is indexed differently.  Pair them by position instead.
        predicted_series.index = actual.index

    df_["Residuals"] = actual - predicted_series
    residuals = df_["Residuals"]
    df_["StdResidual"] = (residuals - residuals.mean()) / residuals.std()

    # Create a Plotly scatter plot
    fig = px.scatter(
        df_,
        x=predicted,
        y="StdResidual",
        opacity=0.5,
        color_discrete_sequence=["#11B6BD"],
    )

    # Reference lines: zero residual and the +/-2 standard-deviation band.
    fig.add_hline(y=0, line_dash="dash", line_color="darkorange")
    fig.add_hline(y=2, line_color="red")
    fig.add_hline(y=-2, line_color="red")

    fig.update_xaxes(title="Predicted")
    fig.update_yaxes(title="Standardized Residuals (Actual - Predicted)")

    # Fixed size so this figure lines up with the QQ plot (also 600x400).
    fig.update_layout(
        title="Residuals over Predicted Values",
        autosize=False,
        width=600,
        height=400,
    )
    return fig
def residual_distribution(actual, predicted):
    """Draw a histogram with KDE overlay of the residuals (actual - predicted).

    The plot is drawn on a new 6x4 inch pyplot figure.

    Args:
        actual: Observed values.
        predicted: Model predictions for the same rows.

    Returns:
        The ``matplotlib.pyplot`` module, whose current figure holds the plot.
    """
    predicted_series = pd.Series(predicted)
    if (
        not isinstance(predicted, pd.Series)
        and isinstance(actual, pd.Series)
        and len(predicted_series) == len(actual)
    ):
        # Array-like predictions carry no labels; pair them with `actual` by
        # position rather than by a fresh 0..n-1 index.
        predicted_series.index = actual.index
    residuals = actual - predicted_series

    # Create a Seaborn distribution plot
    sns.set(style="whitegrid")
    plt.figure(figsize=(6, 4))
    # histplot counts by default; stat="density" makes the bars match the
    # "Probability Density" axis label below.
    sns.histplot(residuals, kde=True, stat="density", color="#11B6BD")
    plt.title(" Distribution of Residuals")
    plt.xlabel("Residuals")
    plt.ylabel("Probability Density")
    return plt
def qqplot(actual, predicted):
    """Build a QQ plot of the standardized residuals.

    Residuals (actual - predicted) are centred and scaled by their standard
    deviation, then their sample quantiles are plotted against the theoretical
    quantiles from ``sm.ProbPlot``.  A red reference line runs from (-2, -2)
    to (2, 2).

    Args:
        actual: Observed values.
        predicted: Model predictions for the same rows.

    Returns:
        A 600x400 Plotly figure.
    """
    predicted_series = pd.Series(predicted)
    if (
        not isinstance(predicted, pd.Series)
        and isinstance(actual, pd.Series)
        and len(predicted_series) == len(actual)
    ):
        # Array-like predictions carry no labels; pair them with `actual` by
        # position rather than by a fresh 0..n-1 index.
        predicted_series.index = actual.index

    residuals = pd.Series(actual - predicted_series)
    standardized = (residuals - residuals.mean()) / residuals.std()

    # Build the probability plot once and read both quantile arrays from it.
    prob_plot = sm.ProbPlot(standardized)

    # Create a QQ plot using Plotly with custom colors
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=prob_plot.theoretical_quantiles,
            y=prob_plot.sample_quantiles,
            mode="markers",
            marker=dict(size=5, color="#11B6BD"),
            name="QQ Plot",
        )
    )

    # 45-degree reference line; the fixed [-2, 2] span may need widening for
    # data whose quantiles fall outside that range.
    fig.add_trace(
        go.Scatter(
            x=[-2, 2],
            y=[-2, 2],
            mode="lines",
            line=dict(color="red"),
            name=" ",
        )
    )

    # Customize the layout
    fig.update_layout(
        title="QQ Plot of Residuals",
        title_x=0.5,
        autosize=False,
        width=600,
        height=400,
        xaxis_title="Theoretical Quantiles",
        yaxis_title="Sample Quantiles",
    )
    return fig
def plot_actual_vs_predicted(date, y, predicted_values, model):
    """Plot actual vs. predicted values and compute fit metrics.

    Args:
        date: X-axis values for both lines.
        y: Observed values.
        predicted_values: Model predictions for the same rows.
        model: Fitted model; only ``model.df_model`` (number of predictors)
            is read.

    Returns:
        ``(metrics_table, fig)`` where ``metrics_table`` is a two-column
        DataFrame (``Metric``, ``Value``) holding MAPE (in percent),
        R-squared and adjusted R-squared, and ``fig`` is the Plotly figure
        titled with MAPE and adjusted R-squared.  R-squared is NaN when ``y``
        has no variance; adjusted R-squared is NaN when there are not more
        samples than predictors + 1.
    """
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(x=date, y=y, mode="lines", name="Actual", line=dict(color="blue"))
    )
    fig.add_trace(
        go.Scatter(
            x=date,
            y=predicted_values,
            mode="lines",
            name="Predicted",
            line=dict(color="orange"),
        )
    )

    # MAPE, expressed as a percentage
    mape = mean_absolute_percentage_error(y, predicted_values) * 100

    # R-squared = 1 - RSS / TSS; undefined when y is constant (TSS == 0).
    rss = np.sum((y - predicted_values) ** 2)
    tss = np.sum((y - np.mean(y)) ** 2)
    r_squared = 1 - (rss / tss) if tss != 0 else float("nan")

    num_predictors = model.df_model
    num_samples = len(y)

    # Adjusted R-squared needs positive residual degrees of freedom; a zero
    # denominator would otherwise raise ZeroDivisionError on small samples.
    residual_dof = num_samples - num_predictors - 1
    if residual_dof > 0:
        adj_r_squared = 1 - ((1 - r_squared) * ((num_samples - 1) / residual_dof))
    else:
        adj_r_squared = float("nan")

    metrics_table = pd.DataFrame(
        {
            "Metric": ["MAPE", "R-squared", "AdjR-squared"],
            "Value": [mape, r_squared, adj_r_squared],
        }
    )

    fig.update_layout(
        xaxis=dict(title="Date"),
        yaxis=dict(title="Value"),
        title=f"MAPE : {mape:.2f}%, AdjR2: {adj_r_squared:.2f}",
        xaxis_tickangle=-30,
    )
    return metrics_table, fig
def contributions(X, model):
    """Return each column's percentage share of the total weighted sum.

    Every column of ``X`` is multiplied by the coefficient at the same
    position in ``model.params.values``, summed over rows, and expressed as a
    percentage of the grand total.  ``X`` itself is not modified.

    NOTE: a three-argument ``contributions(X, model, target)`` is defined
    later in this file and replaces this name once that definition runs.

    Args:
        X: DataFrame of regressors, columns in the same order as the model's
            parameters.
        model: Object exposing ``params.values`` (one coefficient per column).

    Returns:
        A Series indexed by column name, rounded to 2 decimals and sorted in
        descending order.
    """
    # One vectorized multiply instead of a Python loop over columns.
    weights = np.asarray(model.params.values)[: X.shape[1]]
    weighted_totals = X.mul(weights, axis="columns").sum()
    shares = weighted_totals / weighted_totals.sum() * 100
    return np.round(shares.sort_values(ascending=False), 2)
# Transformed modelling data, read from the working directory.  The code
# below selects the feature columns and target columns listed in
# `feature_set_dct` from it, plus a "date" column.
transformed_data = pd.read_csv("transformed_data.csv")
# Maps each response metric (a target column of `transformed_data`) to the
# list of feature columns used to fit that metric's model.
# Hard-coded for now; the feature sets should eventually come from the saved model.
feature_set_dct = {
"app_installs_-_appsflyer": [
"paid_search_clicks",
"fb:_level_achieved_-_tier_1_impressions_lag2",
"fb:_level_achieved_-_tier_2_clicks_lag2",
"paid_social_others_impressions_adst.1",
"ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag2",
"digital_tactic_others_clicks",
"kwai_clicks_adst.3",
"programmaticclicks",
"indicacao_clicks_adst.1",
"infleux_clicks_adst.4",
"influencer_clicks",
],
"account_requests_-_appsflyer": [
"paid_search_impressions",
"fb:_level_achieved_-_tier_1_clicks_adst.1",
"fb:_level_achieved_-_tier_2_clicks_adst.1",
"paid_social_others_clicks_lag2",
"ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag5_adst.1",
"digital_tactic_others_clicks_adst.1",
"kwai_clicks_adst.2",
"programmaticimpressions_lag4_adst.1",
"indicacao_clicks",
"infleux_clicks_adst.2",
"influencer_clicks",
],
"total_approved_accounts_-_appsflyer": [
"paid_search_clicks",
"fb:_level_achieved_-_tier_1_impressions_lag2_adst.1",
"fb:_level_achieved_-_tier_2_impressions_lag2",
"paid_social_others_clicks_lag2_adst.2",
"ga_app:_will_and_cid_pequena_baixo_risco_impressions_lag4",
"digital_tactic_others_clicks",
"kwai_impressions_adst.2",
"programmaticclicks_adst.5",
"indicacao_clicks_adst.1",
"infleux_clicks_adst.3",
"influencer_clicks",
],
"total_approved_accounts_-_revenue": [
"paid_search_impressions_adst.5",
"kwai_impressions_lag2_adst.3",
"indicacao_clicks_adst.3",
"infleux_clicks_adst.3",
"programmaticclicks_adst.4",
"influencer_clicks_adst.3",
"fb:_level_achieved_-_tier_1_impressions_adst.2",
"fb:_level_achieved_-_tier_2_impressions_lag3_adst.5",
"paid_social_others_impressions_adst.3",
"ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag3_adst.5",
"digital_tactic_others_clicks_adst.2",
],
}
# TODO: replace the hard-coded dictionary above by fetching each feature set
# from the saved model.
def contributions(X, model, target):
    """Build a per-channel contribution table for one model.

    Every column of ``X`` is multiplied by the coefficient at the same
    position in ``model.params.values``, summed over rows, and expressed as a
    percentage of the grand total (rounded to 2 decimals, sorted descending).
    Column names are then reduced to a channel name by cutting them at the
    metric part (``_imp...``, ``_cli...``, ``impressions`` or ``clicks``), so
    that e.g. ``programmaticclicks`` and ``programmaticimpressions_lag4``
    both become ``programmatic``.  ``X`` itself is not modified.

    Args:
        X: DataFrame of regressors, columns in the same order as the model's
            parameters.
        model: Object exposing ``params.values`` (one coefficient per column).
        target: One-element list holding the name for the value column.

    Returns:
        DataFrame with a ``Channel`` column and one column named after
        ``target``.
    """
    # One vectorized multiply instead of a Python loop over columns.
    weights = np.asarray(model.params.values)[: X.shape[1]]
    weighted_totals = X.mul(weights, axis="columns").sum()
    shares = np.round(
        (weighted_totals / weighted_totals.sum() * 100).sort_values(ascending=False),
        2,
    )

    contribution_table = (
        pd.DataFrame(shares, columns=target)
        .reset_index()
        .rename(columns={"index": "Channel"})
    )

    # "_imp" / "_cli" alone miss names with no underscore before the metric
    # ("programmaticclicks"), which left the same channel labelled
    # differently from one model to the next.
    metric_part = re.compile(r"_imp|_cli|impressions|clicks")
    contribution_table["Channel"] = [
        metric_part.split(col)[0] for col in contribution_table["Channel"]
    ]
    return contribution_table
def model_fit(features_set, target, train_size=150):
    """Fit an OLS model for ``target`` and summarise it.

    The feature columns are min-max scaled, a constant is added, and the
    first ``train_size`` rows are used for training; the remaining rows form
    the test split.

    NOTE(review): the scaler is fit on all rows, including the test rows.
    The page's later analysis section rescales the data the same way before
    calling ``predict``, so the two must be changed together.

    Args:
        features_set: List of feature column names in ``transformed_data``.
        target: Name of the response column in ``transformed_data``.
        train_size: Number of leading rows used for training.

    Returns:
        ``(metrics, train_contributions)`` - a one-row DataFrame with the
        columns ``Model``, ``R2``, ``ADJr2``, ``Train Mape``, ``Test Mape``,
        ``Summary`` and ``Model_object``, and the contribution table returned
        by ``contributions`` for the training rows.  ``Test Mape`` is NaN when
        there are no rows beyond ``train_size``.
    """
    X = transformed_data[features_set]
    y = transformed_data[target]

    scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    X = sm.add_constant(X)

    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

    model = sm.OLS(y_train, X_train).fit()

    train_mape = mean_absolute_percentage_error(y_train, model.predict(X_train))
    if len(y_test):
        test_mape = mean_absolute_percentage_error(y_test, model.predict(X_test))
    else:
        # The metric raises on empty input; report "not available" instead of
        # failing the whole page when the data has no held-out rows.
        test_mape = float("nan")

    train_contributions = contributions(X_train, model, [target])

    metrics = pd.DataFrame(
        {
            "Model": target,
            "R2": np.round(model.rsquared, 2),
            "ADJr2": np.round(model.rsquared_adj, 2),
            "Train Mape": np.round(train_mape, 2),
            "Test Mape": np.round(test_mape, 2),
            "Summary": model.summary(),
            "Model_object": model,
        },
        index=[0],
    )
    return metrics, train_contributions
# Fit every configured model once per script run and collect two tables:
#   * metrics_table      - one row of fit metrics per model
#   * contribution table - one "Channel" column plus one column per model,
#                          stored in st.session_state["contribution_df"]
# The contribution table is rebuilt from scratch on each run rather than
# merged into the copy left over from the previous run.
metric_frames = []
contribution_df = pd.DataFrame()
for target, feature_set in feature_set_dct.items():
    # One fit per target; both results come from the same call.
    model_metrics, model_contributions = model_fit(
        features_set=feature_set, target=target
    )
    metric_frames.append(model_metrics)
    if contribution_df.empty:
        contribution_df = model_contributions
    else:
        # Inner join: only channels present in every model are kept.
        contribution_df = pd.merge(
            contribution_df, model_contributions, on="Channel"
        )

metrics_table = (
    pd.concat(metric_frames).reset_index(drop=True)
    if metric_frames
    else pd.DataFrame()
)
st.session_state["contribution_df"] = contribution_df
# The EDA button sits in the right-hand column; the report itself is
# generated further down, once a model has been selected.
eda_columns = st.columns(2)
with eda_columns[1]:
    eda = st.button(
        "Generate EDA Report",
        help="Click to generate a bivariate report for the selected response metric from the table below.",
    )

st.title("Contribution Overview")

contribution_table = st.session_state["contribution_df"]
# Every column except "Channel" holds one model's contributions.
model_columns = [
    col for col in contribution_table.columns if col.lower() != "channel"
]
contribution_selections = st.multiselect(
    "Select the models to compare contributions",
    model_columns,
    # Pre-select the last model; the slice yields an empty default instead of
    # raising IndexError when there are no model columns.
    default=model_columns[-1:],
)

# One grouped bar trace per selected model, labelled with whole percentages.
trace_data = [
    go.Bar(
        x=contribution_table["Channel"],
        y=contribution_table[selection],
        name=selection,
        text=np.round(contribution_table[selection], 0).astype(int).astype(str)
        + "%",
        textposition="outside",
    )
    for selection in contribution_selections
]
layout = go.Layout(
    title="Metrics Contribution by Channel",
    xaxis=dict(title="Channel Name"),
    yaxis=dict(title="Metrics Contribution"),
    barmode="group",
)
fig = go.Figure(data=trace_data, layout=layout)
st.plotly_chart(fig, use_container_width=True)
############################################ Waterfall Chart ############################################
# import plotly.graph_objects as go
# # Initialize a Plotly figure
# fig = go.Figure()
# for selection in contribution_selections:
# # Ensure y_values are numeric
# y_values = st.session_state["contribution_df"][selection].values.astype(float)
# # Generating text labels for each bar, ensuring operations are compatible with string formats
# text_values = [f"{val}%" for val in np.round(y_values, 0).astype(int)]
# fig.add_trace(
# go.Waterfall(
# name=selection,
# orientation="v",
# measure=["relative"]
# * len(y_values), # Adjust if you have absolute values at certain points
# x=st.session_state["contribution_df"]["Channel"].tolist(),
# text=text_values,
# textposition="outside",
# y=y_values,
# increasing={"marker": {"color": "green"}},
# decreasing={"marker": {"color": "red"}},
# totals={"marker": {"color": "blue"}},
# )
# )
# fig.update_layout(
# title="Metrics Contribution by Channel",
# xaxis={"title": "Channel Name"},
# yaxis={"title": "Metrics Contribution"},
# height=600,
# )
# # Displaying the waterfall chart in Streamlit
# st.plotly_chart(fig, use_container_width=True)
import plotly.graph_objects as go

# Waterfall view of the selected models' contributions: the "const" row is
# shown first as "Base Sales", followed by the remaining channels in table
# order.
fig = go.Figure()
channel_names = st.session_state["contribution_df"]["Channel"].tolist()
for selection in contribution_selections:
    # Named `contribution_values` so the module-level `contributions`
    # function is not overwritten by this loop.
    contribution_values = (
        st.session_state["contribution_df"][selection].values.astype(float).tolist()
    )

    display_name, display_contribution, base_contribution = [], [], 0
    for channel_name, contribution in zip(channel_names, contribution_values):
        if channel_name != "const":
            display_name.append(channel_name)
            display_contribution.append(contribution)
        else:
            base_contribution = contribution
    display_name = ["Base Sales"] + display_name
    display_contribution = [base_contribution] + display_contribution

    # Whole-percentage labels for each bar
    text_values = [
        f"{val}%" for val in np.round(display_contribution, 0).astype(int)
    ]

    fig.add_trace(
        go.Waterfall(
            # Label the trace with its model so the legend tells them apart.
            name=selection,
            orientation="v",
            # Every bar is a relative step; adjust if absolute/total bars
            # are needed at certain points.
            measure=["relative"] * len(display_contribution),
            x=display_name,
            text=text_values,
            textposition="outside",
            y=display_contribution,
            increasing={"marker": {"color": "green"}},
            decreasing={"marker": {"color": "red"}},
            totals={"marker": {"color": "blue"}},
        )
    )

fig.update_layout(
    title="Metrics Contribution by Channel",
    xaxis={"title": "Channel Name"},
    yaxis={"title": "Metrics Contribution"},
    height=600,
)

# Displaying the waterfall chart in Streamlit
st.plotly_chart(fig, use_container_width=True)
############################################ Waterfall Chart ############################################
st.title("Analysis of Models Result")

# Show every metrics column except the last two in a single-select grid.
gd_table = metrics_table.iloc[:, :-2]
gd = GridOptionsBuilder.from_dataframe(gd_table)
selection_settings = {
    "selection_mode": "single",
    "use_checkbox": True,
    "pre_select_all_rows": False,
    "pre_selected_rows": [1],
}
gd.configure_selection(**selection_settings)
gridoptions = gd.build()
table = AgGrid(
    gd_table,
    gridOptions=gridoptions,
    fit_columns_on_grid_load=True,
    height=200,
)

# A selected row names the model to analyse; without one the page asks for a
# selection and stops here.
if len(table.selected_rows) != 0:
    target_column = table.selected_rows[0]["Model"]
    feature_set = feature_set_dct[target_column]
else:
    st.warning(
        "Click on the checkbox to view comprehensive results of the selected model."
    )
    st.stop()
with eda_columns[1]:
    if eda:

        def generate_report_with_target(channel_data, target_feature, output_dir=None):
            """Write a Sweetviz report as ``report.html`` and return its path.

            Args:
                channel_data: DataFrame to analyse.
                target_feature: Column of ``channel_data`` used as the target.
                output_dir: Directory for the HTML file.  When omitted, a new
                    temporary directory is created and the caller is
                    responsible for removing it.
            """
            report = sv.analyze(
                [channel_data, "Dataset"], target_feat=target_feature, verbose=False
            )
            if output_dir is None:
                output_dir = tempfile.mkdtemp()
            report_path = os.path.join(output_dir, "report.html")
            # Generate the report as an HTML file
            report.show_html(filepath=report_path, open_browser=False)
            return report_path

        # Copy before adding the target so the assignment never lands on a
        # slice of `transformed_data`.
        report_data = transformed_data[feature_set].copy()
        report_data[target_column] = transformed_data[target_column]

        # Read the report into memory inside a self-cleaning temp directory
        # so no files are left behind after each click.
        report_bytes = None
        with tempfile.TemporaryDirectory() as temp_dir:
            report_file = generate_report_with_target(
                report_data, target_column, temp_dir
            )
            if os.path.exists(report_file):
                with open(report_file, "rb") as f:
                    report_bytes = f.read()

        if report_bytes is not None:
            st.download_button(
                label="Download EDA Report",
                data=report_bytes,
                file_name="report.html",
                mime="text/html",
            )
        else:
            st.warning("Report generation failed. Unable to find the report file.")
# Detailed results for the model selected in the grid.
model = metrics_table[metrics_table["Model"] == target_column]["Model_object"].iloc[0]
st.header("Model Summary")
st.write(model.summary())

# Rebuild the design matrix the same way model_fit does: min-max scale the
# features over all rows, then add the constant.
X = transformed_data[feature_set]
ss = MinMaxScaler()
X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
X = sm.add_constant(X)
# Copy so the date index below is not written onto transformed_data's column.
y = transformed_data[target_column].copy()

# Index by date BEFORE splitting: the train/test slices keep whatever index
# exists when they are taken, and the plots use that index as the "Date" axis.
X.index = transformed_data["date"]
y.index = transformed_data["date"]

X_train = X.iloc[:150]
X_test = X.iloc[150:]
y_train = y.iloc[:150]
y_test = y.iloc[150:]

train_predictions = model.predict(X_train)

metrics_table_train, fig_train = plot_actual_vs_predicted(
    X_train.index, y_train, train_predictions, model
)
metrics_table_test, fig_test = plot_actual_vs_predicted(
    X_test.index, y_test, model.predict(X_test), model
)

# One row per split, one column per metric.
metrics_table_train = metrics_table_train.set_index("Metric").transpose()
metrics_table_train.index = ["Train"]
metrics_table_test = metrics_table_test.set_index("Metric").transpose()
metrics_table_test.index = ["test"]
metrics_table = np.round(pd.concat([metrics_table_train, metrics_table_test]), 2)

st.markdown("Result Overview")
st.dataframe(metrics_table, use_container_width=True)

st.subheader("Actual vs Predicted Plot Train")
st.plotly_chart(fig_train, use_container_width=True)
st.subheader("Actual vs Predicted Plot Test")
st.plotly_chart(fig_test, use_container_width=True)

st.markdown("## Residual Analysis")
columns = st.columns(2)

# plot_residual_predicted adds columns to the frame it is given, so hand it
# a copy of the training matrix.
Xtrain1 = X_train.copy()
with columns[0]:
    fig = plot_residual_predicted(y_train, model.predict(Xtrain1), Xtrain1)
    st.plotly_chart(fig)

with columns[1]:
    st.empty()
    fig = qqplot(y_train, train_predictions)
    st.plotly_chart(fig)

with columns[0]:
    fig = residual_distribution(y_train, train_predictions)
    st.pyplot(fig)
# Failed login: report it and offer the authenticator's "Forgot password"
# widget.
elif auth_status == False:
st.error("Username/Password is incorrect")
try:
# The widget call returns the username, the e-mail and a newly generated
# password.
username_forgot_pw, email_forgot_password, random_password = (
authenticator.forgot_password("Forgot password")
)
if username_forgot_pw:
st.success("New password sent securely")
# Random password to be transferred to the user securely
# NOTE(review): nothing in this block delivers `random_password` or uses
# `email_forgot_password` - confirm the hand-off happens elsewhere.
elif username_forgot_pw == False:
st.error("Username not found")
# Any exception raised by the reset widget is shown to the user as an error.
except Exception as e:
st.error(e)