Spaces:

BlendMMM
/

v6Mastercardapp

Sleeping

App Files Files Community

v6Mastercardapp / pages /6_AI_Model_Results.py

BlendMMM

Upload 73 files

3b48627 over 1 year ago

raw

history blame

25.6 kB

	import plotly.express as px
	import numpy as np
	import plotly.graph_objects as go
	import streamlit as st
	import pandas as pd
	import statsmodels.api as sm
	from sklearn.metrics import mean_absolute_percentage_error
	import sys
	import os
	from utilities import set_header, load_local_css, load_authenticator
	import seaborn as sns
	import matplotlib.pyplot as plt
	import sweetviz as sv
	import tempfile
	from sklearn.preprocessing import MinMaxScaler
	from st_aggrid import AgGrid
	from st_aggrid import GridOptionsBuilder, GridUpdateMode
	from st_aggrid import GridOptionsBuilder
	import sys
	import re
	import pickle
	from sklearn.metrics import r2_score, mean_absolute_percentage_error
	from Data_prep_functions import plot_actual_vs_predicted
	import sqlite3
	from utilities import update_db

	# Raise the interpreter's recursion limit to 10**6.
	# NOTE(review): the reason is not visible in this file — confirm it is still needed.
	sys.setrecursionlimit(10**6)

	# Point sys.stdout at "temp_stdout.txt", close that file straight away and
	# restore the original stream. Nothing is printed in between, so the only
	# visible effect is that the file is created/truncated.
	# NOTE(review): looks like a leftover from output-suppression code — confirm.
	original_stdout = sys.stdout
	sys.stdout = open("temp_stdout.txt", "w")
	sys.stdout.close()
	sys.stdout = original_stdout

	# Streamlit page configuration (wide layout), followed by the CSS and header
	# helpers imported from `utilities`.
	st.set_page_config(layout="wide")
	load_local_css("styles.css")
	set_header()

	# TODO :
	## 1. Add non panel model support
	## 2. EDA Function

	# Re-assign every session-state entry to itself, leaving out the
	# login/logout/config keys and anything whose key starts with "FormSubmitter".
	_SKIPPED_STATE_KEYS = ["logout", "login", "config"]
	for state_key, state_value in st.session_state.items():
	    if state_key in _SKIPPED_STATE_KEYS or state_key.startswith("FormSubmitter"):
	        continue
	    st.session_state[state_key] = state_value

	# Reuse the authenticator kept in session state; build one only when absent.
	authenticator = st.session_state.get("authenticator")
	authenticator = load_authenticator() if authenticator is None else authenticator

	# Render the login widget; the branch taken below is driven by the status
	# value read back from session state.
	name, authentication_status, username = authenticator.login("Login", "main")
	auth_status = st.session_state.get("authentication_status")

	# Authenticated sessions only: the statements below prepare the data the
	# results page needs.
	# NOTE(review): indentation was lost in this copy of the file, so the exact
	# nesting of the statements below cannot be read off the text — confirm
	# against the original source before restructuring.
	if auth_status == True:
	is_state_initiaized = st.session_state.get("initialized", False)
	if not is_state_initiaized:
	if "session_name" not in st.session_state:
	st.session_state["session_name"] = None

	# A project must already be present in session state; otherwise show an
	# error and halt this run of the script.
	if "project_dct" not in st.session_state:
	st.error("Please load a project from Home page")
	st.stop()

	# NOTE(review): `conn` and `c` are not referenced again in the visible code
	# and the connection is never closed — confirm whether it is still required.
	conn = sqlite3.connect(
	r"DB/User.db", check_same_thread=False
	) # connection with sql db
	c = conn.cursor()

	# The page needs "tuned_model.pkl" in the project folder; stop if it is missing.
	if not os.path.exists(
	os.path.join(st.session_state["project_path"], "tuned_model.pkl")
	):
	st.error("Please save a tuned model")
	st.stop()

	# When the model-tuning step stored a non-empty "session_state_saved"
	# snapshot, copy the listed keys into session state if they are missing.
	# NOTE(review): whether the "bin_dict" assignment from "model_build" runs
	# inside the loop/condition or once afterwards is not recoverable here.
	if (
	"session_state_saved"
	in st.session_state["project_dct"]["model_tuning"].keys()
	and st.session_state["project_dct"]["model_tuning"][
	"session_state_saved"
	]
	!= []
	):
	for key in ["used_response_metrics", "media_data", "bin_dict"]:
	if key not in st.session_state:
	st.session_state[key] = st.session_state["project_dct"][
	"model_tuning"
	]["session_state_saved"][key]
	st.session_state["bin_dict"] = st.session_state["project_dct"][
	"model_build"
	]["session_state_saved"]["bin_dict"]

	media_data = st.session_state["media_data"]
	# Normalise every "Panel Level 1" name (lower-case, separators replaced or
	# stripped) and keep the first one as the panel column. The trailing `[0]`
	# raises IndexError when that list is empty.
	panel_col = [
	col.lower()
	.replace(".", "_")
	.replace("@", "_")
	.replace(" ", "_")
	.replace("-", "")
	.replace(":", "")
	.replace("__", "_")
	for col in st.session_state["bin_dict"]["Panel Level 1"]
	][
	0
	] # set the panel column
	# `panel_col` is a single string at this point, so this tests for a
	# non-empty column name.
	is_panel = True if len(panel_col) > 0 else False
	date_col = "date"

	def plot_residual_predicted(actual, predicted, df_):
	    """Scatter standardized residuals against the predicted values.

	    Two columns are written into ``df_`` in place: "Residuals"
	    (``actual - predicted``) and "StdResidual" (residuals centred on their
	    mean and divided by their standard deviation).

	    Args:
	        actual: Observed values.
	        predicted: Model predictions; wrapped in ``pd.Series`` before the
	            subtraction and also used as the x values of the scatter.
	        df_: DataFrame that receives the two residual columns.

	    Returns:
	        A 600x400 Plotly figure with reference lines at 0 and +/-2.
	    """
	    df_["Residuals"] = actual - pd.Series(predicted)
	    centred = df_["Residuals"] - df_["Residuals"].mean()
	    df_["StdResidual"] = centred / df_["Residuals"].std()

	    fig = px.scatter(
	        df_,
	        x=predicted,
	        y="StdResidual",
	        opacity=0.5,
	        color_discrete_sequence=["#11B6BD"],
	    )

	    # Zero line (dashed) plus the +/-2 standard-deviation guides.
	    reference_lines = (
	        (0, {"line_dash": "dash", "line_color": "darkorange"}),
	        (2, {"line_color": "red"}),
	        (-2, {"line_color": "red"}),
	    )
	    for level, line_style in reference_lines:
	        fig.add_hline(y=level, **line_style)

	    fig.update_xaxes(title="Predicted")
	    fig.update_yaxes(title="Standardized Residuals (Actual - Predicted)")

	    # Fixed size so this figure matches the QQ plot shown next to it.
	    fig.update_layout(
	        title="Residuals over Predicted Values",
	        autosize=False,
	        width=600,
	        height=400,
	    )
	    return fig

	def residual_distribution(actual, predicted):
	    """Draw a histogram with a KDE overlay of the raw residuals.

	    Args:
	        actual: Observed values.
	        predicted: Model predictions; wrapped in ``pd.Series`` before the
	            subtraction.

	    Returns:
	        The ``matplotlib.pyplot`` module itself, with a new 6x4 figure as
	        the current figure.
	    """
	    residual_values = actual - pd.Series(predicted)

	    sns.set(style="whitegrid")
	    plt.figure(figsize=(6, 4))
	    sns.histplot(residual_values, kde=True, color="#11B6BD")

	    # Title and axis labels, applied in the same order as before.
	    for apply_label, label_text in (
	        (plt.title, " Distribution of Residuals"),
	        (plt.xlabel, "Residuals"),
	        (plt.ylabel, "Probability Density"),
	    ):
	        apply_label(label_text)

	    return plt

	def qqplot(actual, predicted):
	    """Build a QQ plot of the standardized residuals.

	    Args:
	        actual: Observed values.
	        predicted: Model predictions; wrapped in ``pd.Series`` before the
	            subtraction.

	    Returns:
	        A 600x400 Plotly figure holding the sample-vs-theoretical quantile
	        markers and a red reference line from (-2, -2) to (2, 2).
	    """
	    residuals = pd.Series(actual - pd.Series(predicted))
	    standardized = (residuals - residuals.mean()) / residuals.std()

	    # Build the probability plot once; the original constructed it twice,
	    # once per axis.
	    prob_plot = sm.ProbPlot(standardized)

	    fig = go.Figure()
	    fig.add_trace(
	        go.Scatter(
	            x=prob_plot.theoretical_quantiles,
	            y=prob_plot.sample_quantiles,
	            mode="markers",
	            marker=dict(size=5, color="#11B6BD"),
	            name="QQ Plot",
	        )
	    )

	    # 45-degree reference line; the fixed [-2, 2] span may need adjusting
	    # to the range of the data.
	    fig.add_trace(
	        go.Scatter(
	            x=[-2, 2],
	            y=[-2, 2],
	            mode="lines",
	            line=dict(color="red"),
	            name=" ",
	        )
	    )

	    fig.update_layout(
	        title="QQ Plot of Residuals",
	        title_x=0.5,
	        autosize=False,
	        width=600,
	        height=400,
	        xaxis_title="Theoretical Quantiles",
	        yaxis_title="Sample Quantiles",
	    )
	    return fig

	def get_random_effects(media_data, panel_col, mdf):
	    """Collect the first random-effect value of every panel into a frame.

	    Args:
	        media_data: DataFrame containing the ``panel_col`` column.
	        panel_col: Name of the panel column.
	        mdf: Fitted model exposing ``random_effects``, indexable by panel
	            value and yielding an object with a ``.values`` array.

	    Returns:
	        DataFrame with one row per unique panel value (in order of first
	        appearance) and the columns ``panel_col`` and "random_effect".
	        "random_effect" is a float column.
	    """
	    markets = list(media_data[panel_col].unique())
	    # The first entry of each panel's random-effects vector is used as its
	    # intercept shift.
	    effects = [float(mdf.random_effects[market].values[0]) for market in markets]

	    # Build the frame in one go: no per-row `.loc` growth (which left the
	    # columns as object dtype) and no progress printing to stdout.
	    return pd.DataFrame(
	        {
	            panel_col: markets,
	            "random_effect": pd.Series(effects, dtype="float64"),
	        }
	    )

	def mdf_predict(X_df, mdf, random_eff_df):
	    """Predict with ``mdf`` and add each row's panel random effect.

	    Args:
	        X_df: Feature frame containing the module-level ``panel_col`` column.
	        mdf: Fitted model exposing ``predict``.
	        random_eff_df: Frame with ``panel_col`` and "random_effect" columns.

	    Returns:
	        A new frame: ``X_df`` left-merged with the random effects, carrying
	        an extra "pred" column (model prediction plus random effect). The
	        merge gives the result a fresh default index.
	    """
	    effect_lookup = random_eff_df[[panel_col, "random_effect"]]
	    scored = X_df.copy().merge(effect_lookup, on=panel_col, how="left")

	    scored["pred_fixed_effect"] = mdf.predict(scored)
	    scored["pred"] = scored["pred_fixed_effect"] + scored["random_effect"]

	    # Only "pred" is kept; the two helper columns are removed again.
	    return scored.drop(columns=["pred_fixed_effect", "random_effect"])

	def metrics_df_panel(model_dict):
	    """Summarise fit quality for every tuned model.

	    Args:
	        model_dict: Mapping whose keys contain the target name after the
	            first "__" and whose values hold "Model_object",
	            "X_train_tuned", "X_test_tuned" and "feature_set".

	    Returns:
	        DataFrame with one row per model and the columns "Model", "R2",
	        "ADJR2", "Train Mape", "Test Mape", "Summary" and "Model_object".
	        The four metric columns are floats rounded to two decimals.
	    """
	    metric_columns = ["R2", "ADJR2", "Train Mape", "Test Mape"]
	    metrics_df = pd.DataFrame(
	        columns=["Model", *metric_columns, "Summary", "Model_object"]
	    )

	    for i, (key, entry) in enumerate(model_dict.items()):
	        target = key.split("__")[1]
	        model = entry["Model_object"]
	        train_df = entry["X_train_tuned"]
	        test_df = entry["X_test_tuned"]

	        y = train_df[target]
	        ytest = test_df[target]

	        # Random effects are extracted once and reused for both splits.
	        random_df = get_random_effects(media_data, panel_col, model)
	        pred = mdf_predict(train_df, model, random_df)["pred"]
	        predtest = mdf_predict(test_df, model, random_df)["pred"]

	        r2 = r2_score(y, pred)
	        # Adjusted R2 is undefined when there are no residual degrees of
	        # freedom; report NaN instead of dividing by zero.
	        dof = len(y) - len(entry["feature_set"]) - 1
	        adj_r2 = 1 - (1 - r2) * (len(y) - 1) / dof if dof > 0 else float("nan")

	        metrics_df.at[i, "Model"] = target
	        metrics_df.at[i, "R2"] = r2
	        metrics_df.at[i, "ADJR2"] = adj_r2
	        metrics_df.at[i, "Train Mape"] = mean_absolute_percentage_error(y, pred)
	        metrics_df.at[i, "Test Mape"] = mean_absolute_percentage_error(
	            ytest, predtest
	        )
	        metrics_df.at[i, "Summary"] = model.summary()
	        metrics_df.at[i, "Model_object"] = model

	    # Every column is object dtype after the cell-wise assignment above and
	    # DataFrame rounding skips non-numeric columns, so the metric columns
	    # are cast to float before rounding.
	    metrics_df[metric_columns] = metrics_df[metric_columns].astype(float).round(2)
	    return metrics_df

	# Load the artefacts written by earlier pages from the project folder.
	# NOTE: pickle runs arbitrary code while loading, so these files must come
	# from a trusted project directory.
	project_dir = st.session_state["project_path"]

	with open(os.path.join(project_dir, "final_df_transformed.pkl"), "rb") as f:
	    data = pickle.load(f)
	transformed_data = data["final_df_transformed"]

	# "bin_dict" in session state is overwritten with the imported-data version.
	with open(os.path.join(project_dir, "data_import.pkl"), "rb") as f:
	    data = pickle.load(f)
	st.session_state["bin_dict"] = data["bin_dict"]

	with open(os.path.join(project_dir, "tuned_model.pkl"), "rb") as file:
	    tuned_model_dict = pickle.load(file)

	# Map each target (the key part after the first "__") to its feature set.
	feature_set_dct = {}
	for key, key_dict in tuned_model_dict.items():
	    feature_set_dct[key.split("__")[1]] = key_dict["feature_set"]

	# NOTE: the block above should be changed so that the feature set is fetched from the saved model.

	def contributions(X, model, target):
	    """Compute each feature's percentage share of the total contribution.

	    Every column of ``X`` is multiplied by the coefficient at the same
	    position in ``model.params``, summed, and expressed as a percentage of
	    the grand total.

	    Args:
	        X: Feature DataFrame; column order must match ``model.params``.
	        model: Fitted model exposing ``params`` with a ``.values`` array.
	        target: Name for the value column. A one-element list is accepted
	            too, which is the only form the previous code handled.

	    Returns:
	        DataFrame with "Channel" and ``target`` columns, sorted by share in
	        descending order and rounded to two decimals. Channel names are cut
	        at the first "_imp" or "_cli".
	    """
	    coefficients = np.asarray(model.params.values)[: X.shape[1]]
	    weighted = X.mul(coefficients, axis="columns")

	    totals = weighted.sum()
	    shares = np.round(
	        (totals / totals.sum() * 100).sort_values(ascending=False), 2
	    )

	    value_column = target if isinstance(target, str) else list(target)[0]
	    result = shares.rename(value_column).rename_axis("Channel").reset_index()

	    # Plain alternation: the previous pattern escaped the pipe and could
	    # therefore only match a literal "_imp|_cli".
	    result["Channel"] = [
	        re.split(r"_imp|_cli", str(channel))[0] for channel in result["Channel"]
	    ]
	    return result

	# Make sure the "contribution_df" key exists in session state (None until
	# the contribution table is computed further down).
	if "contribution_df" not in st.session_state:
	st.session_state["contribution_df"] = None

	def contributions_panel(model_dict):
	    """Build the per-channel contribution table for every tuned model.

	    For each model, every fixed-effect coefficient after the first is
	    multiplied by its feature column of the training data. The panel effect
	    (random effect plus the "Intercept" coefficient) and the trend/seasonal
	    terms are folded into a single "base" entry, and the totals are
	    expressed as percentages.

	    Args:
	        model_dict: Mapping whose keys contain the target name after the
	            first "__" and whose values hold "Model_object" and
	            "X_train_tuned".

	    Returns:
	        DataFrame with a "Channel" column plus one percentage column per
	        target, outer-merged on "Channel". A channel that a model does not
	        use is NaN in that model's column.
	    """
	    media_data = st.session_state["media_data"]
	    seasonal_terms = {
	        "Week_number_contr",
	        "Trend_contr",
	        "sine_wave_contr",
	        "cosine_wave_contr",
	    }
	    contribution_df = pd.DataFrame(columns=["Channel"])

	    for key, entry in model_dict.items():
	        model = entry["Model_object"]
	        target = key.split("__")[1]

	        random_eff_df = get_random_effects(media_data, panel_col, model)
	        random_eff_df["fixed_effect"] = model.fe_params["Intercept"]
	        # Cast to float so the row-wise sum below never has to deal with an
	        # object-dtype column.
	        random_eff_df["panel_effect"] = (
	            random_eff_df["random_effect"].astype(float)
	            + random_eff_df["fixed_effect"]
	        )

	        x_train_contribution = mdf_predict(
	            entry["X_train_tuned"], model, random_eff_df
	        )
	        x_train_contribution = pd.merge(
	            x_train_contribution,
	            random_eff_df[[panel_col, "panel_effect"]],
	            on=panel_col,
	            how="left",
	        )

	        # The first fixed-effect entry is skipped, as before; it is already
	        # part of "panel_effect" when it is the intercept.
	        for feature, coef in list(model.fe_params.items())[1:]:
	            x_train_contribution[str(feature) + "_contr"] = (
	                coef * x_train_contribution[feature]
	            )

	        # NOTE(review): regex="contr" matches any column whose name contains
	        # "contr", not only the "_contr" columns created above — confirm no
	        # raw feature is named that way.
	        base_cols = ["panel_effect"] + [
	            c
	            for c in x_train_contribution.filter(regex="contr").columns
	            if c in seasonal_terms
	        ]
	        x_train_contribution["base_contr"] = x_train_contribution[
	            base_cols
	        ].sum(axis=1)
	        x_train_contribution.drop(columns=base_cols, inplace=True)

	        column_totals = x_train_contribution.filter(regex="contr").sum(axis=0)

	        # Reduce each column name to its channel by cutting at "_impres" or
	        # "_clicks". The previous pattern escaped the pipe and so never
	        # matched. Columns that collapse to the same channel are summed so
	        # the merge key stays unique.
	        channel_totals = {}
	        for column, total in column_totals.items():
	            channel = re.split(r"_impres|_clicks", str(column))[0]
	            if channel == "base_contr":
	                channel = "base"
	            channel_totals[channel] = channel_totals.get(channel, 0.0) + total

	        contri_df = pd.DataFrame(
	            {
	                "Channel": list(channel_totals),
	                target: list(channel_totals.values()),
	            }
	        )
	        contri_df[target] = 100 * contri_df[target] / contri_df[target].sum()

	        contribution_df = pd.merge(
	            contribution_df, contri_df, on="Channel", how="outer"
	        )

	    return contribution_df

	# Fit metrics for every tuned model; used by the model grid further down.
	metrics_table = metrics_df_panel(tuned_model_dict)

	eda_columns = st.columns(2)
	with eda_columns[1]:
	    eda = st.button(
	        "Generate EDA Report",
	        help="Click to generate a bivariate report for the selected response metric from the table below.",
	    )

	st.title("Contribution Overview")

	# Response metrics, normalised the same way on every run of the page.
	options = [
	    opt.lower()
	    .replace(" ", "_")
	    .replace("-", "")
	    .replace(":", "")
	    .replace("__", "_")
	    for opt in st.session_state["used_response_metrics"]
	]

	# Start from the saved selection when there is one, else the last option.
	default_options = st.session_state["project_dct"]["saved_model_results"].get(
	    "selected_options"
	)
	if default_options is None:
	    default_options = [options[-1]]
	else:
	    # Drop saved entries that are no longer valid. The original removed
	    # items from the list while iterating over it, which skips the element
	    # after every removal; filter into a new list and write it back instead.
	    still_valid = []
	    for saved_option in default_options:
	        if saved_option in options:
	            still_valid.append(saved_option)
	        else:
	            st.write(saved_option)
	    default_options[:] = still_valid

	contribution_selections = st.multiselect(
	    "Select the Response Metrics to compare contributions",
	    options,
	    default=default_options,
	)

	st.session_state["contribution_df"] = contributions_panel(tuned_model_dict)
	contribution_table = st.session_state["contribution_df"]

	trace_data = []
	for selection in contribution_selections:
	    share = contribution_table[selection]
	    # A channel missing from this model is NaN after the outer merge;
	    # casting NaN to int raises, so such bars get an empty label.
	    labels = [
	        f"{int(np.round(value, 0))}%" if pd.notna(value) else ""
	        for value in share
	    ]
	    trace_data.append(
	        go.Bar(
	            x=contribution_table["Channel"],
	            y=share,
	            name=selection,
	            text=labels,
	            textposition="outside",
	        )
	    )

	layout = go.Layout(
	    title="Metrics Contribution by Channel",
	    xaxis=dict(title="Channel Name"),
	    yaxis=dict(title="Metrics Contribution"),
	    barmode="group",
	)
	fig = go.Figure(data=trace_data, layout=layout)
	st.plotly_chart(fig, use_container_width=True)

	############################################ Waterfall Chart ############################################
	# import plotly.graph_objects as go

	# # Initialize a Plotly figure
	# fig = go.Figure()

	# for selection in contribution_selections:
	# # Ensure y_values are numeric
	# y_values = st.session_state["contribution_df"][selection].values.astype(float)

	# # Generating text labels for each bar, ensuring operations are compatible with string formats
	# text_values = [f"{val}%" for val in np.round(y_values, 0).astype(int)]

	# fig.add_trace(
	# go.Waterfall(
	# name=selection,
	# orientation="v",
	# measure=["relative"]
	# * len(y_values), # Adjust if you have absolute values at certain points
	# x=st.session_state["contribution_df"]["Channel"].tolist(),
	# text=text_values,
	# textposition="outside",
	# y=y_values,
	# increasing={"marker": {"color": "green"}},
	# decreasing={"marker": {"color": "red"}},
	# totals={"marker": {"color": "blue"}},
	# )
	# )

	# fig.update_layout(
	# title="Metrics Contribution by Channel",
	# xaxis={"title": "Channel Name"},
	# yaxis={"title": "Metrics Contribution"},
	# height=600,
	# )

	# # Displaying the waterfall chart in Streamlit
	# st.plotly_chart(fig, use_container_width=True)

	# Waterfall view of the contribution table: one trace per selected response
	# metric, with the base contribution first and the channels after it.
	# (`go` is already imported at the top of the file.)
	fig = go.Figure()
	channel_names = st.session_state["contribution_df"]["Channel"].tolist()

	for selection in contribution_selections:
	    # Named `channel_values` so the module-level `contributions` function
	    # is not rebound to a list.
	    channel_values = (
	        st.session_state["contribution_df"][selection]
	        .values.astype(float)
	        .tolist()
	    )

	    base_contribution = 0
	    display_name, display_contribution = [], []
	    for channel_name, contribution in zip(channel_names, channel_values):
	        # NaN marks a channel this model does not use; skip it rather than
	        # casting NaN to int for the label below.
	        if np.isnan(contribution):
	            continue
	        if channel_name in ("const", "base"):
	            base_contribution = contribution
	        else:
	            display_name.append(channel_name)
	            display_contribution.append(contribution)

	    display_name = ["Base Sales"] + display_name
	    display_contribution = [base_contribution] + display_contribution

	    # Whole-percent labels for the bars.
	    text_values = [
	        f"{val}%" for val in np.round(display_contribution, 0).astype(int)
	    ]

	    fig.add_trace(
	        go.Waterfall(
	            name=selection,
	            orientation="v",
	            # Every bar is a relative step; adjust if absolute values are
	            # needed at certain points.
	            measure=["relative"] * len(display_contribution),
	            x=display_name,
	            text=text_values,
	            textposition="outside",
	            y=display_contribution,
	            increasing={"marker": {"color": "green"}},
	            decreasing={"marker": {"color": "red"}},
	            totals={"marker": {"color": "blue"}},
	        )
	    )

	fig.update_layout(
	    title="Metrics Contribution by Channel",
	    xaxis={"title": "Channel Name"},
	    yaxis={"title": "Metrics Contribution"},
	    height=600,
	)

	# Displaying the waterfall chart in Streamlit
	st.plotly_chart(fig, use_container_width=True)

	############################################ Waterfall Chart ############################################

	st.title("Analysis of Models Result")

	saved_results = st.session_state["project_dct"]["saved_model_results"]
	# NOTE(review): the default here is a list, while the value stored below is
	# a single row index — confirm which form `pre_selected_rows` expects.
	previous_selection = saved_results.get("model_grid_sel", [1])

	st.write(np.round(metrics_table, 2))
	# The last two columns ("Summary", "Model_object") are kept out of the grid.
	gd_table = metrics_table.iloc[:, :-2]

	# Single-row checkbox selection, pre-selecting the previously saved row.
	gd = GridOptionsBuilder.from_dataframe(gd_table)
	gd.configure_selection(
	    use_checkbox=True,
	    selection_mode="single",
	    pre_select_all_rows=False,
	    pre_selected_rows=previous_selection,
	)
	gridoptions = gd.build()

	table = AgGrid(
	    gd_table,
	    gridOptions=gridoptions,
	    fit_columns_on_grid_load=True,
	    height=200,
	)

	selected_rows = table.selected_rows
	if len(selected_rows) == 0:
	    # Nothing picked yet: prompt the user and halt this run of the script.
	    st.warning(
	        "Click on the checkbox to view comprehensive results of the selected model."
	    )
	    st.stop()

	# Remember the chosen row, then resolve the target and its feature set.
	saved_results["model_grid_sel"] = selected_rows[0]["_selectedRowNodeInfo"][
	    "nodeRowIndex"
	]
	target_column = selected_rows[0]["Model"]
	feature_set = feature_set_dct[target_column]
	# with eda_columns[1]:
	# if eda:
	# def generate_report_with_target(channel_data, target_feature):
	# report = sv.analyze(
	# [channel_data, "Dataset"], target_feat=target_feature, verbose=False
	# )
	# temp_dir = tempfile.mkdtemp()
	# report_path = os.path.join(temp_dir, "report.html")
	# report.show_html(
	# filepath=report_path, open_browser=False
	# ) # Generate the report as an HTML file
	# return report_path
	#
	# report_data = transformed_data[feature_set]
	# report_data[target_column] = transformed_data[target_column]
	# report_file = generate_report_with_target(report_data, target_column)
	#
	# if os.path.exists(report_file):
	# with open(report_file, "rb") as f:
	# st.download_button(
	# label="Download EDA Report",
	# data=f.read(),
	# file_name="report.html",
	# mime="text/html",
	# )
	# else:
	# st.warning("Report generation failed. Unable to find the report file.")

	# Pull the fitted model (and its target name) for the row picked in the grid.
	selected_row = metrics_table[metrics_table["Model"] == target_column]
	model = selected_row["Model_object"].iloc[0]
	target = selected_row["Model"].iloc[0]

	st.header("Model Summary")
	st.write(model.summary())

	# Entry of the tuned-model dictionary whose key carries this target.
	sel_dict = next(
	    entry
	    for key, entry in tuned_model_dict.items()
	    if key.split("__")[1] == target
	)

	# Train/test predictions including each panel's random effect.
	random_effects = get_random_effects(media_data, panel_col, model)

	X_train = sel_dict["X_train_tuned"]
	y_train = X_train[target]
	pred = mdf_predict(X_train, model, random_effects)["pred"]

	X_test = sel_dict["X_test_tuned"]
	y_test = X_test[target]
	predtest = mdf_predict(X_test, model, random_effects)["pred"]

	metrics_table_train, _, fig_train = plot_actual_vs_predicted(
	    X_train[date_col],
	    y_train,
	    pred,
	    model,
	    target_column=target_column,
	    flag=None,
	    repeat_all_years=False,
	    is_panel=is_panel,
	)
	metrics_table_test, _, fig_test = plot_actual_vs_predicted(
	    X_test[date_col],
	    y_test,
	    predtest,
	    model,
	    target_column=target_column,
	    flag=None,
	    repeat_all_years=False,
	    is_panel=is_panel,
	)

	# One row per split, one column per metric.
	metrics_table_train = metrics_table_train.set_index("Metric").transpose()
	metrics_table_train.index = ["Train"]
	metrics_table_test = metrics_table_test.set_index("Metric").transpose()
	metrics_table_test.index = ["test"]
	metrics_table = np.round(
	    pd.concat([metrics_table_train, metrics_table_test]), 2
	)

	st.markdown("Result Overview")
	st.dataframe(np.round(metrics_table, 2), use_container_width=True)

	st.subheader("Actual vs Predicted Plot Train")
	st.plotly_chart(fig_train, use_container_width=True)
	st.subheader("Actual vs Predicted Plot Test")
	st.plotly_chart(fig_test, use_container_width=True)

	st.markdown("## Residual Analysis")
	columns = st.columns(2)

	# The three residual plots share one prediction; it used to be recomputed
	# for each plot.
	# NOTE(review): these residuals come from `model.predict` alone, while the
	# metrics above use predictions that add the random effects — confirm this
	# difference is intended.
	residual_pred = model.predict(X_train)

	with columns[0]:
	    # The plot helper adds columns to the frame it is given, so pass a copy.
	    fig = plot_residual_predicted(y_train, residual_pred, X_train.copy())
	    st.plotly_chart(fig)

	with columns[1]:
	    st.empty()
	    fig = qqplot(y_train, residual_pred)
	    st.plotly_chart(fig)

	with columns[0]:
	    fig = residual_distribution(y_train, residual_pred)
	    st.pyplot(fig)

	# NOTE(review): the page header names this file "6_AI_Model_Results.py";
	# confirm the key passed here is the intended one.
	update_db("6_AI_Model_Result.py")


	# Failed login (status is exactly False; a None status shows nothing):
	# report the error and offer the authenticator's "Forgot password" form.
	elif auth_status == False:
	st.error("Username/Password is incorrect")
	try:
	username_forgot_pw, email_forgot_password, random_password = (
	authenticator.forgot_password("Forgot password")
	)
	if username_forgot_pw:
	# NOTE(review): no code visible here delivers `random_password` to the
	# user, although the message below says it was sent — confirm.
	st.success("New password sent securely")
	# Random password to be transferred to the user securely
	elif username_forgot_pw == False:
	st.error("Username not found")
	# Any failure raised by the form is shown to the user instead of crashing
	# the page.
	except Exception as e:
	st.error(e)