Spaces:

BlendMMM
/

Mastercard

Sleeping

App Files Files Community

Mastercard / pages /4_Saved_Model_Results.py

BlendMMM

Upload 10 files

bd80083 verified over 1 year ago

raw

history blame

21 kB

	import plotly.express as px
	import numpy as np
	import plotly.graph_objects as go
	import streamlit as st
	import pandas as pd
	import statsmodels.api as sm
	from sklearn.metrics import mean_absolute_percentage_error
	import sys
	import os
	from utilities import set_header, load_local_css, load_authenticator
	import seaborn as sns
	import matplotlib.pyplot as plt
	import sweetviz as sv
	import tempfile
	from sklearn.preprocessing import MinMaxScaler
	from st_aggrid import AgGrid
	from st_aggrid import GridOptionsBuilder, GridUpdateMode
	from st_aggrid import GridOptionsBuilder
	import sys
	import re

	# Raise the interpreter's recursion limit far above the default.
	sys.setrecursionlimit(10**6)

	# Point stdout at a scratch file, close that file straight away, then put
	# the real stream back. Nothing is printed in between, so the only effect
	# visible here is that "temp_stdout.txt" is created (or truncated) in the
	# working directory.
	original_stdout = sys.stdout
	sys.stdout = open("temp_stdout.txt", "w")
	sys.stdout.close()
	sys.stdout = original_stdout

	# Wide page layout, then the project helpers imported from `utilities`.
	st.set_page_config(layout="wide")
	load_local_css("styles.css")
	set_header()

	# Re-assign every session-state entry to itself, skipping the auth keys and
	# any key that starts with "FormSubmitter".
	# NOTE(review): presumably this keeps values alive when switching pages;
	# confirm against the Streamlit version in use.
	for k, v in st.session_state.items():
	if k not in ["logout", "login", "config"] and not k.startswith("FormSubmitter"):
	st.session_state[k] = v

	# Reuse the authenticator kept in session state; build one if it is absent.
	authenticator = st.session_state.get("authenticator")
	if authenticator is None:
	authenticator = load_authenticator()

	# Render the login widget. The branch below keys off the status stored in
	# session state, not the `authentication_status` value returned here.
	name, authentication_status, username = authenticator.login("Login", "main")
	auth_status = st.session_state.get("authentication_status")

	# NOTE(review): indentation is not visible in this copy; presumably
	# everything down to the matching `elif` belongs to this branch.
	if auth_status == True:
	# Placeholder: the "initialized" flag is read, but the only action taken
	# when it is falsy is binding a dummy variable.
	is_state_initiaized = st.session_state.get("initialized", False)
	if not is_state_initiaized:
	a = 1

	def plot_residual_predicted(actual, predicted, df_):
	    """Scatter standardized residuals against the predicted values.

	    Two columns, "Residuals" and "StdResidual", are written into ``df_``
	    as a side effect, so pass a frame that may be modified.

	    Args:
	        actual: Observed response values.
	        predicted: Model predictions for the same observations.
	        df_: DataFrame that receives the residual columns and backs the plot.

	    Returns:
	        A 600x400 Plotly figure with a dashed line at 0 and red lines at +/-2.
	    """
	    df_["Residuals"] = actual - pd.Series(predicted)
	    stored = df_["Residuals"]
	    df_["StdResidual"] = (stored - stored.mean()) / stored.std()

	    scatter = px.scatter(
	        df_,
	        x=predicted,
	        y="StdResidual",
	        opacity=0.5,
	        color_discrete_sequence=["#11B6BD"],
	    )

	    # Reference lines: zero residual plus the +/-2 standard-deviation band.
	    reference_lines = (
	        (0, {"line_dash": "dash", "line_color": "darkorange"}),
	        (2, {"line_color": "red"}),
	        (-2, {"line_color": "red"}),
	    )
	    for level, line_style in reference_lines:
	        scatter.add_hline(y=level, **line_style)

	    scatter.update_xaxes(title="Predicted")
	    scatter.update_yaxes(title="Standardized Residuals (Actual - Predicted)")

	    # Fixed size so this figure lines up with the QQ plot beside it.
	    scatter.update_layout(
	        title="Residuals over Predicted Values",
	        autosize=False,
	        width=600,
	        height=400,
	    )
	    return scatter

	def residual_distribution(actual, predicted):
	    """Draw a histogram with a KDE overlay of the residuals.

	    Args:
	        actual: Observed response values.
	        predicted: Model predictions for the same observations.

	    Returns:
	        The ``matplotlib.pyplot`` module itself, whose current figure holds
	        the plot (the caller hands it to ``st.pyplot``).
	    """
	    errors = actual - pd.Series(predicted)

	    sns.set(style="whitegrid")
	    plt.figure(figsize=(6, 4))
	    sns.histplot(errors, kde=True, color="#11B6BD")

	    plt.title(" Distribution of Residuals")
	    plt.xlabel("Residuals")
	    plt.ylabel("Probability Density")
	    return plt

	def qqplot(actual, predicted):
	    """Build a QQ plot of the standardized residuals.

	    Args:
	        actual: Observed response values.
	        predicted: Model predictions for the same observations.

	    Returns:
	        A 600x400 Plotly figure with the quantile markers and a red
	        reference line drawn from (-2, -2) to (2, 2).
	    """
	    errors = pd.Series(actual - pd.Series(predicted))
	    standardized = (errors - errors.mean()) / errors.std()

	    quantile_points = go.Scatter(
	        x=sm.ProbPlot(standardized).theoretical_quantiles,
	        y=sm.ProbPlot(standardized).sample_quantiles,
	        mode="markers",
	        marker={"size": 5, "color": "#11B6BD"},
	        name="QQ Plot",
	    )
	    # 45-degree reference; its end points are fixed, not derived from the data.
	    reference_line = go.Scatter(
	        x=[-2, 2],
	        y=[-2, 2],
	        mode="lines",
	        line={"color": "red"},
	        name=" ",
	    )

	    figure = go.Figure()
	    for trace in (quantile_points, reference_line):
	        figure.add_trace(trace)

	    figure.update_layout(
	        title="QQ Plot of Residuals",
	        title_x=0.5,
	        autosize=False,
	        width=600,
	        height=400,
	        xaxis_title="Theoretical Quantiles",
	        yaxis_title="Sample Quantiles",
	    )
	    return figure

	def plot_actual_vs_predicted(date, y, predicted_values, model):
	    """Plot actual against predicted values and compute fit metrics.

	    Args:
	        date: Values for the x axis.
	        y: Observed response values.
	        predicted_values: Model predictions for the same observations.
	        model: Fitted model; its ``df_model`` attribute is used as the
	            number of predictors in the adjusted R-squared.

	    Returns:
	        ``(metrics_table, fig)``: a DataFrame with "Metric"/"Value" rows for
	        MAPE (in percent), R-squared and adjusted R-squared, and the figure.
	    """
	    # Fit metrics.
	    mape = mean_absolute_percentage_error(y, predicted_values) * 100
	    residual_ss = np.sum((y - predicted_values) ** 2)
	    total_ss = np.sum((y - np.mean(y)) ** 2)
	    r_squared = 1 - (residual_ss / total_ss)

	    n_predictors = model.df_model
	    n_samples = len(y)
	    adj_r_squared = 1 - (
	        (1 - r_squared) * ((n_samples - 1) / (n_samples - n_predictors - 1))
	    )

	    metrics_table = pd.DataFrame(
	        {
	            "Metric": ["MAPE", "R-squared", "AdjR-squared"],
	            "Value": [mape, r_squared, adj_r_squared],
	        }
	    )

	    # Two line traces sharing the same x axis.
	    fig = go.Figure()
	    series_specs = (
	        ("Actual", y, "blue"),
	        ("Predicted", predicted_values, "orange"),
	    )
	    for label, values, colour in series_specs:
	        fig.add_trace(
	            go.Scatter(
	                x=date,
	                y=values,
	                mode="lines",
	                name=label,
	                line={"color": colour},
	            )
	        )

	    fig.update_layout(
	        xaxis={"title": "Date"},
	        yaxis={"title": "Value"},
	        title=f"MAPE : {mape:.2f}%, AdjR2: {adj_r_squared:.2f}",
	        xaxis_tickangle=-30,
	    )
	    return metrics_table, fig

	def contributions(X, model):
	    """Return each column's percentage share of the summed weighted inputs.

	    Column ``j`` of ``X`` is multiplied by ``model.params.values[j]``; the
	    column totals are then expressed as a percentage of their grand total.

	    NOTE: a three-argument function with this same name is defined further
	    down this file and rebinds the name.

	    Args:
	        X: DataFrame of regressors; it is copied, not modified.
	        model: Object exposing ``params.values`` in the column order of ``X``.

	    Returns:
	        A Series of percentages rounded to 2 decimals, sorted descending.
	    """
	    scaled = X.copy()
	    for position, column in enumerate(scaled.columns):
	        weight = model.params.values[position]
	        scaled[column] = scaled[column] * weight

	    percentages = scaled.sum() / sum(scaled.sum()) * 100
	    ranked = percentages.sort_values(ascending=False)
	    return np.round(ranked, 2)

	# Modelling dataset, read from the working directory. The rest of this file
	# reads the feature and target columns named below from it, plus a "date"
	# column.
	transformed_data = pd.read_csv("transformed_data.csv")

	# Target column -> list of feature columns used to fit that target's model.
	# Hard-coded for now; these should come from the saved model instead.
	# NOTE(review): the "_lagN" / "_adst.N" suffixes presumably mark lag and
	# adstock transforms applied upstream; confirm with whoever produces the CSV.

	feature_set_dct = {
	"app_installs_-_appsflyer": [
	"paid_search_clicks",
	"fb:_level_achieved_-_tier_1_impressions_lag2",
	"fb:_level_achieved_-_tier_2_clicks_lag2",
	"paid_social_others_impressions_adst.1",
	"ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag2",
	"digital_tactic_others_clicks",
	"kwai_clicks_adst.3",
	"programmaticclicks",
	"indicacao_clicks_adst.1",
	"infleux_clicks_adst.4",
	"influencer_clicks",
	],
	"account_requests_-_appsflyer": [
	"paid_search_impressions",
	"fb:_level_achieved_-_tier_1_clicks_adst.1",
	"fb:_level_achieved_-_tier_2_clicks_adst.1",
	"paid_social_others_clicks_lag2",
	"ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag5_adst.1",
	"digital_tactic_others_clicks_adst.1",
	"kwai_clicks_adst.2",
	"programmaticimpressions_lag4_adst.1",
	"indicacao_clicks",
	"infleux_clicks_adst.2",
	"influencer_clicks",
	],
	"total_approved_accounts_-_appsflyer": [
	"paid_search_clicks",
	"fb:_level_achieved_-_tier_1_impressions_lag2_adst.1",
	"fb:_level_achieved_-_tier_2_impressions_lag2",
	"paid_social_others_clicks_lag2_adst.2",
	"ga_app:_will_and_cid_pequena_baixo_risco_impressions_lag4",
	"digital_tactic_others_clicks",
	"kwai_impressions_adst.2",
	"programmaticclicks_adst.5",
	"indicacao_clicks_adst.1",
	"infleux_clicks_adst.3",
	"influencer_clicks",
	],
	"total_approved_accounts_-_revenue": [
	"paid_search_impressions_adst.5",
	"kwai_impressions_lag2_adst.3",
	"indicacao_clicks_adst.3",
	"infleux_clicks_adst.3",
	"programmaticclicks_adst.4",
	"influencer_clicks_adst.3",
	"fb:_level_achieved_-_tier_1_impressions_adst.2",
	"fb:_level_achieved_-_tier_2_impressions_lag3_adst.5",
	"paid_social_others_impressions_adst.3",
	"ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag3_adst.5",
	"digital_tactic_others_clicks_adst.2",
	],
	}

	# TODO: fetch each feature set from the saved model instead of hard-coding
	# the mapping above.

	def contributions(X, model, target):
	    """Return per-channel percentage contributions as a tidy DataFrame.

	    Column ``j`` of ``X`` is multiplied by ``model.params.values[j]``; the
	    column totals are expressed as a percentage of their grand total,
	    rounded to 2 decimals and sorted descending. Each feature name is then
	    cut at the first "_imp" or "_cli", so "paid_search_clicks_lag2" becomes
	    the channel "paid_search".

	    Args:
	        X: DataFrame of regressors; it is copied, not modified.
	        model: Object exposing ``params.values`` in the column order of ``X``.
	        target: One-element list holding the label for the value column.

	    Returns:
	        DataFrame with a "Channel" column and one column named by ``target``.
	    """
	    weighted = X.copy()
	    for position, column in enumerate(weighted.columns):
	        weighted[column] = weighted[column] * model.params.values[position]

	    column_totals = weighted.sum()
	    shares = np.round(
	        (column_totals / sum(column_totals) * 100).sort_values(ascending=False), 2
	    )
	    result = (
	        pd.DataFrame(shares, columns=target)
	        .reset_index()
	        .rename(columns={"index": "Channel"})
	    )

	    # The pipe must be unescaped to act as alternation. The previous pattern
	    # r"_imp\|_cli" only matched the literal text "_imp|_cli", so no suffix
	    # was ever removed and channel labels differed between models.
	    # NOTE(review): names without an underscore before the metric, such as
	    # "programmaticclicks", are still left unchanged by this pattern.
	    result["Channel"] = [
	        re.split(r"_imp|_cli", feature)[0] for feature in result["Channel"]
	    ]
	    return result

	def model_fit(features_set, target):
	    """Fit an OLS model for ``target`` and summarise it.

	    The features are min-max scaled over all rows, a constant is added, and
	    the first 150 rows are used for training with the rest held out.

	    Args:
	        features_set: Column names in ``transformed_data`` used as regressors.
	        target: Column name in ``transformed_data`` used as the response.

	    Returns:
	        ``(metrics_row, train_contributions)``: a one-row DataFrame with
	        the model name, R2, adjusted R2, train/test MAPE, the summary and the
	        fitted model object, and the training-set contribution table
	        produced by ``contributions``.
	    """
	    raw_features = transformed_data[features_set]
	    response = transformed_data[target]

	    scaler = MinMaxScaler()
	    design = pd.DataFrame(
	        scaler.fit_transform(raw_features), columns=raw_features.columns
	    )
	    design = sm.add_constant(design)

	    design_train, design_test = design.iloc[:150], design.iloc[150:]
	    response_train, response_test = response.iloc[:150], response.iloc[150:]

	    fitted = sm.OLS(response_train, design_train).fit()

	    train_error = mean_absolute_percentage_error(
	        response_train, fitted.predict(design_train)
	    )
	    test_error = mean_absolute_percentage_error(
	        response_test, fitted.predict(design_test)
	    )

	    metrics_row = pd.DataFrame(
	        {
	            "Model": target,
	            "R2": np.round(fitted.rsquared, 2),
	            "ADJr2": np.round(fitted.rsquared_adj, 2),
	            "Train Mape": np.round(train_error, 2),
	            "Test Mape": np.round(test_error, 2),
	            "Summary": fitted.summary(),
	            "Model_object": fitted,
	        },
	        index=[0],
	    )
	    return metrics_row, contributions(design_train, fitted, [target])

	# Fit one model per target, collecting a metrics row for each and merging
	# the per-target contribution tables into session state.
	metrics_rows = []

	if "contribution_df" not in st.session_state:
	    st.session_state["contribution_df"] = pd.DataFrame()

	for target, feature_set in feature_set_dct.items():
	    # Fit once and reuse both outputs. Calling model_fit separately for the
	    # metrics row and for the contribution table refits the same OLS model.
	    metrics_row, target_contributions = model_fit(
	        features_set=feature_set, target=target
	    )
	    metrics_rows.append(metrics_row)

	    if st.session_state["contribution_df"].empty:
	        st.session_state["contribution_df"] = target_contributions
	    else:
	        # pd.merge with no keys joins on the columns the two frames share.
	        st.session_state["contribution_df"] = pd.merge(
	            st.session_state["contribution_df"],
	            target_contributions,
	        )

	# Concatenate once instead of growing the frame inside the loop.
	metrics_table = pd.concat(metrics_rows) if metrics_rows else pd.DataFrame()
	metrics_table.reset_index(drop=True, inplace=True)

	# EDA trigger sits in the right-hand column; it is acted on further down,
	# once a model has been selected.
	eda_columns = st.columns(2)
	with eda_columns[1]:
	    eda = st.button(
	        "Generate EDA Report",
	        help="Click to generate a bivariate report for the selected response metric from the table below.",
	    )

	st.title("Contribution Overview")

	# Every column except "Channel" is one model's contribution series.
	model_columns = [
	    col
	    for col in st.session_state["contribution_df"].columns
	    if col.lower() != "channel"
	]
	contribution_selections = st.multiselect(
	    "Select the models to compare contributions",
	    model_columns,
	    default=model_columns[-1],
	)

	# One bar trace per selected model, labelled with whole-number percentages.
	trace_data = [
	    go.Bar(
	        x=st.session_state["contribution_df"]["Channel"],
	        y=st.session_state["contribution_df"][selection],
	        name=selection,
	        text=np.round(st.session_state["contribution_df"][selection], 0)
	        .astype(int)
	        .astype(str)
	        + "%",
	        textposition="outside",
	    )
	    for selection in contribution_selections
	]

	layout = go.Layout(
	    title="Metrics Contribution by Channel",
	    xaxis={"title": "Channel Name"},
	    yaxis={"title": "Metrics Contribution"},
	    barmode="group",
	)
	fig = go.Figure(data=trace_data, layout=layout)
	st.plotly_chart(fig, use_container_width=True)

	############################################ Waterfall Chart ############################################
	# import plotly.graph_objects as go

	# # Initialize a Plotly figure
	# fig = go.Figure()

	# for selection in contribution_selections:
	# # Ensure y_values are numeric
	# y_values = st.session_state["contribution_df"][selection].values.astype(float)

	# # Generating text labels for each bar, ensuring operations are compatible with string formats
	# text_values = [f"{val}%" for val in np.round(y_values, 0).astype(int)]

	# fig.add_trace(
	# go.Waterfall(
	# name=selection,
	# orientation="v",
	# measure=["relative"]
	# * len(y_values), # Adjust if you have absolute values at certain points
	# x=st.session_state["contribution_df"]["Channel"].tolist(),
	# text=text_values,
	# textposition="outside",
	# y=y_values,
	# increasing={"marker": {"color": "green"}},
	# decreasing={"marker": {"color": "red"}},
	# totals={"marker": {"color": "blue"}},
	# )
	# )

	# fig.update_layout(
	# title="Metrics Contribution by Channel",
	# xaxis={"title": "Channel Name"},
	# yaxis={"title": "Metrics Contribution"},
	# height=600,
	# )

	# # Displaying the waterfall chart in Streamlit
	# st.plotly_chart(fig, use_container_width=True)

	import plotly.graph_objects as go

	# Waterfall view of the selected models' contributions, with the model
	# constant shown first as "Base Sales".
	fig = go.Figure()

	for selection in contribution_selections:
	    # Named `selected_contributions`, not `contributions`: at module level
	    # the latter would rebind the contributions() function to a list.
	    selected_contributions = (
	        st.session_state["contribution_df"][selection].values.astype(float).tolist()
	    )
	    channel_names = st.session_state["contribution_df"]["Channel"].tolist()

	    # Pull the "const" row out so it can lead the chart under its own label.
	    display_name, display_contribution, base_contribution = [], [], 0
	    for channel_name, contribution in zip(channel_names, selected_contributions):
	        if channel_name != "const":
	            display_name.append(channel_name)
	            display_contribution.append(contribution)
	        else:
	            base_contribution = contribution

	    display_name = ["Base Sales"] + display_name
	    display_contribution = [base_contribution] + display_contribution

	    # Whole-number percentage labels for the bars.
	    text_values = [
	        f"{val}%" for val in np.round(display_contribution, 0).astype(int)
	    ]

	    fig.add_trace(
	        go.Waterfall(
	            # Name the trace after its model, as the bar chart does, so
	            # several selections can be told apart in the legend.
	            name=selection,
	            orientation="v",
	            # Every bar is a relative step; there are no absolute totals.
	            measure=["relative"] * len(display_contribution),
	            x=display_name,
	            text=text_values,
	            textposition="outside",
	            y=display_contribution,
	            increasing={"marker": {"color": "green"}},
	            decreasing={"marker": {"color": "red"}},
	            totals={"marker": {"color": "blue"}},
	        )
	    )

	fig.update_layout(
	    title="Metrics Contribution by Channel",
	    xaxis={"title": "Channel Name"},
	    yaxis={"title": "Metrics Contribution"},
	    height=600,
	)

	# Displaying the waterfall chart in Streamlit
	st.plotly_chart(fig, use_container_width=True)

	############################################ Waterfall Chart ############################################

	st.title("Analysis of Models Result")

	# Show every metric column except the last two ("Summary", "Model_object").
	gd_table = metrics_table.iloc[:, :-2]

	# Single-row checkbox selection, with row position 1 pre-selected.
	selection_settings = {
	    "use_checkbox": True,
	    "selection_mode": "single",
	    "pre_select_all_rows": False,
	    "pre_selected_rows": [1],
	}
	gd = GridOptionsBuilder.from_dataframe(gd_table)
	gd.configure_selection(**selection_settings)
	gridoptions = gd.build()

	table = AgGrid(
	    gd_table,
	    gridOptions=gridoptions,
	    fit_columns_on_grid_load=True,
	    height=200,
	)

	# Nothing selected: prompt the user and halt this script run here.
	if len(table.selected_rows) == 0:
	    st.warning(
	        "Click on the checkbox to view comprehensive results of the selected model."
	    )
	    st.stop()

	target_column = table.selected_rows[0]["Model"]
	feature_set = feature_set_dct[target_column]

	with eda_columns[1]:
	    if eda:

	        def generate_report_with_target(channel_data, target_feature, output_dir=None):
	            """Write a Sweetviz report to ``report.html`` and return its path.

	            Args:
	                channel_data: DataFrame to analyse.
	                target_feature: Column of ``channel_data`` used as the target.
	                output_dir: Directory for the HTML file. When omitted, a new
	                    temporary directory is created and the caller is
	                    responsible for removing it.

	            Returns:
	                Path of the generated HTML file.
	            """
	            report = sv.analyze(
	                [channel_data, "Dataset"], target_feat=target_feature, verbose=False
	            )
	            if output_dir is None:
	                output_dir = tempfile.mkdtemp()
	            report_path = os.path.join(output_dir, "report.html")
	            # Generate the report as an HTML file without opening a browser.
	            report.show_html(filepath=report_path, open_browser=False)
	            return report_path

	        # Copy first so the target column is added to an independent frame,
	        # not to a slice of transformed_data.
	        report_data = transformed_data[feature_set].copy()
	        report_data[target_column] = transformed_data[target_column]

	        # Read the report into memory inside a self-cleaning directory so no
	        # temporary folder is left behind after each click.
	        report_bytes = None
	        with tempfile.TemporaryDirectory() as report_dir:
	            report_file = generate_report_with_target(
	                report_data, target_column, output_dir=report_dir
	            )
	            if os.path.exists(report_file):
	                with open(report_file, "rb") as f:
	                    report_bytes = f.read()

	        if report_bytes is not None:
	            st.download_button(
	                label="Download EDA Report",
	                data=report_bytes,
	                file_name="report.html",
	                mime="text/html",
	            )
	        else:
	            st.warning("Report generation failed. Unable to find the report file.")

	# Fitted model object for the row selected in the grid.
	model = metrics_table.loc[
	    metrics_table["Model"] == target_column, "Model_object"
	].iloc[0]
	st.header("Model Summary")
	st.write(model.summary())

	# Rebuild the design matrix the same way model_fit does: min-max scale
	# over all rows, add the constant, then split at row 150.
	X = transformed_data[feature_set]
	ss = MinMaxScaler()
	X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
	X = sm.add_constant(X)
	y = transformed_data[target_column]
	X_train = X.iloc[:150]
	X_test = X.iloc[150:]
	y_train = y.iloc[:150]
	y_test = y.iloc[150:]

	# Slice the dates alongside the data and pass them to the plots directly.
	# Assigning the date index to X and y after the split left the train/test
	# slices on their positional index, so the "Date" axis showed row numbers.
	dates = transformed_data["date"]
	dates_train = dates.iloc[:150]
	dates_test = dates.iloc[150:]

	# Computed once and reused by the metrics and every residual plot.
	train_predictions = model.predict(X_train)

	metrics_table_train, fig_train = plot_actual_vs_predicted(
	    dates_train, y_train, train_predictions, model
	)
	metrics_table_test, fig_test = plot_actual_vs_predicted(
	    dates_test, y_test, model.predict(X_test), model
	)

	# One row per split, one column per metric.
	metrics_table_train = metrics_table_train.set_index("Metric").transpose()
	metrics_table_train.index = ["Train"]
	metrics_table_test = metrics_table_test.set_index("Metric").transpose()
	metrics_table_test.index = ["test"]
	metrics_table = np.round(pd.concat([metrics_table_train, metrics_table_test]), 2)

	st.markdown("Result Overview")
	st.dataframe(metrics_table, use_container_width=True)

	st.subheader("Actual vs Predicted Plot Train")

	st.plotly_chart(fig_train, use_container_width=True)
	st.subheader("Actual vs Predicted Plot Test")
	st.plotly_chart(fig_test, use_container_width=True)

	st.markdown("## Residual Analysis")
	columns = st.columns(2)

	# plot_residual_predicted writes residual columns into the frame it is
	# given, so hand it a copy and keep X_train untouched.
	Xtrain1 = X_train.copy()
	with columns[0]:
	    fig = plot_residual_predicted(y_train, train_predictions, Xtrain1)
	    st.plotly_chart(fig)

	with columns[1]:
	    st.empty()
	    fig = qqplot(y_train, train_predictions)
	    st.plotly_chart(fig)

	with columns[0]:
	    fig = residual_distribution(y_train, train_predictions)
	    st.pyplot(fig)


	elif auth_status == False:
	# Taken when the stored authentication status compares equal to False;
	# a status of None matches neither this branch nor the `if` above.
	st.error("Username/Password is incorrect")
	try:
	# Render the "Forgot password" widget; it returns a 3-tuple.
	username_forgot_pw, email_forgot_password, random_password = (
	authenticator.forgot_password("Forgot password")
	)
	if username_forgot_pw:
	# NOTE(review): no code visible here delivers `random_password` to the
	# user, although the message below says it was sent; confirm.
	st.success("New password sent securely")
	# Random password to be transferred to the user securely
	elif username_forgot_pw == False:
	st.error("Username not found")
	# Any failure raised by the widget is shown on the page, not re-raised.
	except Exception as e:
	st.error(e)