# Streamlit page "AI Model Results".
#
# Loads the tuned mixed-effects models saved for the current project, shows the
# contribution of every channel per response metric (grouped bar + waterfall),
# a metrics grid, and train/test diagnostics for the model picked in the grid.
#
# NOTE: this header is a comment rather than a module docstring on purpose, so
# that it can never be rendered by Streamlit's "magic" expression display.

import os
import pickle
import re
import sqlite3
import sys
import tempfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import statsmodels.api as sm
import streamlit as st
import sweetviz as sv
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode

from Data_prep_functions import plot_actual_vs_predicted
from utilities import load_authenticator, load_local_css, set_header, update_db

sys.setrecursionlimit(10**6)

# The original code pointed sys.stdout at this file, closed it immediately and
# restored stdout; the only observable effect is creating/truncating the file.
with open("temp_stdout.txt", "w"):
    pass

st.set_page_config(layout="wide")
load_local_css("styles.css")
set_header()

# TODO :
## 1. Add non panel model support
## 2. EDA Function

# Write every non-widget session value back to itself (presumably to keep the
# values alive when switching between pages -- confirm against the other pages).
for k, v in st.session_state.items():
    if k not in ["logout", "login", "config"] and not k.startswith("FormSubmitter"):
        st.session_state[k] = v

authenticator = st.session_state.get("authenticator")
if authenticator is None:
    authenticator = load_authenticator()

name, authentication_status, username = authenticator.login("Login", "main")
auth_status = st.session_state.get("authentication_status")

if auth_status is True:
    # NOTE(review): the source lost its indentation; the extent of this guard
    # (only the session_name default) is reconstructed -- please confirm.
    is_state_initiaized = st.session_state.get("initialized", False)
    if not is_state_initiaized:
        if "session_name" not in st.session_state:
            st.session_state["session_name"] = None

    if "project_dct" not in st.session_state:
        st.error("Please load a project from Home page")
        st.stop()

    # NOTE(review): this connection/cursor is not used anywhere in this page and
    # is never closed; kept because sqlite3.connect may create the DB file.
    conn = sqlite3.connect(r"DB/User.db", check_same_thread=False)  # connection with sql db
    c = conn.cursor()

    if not os.path.exists(
        os.path.join(st.session_state["project_path"], "tuned_model.pkl")
    ):
        st.error("Please save a tuned model")
        st.stop()

    model_tuning_state = st.session_state["project_dct"]["model_tuning"]
    if (
        "session_state_saved" in model_tuning_state
        and model_tuning_state["session_state_saved"] != []
    ):
        for key in ["used_response_metrics", "media_data", "bin_dict"]:
            if key not in st.session_state:
                st.session_state[key] = model_tuning_state["session_state_saved"][key]
        # NOTE(review): nesting reconstructed -- this override may originally
        # have lived inside the `if key not in st.session_state` branch above.
        st.session_state["bin_dict"] = st.session_state["project_dct"]["model_build"][
            "session_state_saved"
        ]["bin_dict"]

    media_data = st.session_state["media_data"]
    st.write(media_data.columns)

    # Normalised name of the first "Panel Level 1" column (raises IndexError
    # when no panel column is configured; non-panel models are still a TODO).
    panel_col = [
        col.lower()
        .replace(".", "_")
        .replace("@", "_")
        .replace(" ", "_")
        .replace("-", "")
        .replace(":", "")
        .replace("__", "_")
        for col in st.session_state["bin_dict"]["Panel Level 1"]
    ][0]  # set the panel column
    is_panel = len(panel_col) > 0
    date_col = "date"

    def plot_residual_predicted(actual, predicted, df_):
        """Return a Plotly scatter of standardized residuals against predictions.

        Adds the columns ``Residuals`` and ``StdResidual`` to ``df_`` in place,
        so callers should pass a copy if the frame must stay untouched.
        """
        df_["Residuals"] = actual - pd.Series(predicted)
        df_["StdResidual"] = (df_["Residuals"] - df_["Residuals"].mean()) / df_[
            "Residuals"
        ].std()

        fig = px.scatter(
            df_,
            x=predicted,
            y="StdResidual",
            opacity=0.5,
            color_discrete_sequence=["#11B6BD"],
        )

        # Reference lines at 0 and +/-2 standard deviations.
        fig.add_hline(y=0, line_dash="dash", line_color="darkorange")
        fig.add_hline(y=2, line_color="red")
        fig.add_hline(y=-2, line_color="red")

        fig.update_xaxes(title="Predicted")
        fig.update_yaxes(title="Standardized Residuals (Actual - Predicted)")

        # Same fixed size as the QQ plot so both line up side by side.
        fig.update_layout(
            title="Residuals over Predicted Values",
            autosize=False,
            width=600,
            height=400,
        )
        return fig

    def residual_distribution(actual, predicted):
        """Return a Matplotlib figure with a histogram + KDE of the residuals.

        An explicit Figure is created and returned (instead of the global
        ``pyplot`` module) so that the caller can render and close it; the
        global pyplot state is not safe to share between Streamlit sessions.
        """
        residuals = actual - pd.Series(predicted)

        sns.set(style="whitegrid")
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.histplot(residuals, kde=True, color="#11B6BD", ax=ax)
        ax.set_title(" Distribution of Residuals")
        ax.set_xlabel("Residuals")
        ax.set_ylabel("Probability Density")
        return fig

    def qqplot(actual, predicted):
        """Return a Plotly QQ plot of the standardized residuals."""
        residuals = pd.Series(actual - pd.Series(predicted))
        std_residuals = (residuals - residuals.mean()) / residuals.std()

        # Build the probability plot once and read both quantile sets from it.
        prob_plot = sm.ProbPlot(std_residuals)

        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=prob_plot.theoretical_quantiles,
                y=prob_plot.sample_quantiles,
                mode="markers",
                marker=dict(size=5, color="#11B6BD"),
                name="QQ Plot",
            )
        )

        # 45-degree reference line; the fixed [-2, 2] span may need adjusting
        # to the range of the data.
        fig.add_trace(
            go.Scatter(
                x=[-2, 2],
                y=[-2, 2],
                mode="lines",
                line=dict(color="red"),
                name=" ",
            )
        )

        fig.update_layout(
            title="QQ Plot of Residuals",
            title_x=0.5,
            autosize=False,
            width=600,
            height=400,
            xaxis_title="Theoretical Quantiles",
            yaxis_title="Sample Quantiles",
        )
        return fig

    def get_random_effects(media_data, panel_col, mdf):
        """Return one row per panel with the first random-effect value of ``mdf``.

        The result has the columns ``panel_col`` and ``random_effect`` and one
        row for every unique value of ``media_data[panel_col]``. It is built in
        a single step so the columns get inferred dtypes instead of ``object``.
        """
        markets = list(media_data[panel_col].unique())
        intercepts = [mdf.random_effects[market].values[0] for market in markets]
        return pd.DataFrame({panel_col: markets, "random_effect": intercepts})

    def mdf_predict(X_df, mdf, random_eff_df):
        """Return a copy of ``X_df`` with a ``pred`` column added.

        ``pred`` is ``mdf.predict`` plus the panel's ``random_effect`` looked up
        in ``random_eff_df`` via a left merge on the module-level ``panel_col``;
        panels missing from ``random_eff_df`` therefore end up with NaN.
        The merge resets the index of the returned frame.
        """
        X = pd.merge(
            X_df.copy(),
            random_eff_df[[panel_col, "random_effect"]],
            on=panel_col,
            how="left",
        )
        X["pred_fixed_effect"] = mdf.predict(X)
        X["pred"] = X["pred_fixed_effect"] + X["random_effect"]
        return X.drop(columns=["pred_fixed_effect", "random_effect"])

    def metrics_df_panel(model_dict):
        """Build the per-model metrics table shown in the selection grid.

        ``model_dict`` keys look like ``<prefix>__<target>``; every entry must
        provide ``X_train_tuned``, ``X_test_tuned``, ``feature_set`` and
        ``Model_object``. Returns one row per model with R2, adjusted R2,
        train/test MAPE (rounded to 2 decimals), the model summary and the
        model object itself.
        """
        columns = [
            "Model",
            "R2",
            "ADJR2",
            "Train Mape",
            "Test Mape",
            "Summary",
            "Model_object",
        ]
        numeric_columns = ["R2", "ADJR2", "Train Mape", "Test Mape"]

        rows = []
        for key, entry in model_dict.items():
            target = key.split("__")[1]
            fitted = entry["Model_object"]
            random_df = get_random_effects(media_data, panel_col, fitted)

            y = entry["X_train_tuned"][target]
            pred = mdf_predict(entry["X_train_tuned"], fitted, random_df)["pred"]
            ytest = entry["X_test_tuned"][target]
            predtest = mdf_predict(entry["X_test_tuned"], fitted, random_df)["pred"]

            r2 = r2_score(y, pred)
            n_obs = len(y)
            adj_r2 = 1 - (1 - r2) * (n_obs - 1) / (
                n_obs - len(entry["feature_set"]) - 1
            )

            rows.append(
                {
                    "Model": target,
                    "R2": r2,
                    "ADJR2": adj_r2,
                    "Train Mape": mean_absolute_percentage_error(y, pred),
                    "Test Mape": mean_absolute_percentage_error(ytest, predtest),
                    "Summary": fitted.summary(),
                    "Model_object": fitted,
                }
            )

        metrics_df = pd.DataFrame(rows, columns=columns)
        # Round only the numeric columns, with a real float dtype: rounding a
        # frame filled cell by cell leaves object columns, which round() skips.
        metrics_df[numeric_columns] = metrics_df[numeric_columns].astype(float).round(2)
        return metrics_df

    # NOTE: pickle executes arbitrary code on load; these files must only ever
    # come from the project's own, trusted, project_path.
    with open(
        os.path.join(st.session_state["project_path"], "final_df_transformed.pkl"),
        "rb",
    ) as f:
        data = pickle.load(f)
        transformed_data = data["final_df_transformed"]

    with open(
        os.path.join(st.session_state["project_path"], "data_import.pkl"), "rb"
    ) as f:
        data = pickle.load(f)
        st.session_state["bin_dict"] = data["bin_dict"]

    with open(
        os.path.join(st.session_state["project_path"], "tuned_model.pkl"), "rb"
    ) as file:
        tuned_model_dict = pickle.load(file)

    feature_set_dct = {
        key.split("__")[1]: key_dict["feature_set"]
        for key, key_dict in tuned_model_dict.items()
    }

    # The above part should be modified so that we are fetching the feature set
    # from the saved model.

    def contributions(X, model, target):
        """Return the percentage contribution of every column of ``X``.

        Each column is multiplied by the coefficient at the same position in
        ``model.params`` and expressed as a share (in %) of the grand total.
        ``target`` is passed as ``columns=`` to ``pd.DataFrame`` -- presumably a
        one-element list; verify against callers. Not called in this page.
        """
        X1 = X.copy()
        for j, col in enumerate(X1.columns):
            X1[col] = X1[col] * model.params.values[j]

        share = np.round(
            (X1.sum() / sum(X1.sum()) * 100).sort_values(ascending=False), 2
        )
        share = (
            pd.DataFrame(share, columns=target)
            .reset_index()
            .rename(columns={"index": "Channel"})
        )
        share["Channel"] = [re.split(r"_imp|_cli", col)[0] for col in share["Channel"]]
        return share

    if "contribution_df" not in st.session_state:
        st.session_state["contribution_df"] = None

    def contributions_panel(model_dict):
        """Return the percentage contribution of each channel for every model.

        For each entry of ``model_dict`` the training frame is multiplied by
        the fixed-effect coefficients (skipping the first one, the intercept),
        the panel effect (random + fixed intercept) and the trend/seasonality
        terms are folded into ``base``, and everything is normalised to 100.
        The result has a ``Channel`` column plus one column per target,
        combined with an outer merge (channels missing from a model are NaN).
        """
        media_data = st.session_state["media_data"]
        baseline_terms = [
            "Week_number_contr",
            "Trend_contr",
            "sine_wave_contr",
            "cosine_wave_contr",
        ]

        contribution_df = pd.DataFrame(columns=["Channel"])
        for key, entry in model_dict.items():
            fitted = entry["Model_object"]
            target = key.split("__")[1]

            random_eff_df = get_random_effects(media_data, panel_col, fitted)
            random_eff_df["fixed_effect"] = fitted.fe_params["Intercept"]
            random_eff_df["panel_effect"] = (
                random_eff_df["random_effect"] + random_eff_df["fixed_effect"]
            )

            coef_df = pd.DataFrame(fitted.fe_params)
            coef_df.reset_index(inplace=True)
            coef_df.columns = ["feature", "coef"]

            # mdf_predict already works on a copy of the training frame.
            x_train_contribution = mdf_predict(
                entry["X_train_tuned"], fitted, random_eff_df
            )
            x_train_contribution = pd.merge(
                x_train_contribution,
                random_eff_df[[panel_col, "panel_effect"]],
                on=panel_col,
                how="left",
            )

            # Row 0 is skipped: it is expected to hold the intercept.
            for i in range(1, len(coef_df)):
                coef = coef_df.loc[i, "coef"]
                col = coef_df.loc[i, "feature"]
                x_train_contribution[str(col) + "_contr"] = (
                    coef * x_train_contribution[col]
                )

            base_cols = ["panel_effect"] + [
                c
                for c in x_train_contribution.filter(regex="contr").columns
                if c in baseline_terms
            ]
            x_train_contribution["base_contr"] = x_train_contribution[base_cols].sum(
                axis=1
            )
            x_train_contribution.drop(columns=base_cols, inplace=True)

            contri_df = pd.DataFrame(
                x_train_contribution.filter(regex="contr").sum(axis=0)
            )
            contri_df.reset_index(inplace=True)
            contri_df.columns = ["Channel", target]
            # Keep only the channel prefix in front of "_impres" / "_clicks".
            contri_df["Channel"] = (
                contri_df["Channel"].str.split(r"_impres|_clicks").str[0]
            )
            contri_df[target] = 100 * contri_df[target] / contri_df[target].sum()
            # Assign back instead of a chained inplace replace, which does not
            # update the frame when pandas Copy-on-Write is active.
            contri_df["Channel"] = contri_df["Channel"].replace("base_contr", "base")

            contribution_df = pd.merge(
                contribution_df, contri_df, on="Channel", how="outer"
            )
        return contribution_df

    metrics_table = metrics_df_panel(tuned_model_dict)

    st.title("AI Model Results")

    st.header('Contribution Overview')

    options = st.session_state["used_response_metrics"]
    st.write(options)

    options = [
        opt.lower()
        .replace(" ", "_")
        .replace("-", "")
        .replace(":", "")
        .replace("__", "_")
        for opt in options
    ]

    saved_options = st.session_state["project_dct"]["saved_model_results"].get(
        "selected_options"
    )
    default_options = saved_options if saved_options is not None else [options[-1]]

    # Drop saved defaults that are no longer valid options. The invalid entries
    # are collected first: removing while iterating the same list skips items.
    stale_options = [opt for opt in default_options if opt not in options]
    for opt in stale_options:
        st.write(opt)
        default_options.remove(opt)

    def format_display(inp):
        """Turn an option key such as ``total_sales`` into ``Total Sales``."""
        return inp.title().replace("_", " ").strip()

    contribution_selections = st.multiselect(
        "Select the Response Metrics to compare contributions",
        options,
        default=default_options,
        format_func=format_display,
    )

    st.session_state["contribution_df"] = contributions_panel(tuned_model_dict)
    st.write(st.session_state["contribution_df"].columns)

    def _summarise_contributions(contribution_df, contribution_selections):
        """Shared preprocessing for the grouped bar and the waterfall chart.

        Returns ``(base_sales_dict, channel_order, channel_means)`` where
        ``base_sales_dict`` maps every selection to its "base" contribution
        (0 when there is no "base" row), ``channel_order`` lists all channels
        except "const"/"base" sorted by descending mean contribution across
        the selections, and ``channel_means`` holds those means in that order.
        """
        channel_names = contribution_df["Channel"].tolist()
        all_contributions = {
            name: [] for name in channel_names if name not in ["const", "base"]
        }
        base_sales_dict = {}

        for selection in contribution_selections:
            values = contribution_df[selection].values.astype(float)
            base_sales = 0
            for channel_name, value in zip(channel_names, values):
                if channel_name in all_contributions:
                    all_contributions[channel_name].append(value)
                elif channel_name == "base":
                    base_sales = value
            base_sales_dict[selection] = base_sales

        mean_by_channel = {
            channel: np.mean(values) for channel, values in all_contributions.items()
        }
        channel_order = sorted(mean_by_channel, key=lambda channel: -mean_by_channel[channel])
        channel_means = [mean_by_channel[channel] for channel in channel_order]
        return base_sales_dict, channel_order, channel_means

    def create_grouped_bar_plot(contribution_df, contribution_selections):
        """Return a grouped bar chart with one trace per selected metric.

        NOTE(review): every trace plots the *mean across all selections* for
        the channels (only "Base Sales" differs per selection). This mirrors
        the existing behaviour; confirm it is intended rather than each
        selection's own contribution.
        """
        base_sales_dict, channel_order, channel_means = _summarise_contributions(
            contribution_df, contribution_selections
        )
        display_names = ["Base Sales"] + channel_order  # 'Base Sales' goes first

        traces = []
        max_value = 0  # highest bar, used to size the y-axis
        for selection in contribution_selections:
            display_contribution = [base_sales_dict[selection]] + channel_means

            text_values = [
                f"{val}%" for val in np.round(display_contribution, 0).astype(int)
            ]

            max_contribution = max(display_contribution)
            if max_contribution > max_value:
                max_value = max_contribution

            traces.append(
                go.Bar(
                    x=display_names,
                    y=display_contribution,
                    name=selection,
                    text=text_values,
                    textposition="outside",
                )
            )

        layout = go.Layout(
            title="Metrics Contribution by Channel",
            xaxis=dict(title="Channel Name"),
            # y-axis reaches 20% above the highest bar so labels fit.
            yaxis=dict(title="Metrics Contribution", range=[0, max_value * 1.2]),
            barmode="group",
            plot_bgcolor="white",
        )
        return go.Figure(data=traces, layout=layout)

    st.plotly_chart(
        create_grouped_bar_plot(
            st.session_state["contribution_df"], contribution_selections
        ),
        use_container_width=True,
    )

    ############################################ Waterfall Chart ############################################

    def preprocess_and_plot(contribution_df, contribution_selections):
        """Return a waterfall chart with one trace per selected metric.

        Uses the same ordering and values as ``create_grouped_bar_plot``:
        "Base Sales" first, then the channels by descending mean contribution.
        NOTE(review): as in the bar chart, channel values are the mean across
        all selections -- confirm this is intended.
        """
        base_sales_dict, channel_order, channel_means = _summarise_contributions(
            contribution_df, contribution_selections
        )
        display_names = ["Base Sales"] + channel_order

        fig = go.Figure()
        for selection in contribution_selections:
            display_contribution = [base_sales_dict[selection]] + channel_means

            text_values = [
                f"{val}%" for val in np.round(display_contribution, 0).astype(int)
            ]

            fig.add_trace(
                go.Waterfall(
                    orientation="v",
                    measure=["relative"] * len(display_contribution),
                    x=display_names,
                    text=text_values,
                    textposition="outside",
                    y=display_contribution,
                    increasing={"marker": {"color": "green"}},
                    decreasing={"marker": {"color": "red"}},
                    totals={"marker": {"color": "blue"}},
                    name=selection,
                )
            )

        fig.update_layout(
            title="Metrics Contribution by Channel",
            xaxis={"title": "Channel Name"},
            yaxis=dict(title="Metrics Contribution", range=[0, 100 * 1.2]),
        )
        return fig

    st.plotly_chart(
        preprocess_and_plot(
            st.session_state["contribution_df"], contribution_selections
        ),
        use_container_width=True,
    )

    ############################################ Waterfall Chart ############################################

    st.header("Analysis of Models Result")

    previous_selection = st.session_state["project_dct"]["saved_model_results"].get(
        "model_grid_sel", [1]
    )
    # The selection is saved below as a bare row index, while the grid builder
    # expects a list of row indices: normalise on read.
    if isinstance(previous_selection, (int, np.integer)):
        previous_selection = [int(previous_selection)]

    # Hide the two object columns (Summary, Model_object) from the grid.
    gd_table = metrics_table.iloc[:, :-2]

    gd = GridOptionsBuilder.from_dataframe(gd_table)
    gd.configure_selection(
        use_checkbox=True,
        selection_mode="single",
        pre_select_all_rows=False,
        pre_selected_rows=previous_selection,
    )
    gridoptions = gd.build()

    table = AgGrid(
        gd_table,
        gridOptions=gridoptions,
        fit_columns_on_grid_load=True,
        height=200,
    )

    if len(table.selected_rows) == 0:
        st.warning(
            "Click on the checkbox to view comprehensive results of the selected model."
        )
        st.stop()

    # Remember the selected row so it can be pre-selected on the next visit.
    st.session_state["project_dct"]["saved_model_results"]["model_grid_sel"] = (
        table.selected_rows[0]["_selectedRowNodeInfo"]["nodeRowIndex"]
    )

    target_column = table.selected_rows[0]["Model"]
    feature_set = feature_set_dct[target_column]

    selected_metrics = metrics_table[metrics_table["Model"] == target_column]
    model = selected_metrics["Model_object"].iloc[0]
    target = selected_metrics["Model"].iloc[0]

    st.header("Model Summary")
    st.write(model.summary())

    sel_dict = tuned_model_dict[
        [k for k in tuned_model_dict.keys() if k.split("__")[1] == target][0]
    ]

    X_train = sel_dict["X_train_tuned"]
    y_train = X_train[target]
    random_effects = get_random_effects(media_data, panel_col, model)
    pred = mdf_predict(X_train, model, random_effects)["pred"]

    X_test = sel_dict["X_test_tuned"]
    y_test = X_test[target]
    predtest = mdf_predict(X_test, model, random_effects)["pred"]

    metrics_table_train, _, fig_train = plot_actual_vs_predicted(
        X_train[date_col],
        y_train,
        pred,
        model,
        target_column=target_column,
        flag=None,
        repeat_all_years=False,
        is_panel=is_panel,
    )

    metrics_table_test, _, fig_test = plot_actual_vs_predicted(
        X_test[date_col],
        y_test,
        predtest,
        model,
        target_column=target_column,
        flag=None,
        repeat_all_years=False,
        is_panel=is_panel,
    )

    metrics_table_train = metrics_table_train.set_index("Metric").transpose()
    metrics_table_train.index = ["Train"]
    metrics_table_test = metrics_table_test.set_index("Metric").transpose()
    metrics_table_test.index = ["Test"]
    train_test_metrics = np.round(
        pd.concat([metrics_table_train, metrics_table_test]), 2
    )

    st.markdown("Result Overview")
    st.dataframe(np.round(train_test_metrics, 2), use_container_width=True)

    st.subheader("Actual vs Predicted Plot Train")
    st.plotly_chart(fig_train, use_container_width=True)
    st.subheader("Actual vs Predicted Plot Test")
    st.plotly_chart(fig_test, use_container_width=True)

    st.markdown("## Residual Analysis")
    columns = st.columns(2)

    # NOTE(review): the residual plots use model.predict (no random effects
    # added), unlike the metrics above which use mdf_predict -- confirm.
    train_pred = model.predict(X_train)
    Xtrain1 = X_train.copy()  # plot_residual_predicted adds columns in place
    with columns[0]:
        fig = plot_residual_predicted(y_train, train_pred, Xtrain1)
        st.plotly_chart(fig)

    with columns[1]:
        st.empty()
        fig = qqplot(y_train, train_pred)
        st.plotly_chart(fig)

    with columns[0]:
        fig = residual_distribution(y_train, train_pred)
        st.pyplot(fig)
        # Release the figure so repeated reruns do not accumulate open figures.
        plt.close(fig)

    update_db("6_AI_Model_Result.py")

elif auth_status is False:
    st.error("Username/Password is incorrect")
    try:
        username_forgot_pw, email_forgot_password, random_password = (
            authenticator.forgot_password("Forgot password")
        )
        if username_forgot_pw:
            st.success("New password sent securely")
            # Random password to be transferred to the user securely
        elif username_forgot_pw is False:
            st.error("Username not found")
    except Exception as e:
        # Page-level boundary: surface any authenticator failure to the user.
        st.error(e)