# Spaces: BlendMMM / Mastercard (Sleeping)
# File size: 36,740 Bytes
# bd80083

'''
MMO Build Sprint 3
additions : adding more variables to session state for saved model : random effect, predicted train & test

MMO Build Sprint 4
additions : ability to run models for different response metrics
'''

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from Eda_functions import format_numbers
import numpy as np
import pickle
from st_aggrid import AgGrid
from st_aggrid import GridOptionsBuilder, GridUpdateMode
from utilities import set_header, load_local_css
from st_aggrid import GridOptionsBuilder
import time
import itertools
import statsmodels.api as sm
import numpy as npc
import re
import itertools
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
import os
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Turn off Streamlit's 'deprecation.showPyplotGlobalUse' notice.
# NOTE(review): whether this option still exists depends on the installed
# Streamlit version — confirm against the pinned release.
st.set_option('deprecation.showPyplotGlobalUse', False)
import statsmodels.api as sm
import statsmodels.formula.api as smf

from datetime import datetime
import seaborn as sns
from Data_prep_functions import *



def get_random_effects(media_data, panel_col, mdf):
    """Collect the first random-effect value of every panel into a data frame.

    Args:
        media_data: Data frame containing the ``panel_col`` column.
        panel_col: Name of the column that identifies the panel / market.
        mdf: Fitted model exposing ``random_effects``, a mapping from panel
            value to an object whose ``.values[0]`` is read as that panel's
            random effect.

    Returns:
        A data frame with one row per distinct panel value (in order of first
        appearance) and the columns ``[panel_col, "random_effect"]``.  The
        ``random_effect`` column is float, so it can be added to the fixed
        effect without producing an object-dtype result.

    Raises:
        KeyError: If a panel present in ``media_data`` has no entry in
            ``mdf.random_effects``.
    """
    # Build all rows first and create the frame once; growing a frame cell by
    # cell with ``.loc`` is slow and leaves every column as ``object`` dtype.
    records = [
        (market, mdf.random_effects[market].values[0])
        for market in media_data[panel_col].unique()
    ]
    random_eff_df = pd.DataFrame(records, columns=[panel_col, "random_effect"])
    random_eff_df["random_effect"] = random_eff_df["random_effect"].astype(float)
    return random_eff_df


def mdf_predict(X_df, mdf, random_eff_df, group_col=None):
    """Predict with a fitted mixed model: fixed effect plus per-panel random effect.

    Args:
        X_df: Data frame passed to ``mdf.predict``; it must also contain the
            panel column.  It is not modified.
        mdf: Fitted model exposing ``predict``.
        random_eff_df: Data frame with the panel column and a
            ``random_effect`` column (the shape ``get_random_effects`` returns).
        group_col: Name of the panel column to join on.  When omitted it is
            taken to be the single column of ``random_eff_df`` that is not
            ``random_effect``, so the function no longer depends on a
            module-level ``panel_col`` being defined before it is called.

    Returns:
        A Series named ``pred`` with a fresh 0..n-1 index.  Rows whose panel
        has no entry in ``random_eff_df`` get NaN (left join).

    Raises:
        ValueError: If ``group_col`` is omitted and cannot be inferred
            unambiguously from ``random_eff_df``.
    """
    if group_col is None:
        key_candidates = [c for c in random_eff_df.columns if c != 'random_effect']
        if len(key_candidates) != 1:
            raise ValueError(
                "Cannot infer the panel column from random_eff_df; pass group_col explicitly.")
        group_col = key_candidates[0]

    X = X_df.copy()
    X['fixed_effect'] = mdf.predict(X)
    # Join only the two columns that are needed so unrelated columns of
    # random_eff_df cannot collide with columns of X.
    X = pd.merge(X, random_eff_df[[group_col, 'random_effect']], on=group_col, how='left')
    X['pred'] = X['fixed_effect'] + X['random_effect']
    return X['pred']


# Page-level Streamlit configuration: wide layout with the sidebar collapsed.
st.set_page_config(
    page_title="Model Build",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state='collapsed'
)

# Project helpers imported from `utilities`: load the stylesheet 'styles.css'
# and set up the page header.
load_local_css('styles.css')
set_header()

st.title('1. Build Your Model')

# Load the pickled inputs read by this page.
# NOTE: pickle.load can execute arbitrary code; these files must only ever
# come from a trusted source.
with open("data_import.pkl", "rb") as import_file:
    data = pickle.load(import_file)
st.session_state['bin_dict'] = data["bin_dict"]

with open("final_df_transformed.pkl", "rb") as transformed_file:
    data = pickle.load(transformed_file)

# Accessing the loaded objects
media_data = data["final_df_transformed"]
st.session_state['media_data'] = media_data

# Sprint4 - available response metrics is a list of all response metrics in the data
## these will be put in a drop down

def _normalize_name(name):
    """Return ``name`` as a lower-case, underscore-separated identifier.

    This is the replacement chain applied to the data frame's column labels,
    so a normalised display name can be used directly to look up its column.
    """
    return (name.lower()
            .replace('.', '_')
            .replace('@', '_')
            .replace(" ", "_")
            .replace('-', '')
            .replace(':', '')
            .replace("__", "_"))


if 'available_response_metrics' not in st.session_state:
    st.session_state['available_response_metrics'] = st.session_state['bin_dict']["Response Metrics"]

# Sprint4
if "is_tuned_model" not in st.session_state:
    st.session_state["is_tuned_model"] = {}
# NOTE(review): this runs on every rerun of the page, so every metric is put
# back to "not tuned" each time — confirm that this is intended.
for resp_metric in st.session_state['available_response_metrics']:
    st.session_state["is_tuned_model"][_normalize_name(resp_metric)] = False

# Sprint4 - used_response_metrics is a list of resp metrics for which user has created & saved a model
if 'used_response_metrics' not in st.session_state:
    st.session_state['used_response_metrics'] = []

# Sprint4 - saved_model_names
if 'saved_model_names' not in st.session_state:
    st.session_state['saved_model_names'] = []

# Sprint4 - select a response metric
sel_target_col = st.selectbox("Select the response metric",
                              st.session_state['available_response_metrics'])
# The target name goes through the same normalisation as the column labels
# below; previously '.' and '@' were handled only for the columns, so a metric
# name containing either character could not be found in media_data.
target_col = _normalize_name(sel_target_col)

new_name_dct = {col: _normalize_name(col) for col in media_data.columns}

media_data.columns = [_normalize_name(col) for col in media_data.columns]

# Panel column: first entry of 'Panel Level 1', or None when the list is
# empty (indexing [0] unconditionally raised IndexError for non-panel data,
# which made the non-panel code paths unreachable).
panel_candidates = [_normalize_name(col) for col in st.session_state['bin_dict']['Panel Level 1']]
panel_col = panel_candidates[0] if panel_candidates else None
date_col = 'date'

is_panel = bool(panel_col)

# NOTE(review): this session flag is only initialised here and is never set
# to the computed is_panel value in the visible code — confirm.
if 'is_panel' not in st.session_state:
    st.session_state['is_panel'] = False



# if st.toggle('Apply Transformations on DMA/Panel Level'):
#     media_data = pd.read_csv(r'C:\Users\SrishtiVerma\Mastercard\Sprint2\upf_data_converted_randomized_resp_metrics.csv')
#     media_data.columns = [i.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for i in
#                           media_data.columns]
#     dma = st.selectbox('Select the Level of data ',
#                        [col for col in media_data.columns if col.lower() in ['dma', 'panel', 'markets']])
#     # is_panel = True
#     # st.session_state['is_panel']=True
#
# else:
#     # """ code to aggregate data on date """
#     media_data = pd.read_excel(r'C:\Users\SrishtiVerma\Mastercard\Sprint1\Tactic Level Models\Tactic_level_data_imp_clicks_spends.xlsx')
#     media_data.columns = [i.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for i in
#                           media_data.columns]
#     dma = None
#     # is_panel = False
#     # st.session_state['is_panel']=False

#media_data = st.session_state["final_df"]



# st.write(media_data.columns) 

# Order rows chronologically and renumber them from 0.
media_data.sort_values(date_col, inplace=True)
media_data.reset_index(drop=True, inplace=True)

# Keep the date column available after it is dropped from media_data below.
date = media_data[date_col]
st.session_state['date'] = date
y = media_data[target_col]

# Sprint3 - spends for resp curves: every cost/spend column plus the key
# columns (date, and the panel column for panel data).
spend_cols = [c for c in media_data.columns if "_cost" in c.lower() or "_spend" in c.lower()]
key_cols = [date_col, panel_col] if is_panel else [date_col]
spends_data = media_data[spend_cols + key_cols]

media_data.drop([date_col], axis=1, inplace=True)
media_data.reset_index(drop=True, inplace=True)

# Two-column layout used by the widgets further down.
columns = st.columns(2)

old_shape = media_data.shape

if "old_shape" not in st.session_state:
    st.session_state['old_shape'] = old_shape

# with columns[0]:
#     slider_value_adstock = st.slider('Select Adstock Range (only applied to media)', 0.0, 1.0, (0.2, 0.4), step=0.1,
#                                      format="%.2f")
# with columns[1]:
#     slider_value_lag = st.slider('Select Lag Range (applied to media, seasonal, macroeconomic variables)', 1, 7, (1, 3),
#                                  step=1)


# with columns[2]:
#    slider_value_power=st.slider('Select Power range (only applied to media )',0,4,(1,2),step=1)

# with columns[1]:
#    st.number_input('Select the range of half saturation point ',min_value=1,max_value=5)
#    st.number_input('Select the range of  ')

# Section 1 - Transformations Functions
# def lag(data, features, lags, dma=None):
#     if dma:
#
#         transformed_data = pd.concat(
#             [data.groupby([dma])[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
#         # transformed_data = transformed_data.fillna(method='bfill')
#         transformed_data = transformed_data.bfill() # Sprint4 - fillna getting deprecated
#         return pd.concat([transformed_data, data], axis=1)
#
#     else:
#
#         # ''' data should be aggregated on date'''
#
#         transformed_data = pd.concat([data[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
#         # transformed_data = transformed_data.fillna(method='bfill')
#         transformed_data = transformed_data.bfill()
#
#         return pd.concat([transformed_data, data], axis=1)
#
#
# # adstock
# def adstock(df, alphas, cutoff, features, dma=None):
#     if dma:
#         transformed_data = pd.DataFrame()
#         for d in df[dma].unique():
#             dma_sub_df = df[df[dma] == d]
#             n = len(dma_sub_df)
#
#             weights = np.array(
#                 [[[alpha ** (i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)] for
#                  alpha in alphas])
#             X = dma_sub_df[features].to_numpy()
#
#             res = pd.DataFrame(np.hstack(weights @ X),
#                                columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
#
#             transformed_data = pd.concat([transformed_data, res], axis=0)
#             transformed_data.reset_index(drop=True, inplace=True)
#         return pd.concat([transformed_data, df], axis=1)
#
#     else:
#
#         n = len(df)
#
#         weights = np.array(
#             [[[alpha ** (i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)] for alpha in
#              alphas])
#
#         X = df[features].to_numpy()
#         res = pd.DataFrame(np.hstack(weights @ X),
#                            columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
#         return pd.concat([res, df], axis=1)


# Section 2 - Begin Transformations

# Session-state slots and the factory that creates each one's empty value;
# a slot is only created when it does not exist yet.
# 'orig_media_data', 'random_effects', 'pred_train' and 'pred_test' are the
# Sprint3 additions.
_session_defaults = (
    ('media_data', pd.DataFrame),
    ('orig_media_data', pd.DataFrame),
    ('random_effects', pd.DataFrame),
    ('pred_train', list),
    ('pred_test', list),
)
for _state_key, _make_default in _session_defaults:
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
# end of Sprint3 additions

# variables_to_be_transformed=[col for col in media_data.columns if col.lower() not in ['dma','panel'] ] # change for buckets
# variables_to_be_transformed = [col for col in media_data.columns if
#                                '_clicks' in col.lower() or '_impress' in col.lower()]  # srishti - change
#
# with columns[0]:
#     if st.button('Apply Transformations'):
#         with st.spinner('Applying Transformations'):
#             transformed_data_lag = lag(media_data, features=variables_to_be_transformed,
#                                        lags=np.arange(slider_value_lag[0], slider_value_lag[1] + 1, 1), dma=dma)
#
#             # variables_to_be_transformed=[col for col in list(transformed_data_lag.columns) if col not in ['Date','DMA','Panel']] #change for buckets
#             variables_to_be_transformed = [col for col in media_data.columns if
#                                            '_clicks' in col.lower() or '_impress' in col.lower()]  # srishti - change
#
#             transformed_data_adstock = adstock(df=transformed_data_lag,
#                                                alphas=np.arange(slider_value_adstock[0], slider_value_adstock[1], 0.1),
#                                                cutoff=8, features=variables_to_be_transformed, dma=dma)
#
#             # st.success('Done')
#             st.success("Transformations complete!")
#
#             st.write(f'old shape {old_shape}, new shape {transformed_data_adstock.shape}')
#
#             transformed_data_adstock.columns = [c.replace(".", "_") for c in
#                                                 transformed_data_adstock.columns]  # srishti
#             st.session_state['media_data'] = transformed_data_adstock  # srishti
#             # Sprint3
#             orig_media_data = media_data.copy()
#             orig_media_data[date_col] = date
#             orig_media_data[target_col] = y
#             st.session_state['orig_media_data'] = orig_media_data  # srishti
#
#         # with st.spinner('Applying Transformations'):
#         #   time.sleep(2)
#         #   st.success("Transformations complete!")
#
# # if st.session_state['media_data'].shape[1]>old_shape[1]:
# # with columns[0]:
# # st.write(f'Total no.of variables before transformation: {old_shape[1]}, Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')
# # st.write(f'Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')

# Section 3 - Create combinations

# bucket=['paid_search', 'kwai','indicacao','infleux', 'influencer','FB: Level Achieved - Tier 1 Impressions',
#       ' FB: Level Achieved - Tier 2 Impressions','paid_social_others',
#         ' GA App: Will And Cid Pequena Baixo Risco Clicks',
#       'digital_tactic_others',"programmatic"
#       ]

# srishti - bucket names changed
# Channel name fragments. When combinations are created below, a candidate
# feature is assigned to a bucket when the fragment is a substring of the
# feature's column name.
bucket = ['paid_search', 'kwai', 'indicacao', 'infleux', 'influencer', 'fb_level_achieved_tier_2',
          'fb_level_achieved_tier_1', 'paid_social_others',
          'ga_app',
          'digital_tactic_others', "programmatic"
          ]

def _empty_model_results():
    """Return a fresh, empty container for the per-iteration model results."""
    return {'Model_object': [],
            'Model_iteration': [],
            'Feature_set': [],
            'MAPE': [],
            'R2': [],
            'ADJR2': [],
            'pos_count': []
            }


def reset_model_result_dct():
    """Discard all stored model results (used as a widget callback)."""
    st.session_state['Model_results'] = _empty_model_results()


with columns[0]:
    if st.button('Create Combinations of Variables'):

        # For every base variable, the (up to) two matching columns most
        # positively correlated with the target. The list keeps its historical
        # name although only two columns are retained per variable.
        top_3_correlated_features = []

        original_cols = st.session_state['bin_dict']['Media'] + st.session_state['bin_dict']['Internal']

        original_cols = [col.lower().replace('.', '_').replace('@', '_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in original_cols]

        for col in original_cols:  # srishti - new
            # The variable name is matched literally: escaping prevents regex
            # metacharacters in a column name from changing (or breaking) the match.
            candidates = st.session_state['media_data'].filter(regex=re.escape(col))
            # The target is appended separately below; a second copy coming
            # from the filter would make corr()[target_col] ambiguous.
            candidates = candidates.drop(columns=[target_col], errors='ignore')
            corr_df = pd.concat([candidates, y], axis=1).corr()[target_col].iloc[:-1]
            top_3_correlated_features.append(list(corr_df.sort_values(ascending=False).head(2).index))
        flattened_list = [item for sublist in top_3_correlated_features for item in sublist]

        # Group the retained columns by bucket; buckets without any column are left out.
        all_features_set = {}
        for var in bucket:
            matching = [col for col in flattened_list if var in col]
            if matching:
                all_features_set[var] = matching  # srishti

        channels_all = list(all_features_set.values())
        if channels_all:
            st.session_state['combinations'] = list(itertools.product(*channels_all))
            st.session_state['final_selection'] = st.session_state['combinations']
            st.success('Done')
        else:
            # itertools.product() over zero buckets yields a single empty
            # combination, which cannot be modelled; report it instead.
            st.session_state['combinations'] = []
            st.session_state['final_selection'] = st.session_state['combinations']
            st.warning('No candidate features matched any bucket; no combinations were created.')

    y.reset_index(drop=True, inplace=True)
    if 'Model_results' not in st.session_state:
        st.session_state['Model_results'] = _empty_model_results()

    if 'iterations' not in st.session_state:
        st.session_state['iterations'] = 0

    # False means "no combinations have been created yet".
    if 'final_selection' not in st.session_state:
        st.session_state['final_selection'] = False

# Directory (relative to the working directory) where fitted models are pickled.
save_path = r"Model/"
with columns[1]:
    if st.session_state['final_selection']:
        st.write(f'Total combinations created {format_numbers(len(st.session_state["final_selection"]))}')

if st.checkbox('Build all iterations'):
    # 'final_selection' is False until combinations exist; treat that as zero
    # combinations instead of calling len() on a bool.
    iterations = len(st.session_state['final_selection'] or [])
else:
    # Changing the requested count discards previously stored results.
    iterations = st.number_input('Select the number of iterations to perform', min_value=0, step=100,
                                 value=st.session_state['iterations'], on_change=reset_model_result_dct)


if st.button('Build Model', on_click=reset_model_result_dct):
    st.session_state['iterations'] = iterations

    # Section 4 - Model
    st.session_state['media_data'] = st.session_state['media_data'].ffill()
    st.markdown(
        'Data Split -- Training Period: May 9th, 2023 - October 5th,2023 , Testing Period: October 6th, 2023 - November 7th, 2023 ')
    progress_bar = st.progress(0)  # Initialize the progress bar
    start_time = time.time()  # Record the start time
    progress_text = st.empty()

    # The pickles are written below; make sure the target directory exists.
    os.makedirs(save_path, exist_ok=True)


    def _report_progress(completed):
        """Refresh the progress bar and the elapsed-time text."""
        time_elapsed_minutes = (time.time() - start_time) / 60
        progress_bar.progress(completed / int(iterations))
        progress_text.text(
            f'Completed iterations: {completed}/{iterations},Time Elapsed (min): {time_elapsed_minutes:.2f}')


    def _store_valid_model(fitted_model, iteration, features, mape, r2, adjr2, pos_count):
        """Pickle a model that passed the validity check and record its metrics."""
        filename = os.path.join(save_path, f"model_{iteration}.pkl")
        with open(filename, "wb") as model_file:
            pickle.dump(fitted_model, model_file)

        results = st.session_state['Model_results']
        results['Model_object'].append(filename)
        results['Model_iteration'].append(iteration)
        results['Feature_set'].append(features)
        results['MAPE'].append(mape)
        results['R2'].append(r2)
        results['ADJR2'].append(adjr2)
        results['pos_count'].append(pos_count)


    # 'final_selection' is False until combinations have been created.
    candidate_sets = (st.session_state["final_selection"] or [])[0:int(iterations)]
    df = st.session_state['media_data']

    for i, selected_features in enumerate(candidate_sets):  # srishti
        fet = [var for var in selected_features if len(var) > 0]
        if not fet:
            # Nothing to fit for an empty feature set.
            _report_progress(i + 1)
            continue
        inp_vars_str = " + ".join(fet)

        X = df[fet]
        y = df[target_col]
        # NOTE(review): the scaler is fitted on all rows, i.e. before the
        # train/test split below. The model-selection section rebuilds the
        # matrix the same way, so the two must be changed together.
        ss = MinMaxScaler()
        X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)

        if is_panel:
            X[target_col] = y  # Sprint2
            X[panel_col] = df[panel_col]  # Sprint2

            # Fixed row-count split: the first 8000 rows are the training set.
            X_train = X.iloc[:8000]
            y_train = y.iloc[:8000]

            # Mixed linear model grouped by the panel column.
            md_str = target_col + " ~ " + inp_vars_str
            md = smf.mixedlm(md_str,
                             data=X_train[[target_col] + fet],
                             groups=X_train[panel_col])
            mdf = md.fit()
            predicted_values = mdf.fittedvalues

            coefficients = mdf.fe_params.to_dict()
            model_positive = [col for col in coefficients.keys() if coefficients[col] > 0]

            pvalues = [var for var in list(mdf.pvalues) if var <= 0.06]

            if (len(model_positive) / len(selected_features)) > 0 and (
                    len(pvalues) / len(selected_features)) >= 0:  # srishti - changed just for testing, revert later
                mape = mean_absolute_percentage_error(y_train, predicted_values)
                r2 = r2_score(y_train, predicted_values)
                adjr2 = 1 - (1 - r2) * (len(y_train) - 1) / (len(y_train) - len(selected_features) - 1)

                _store_valid_model(mdf, i, fet, mape, r2, adjr2, len(model_positive))
        else:
            X = sm.add_constant(X)
            # Fixed row-count split: the first 130 rows are the training set.
            X_train = X.iloc[:130]
            y_train = y.iloc[:130]

            model = sm.OLS(y_train, X_train).fit()

            coefficients = model.params.to_list()
            model_positive = [coef for coef in coefficients if coef > 0]
            predicted_values = model.predict(X_train)
            pvalues = [var for var in list(model.pvalues) if var <= 0.06]

            if (len(model_positive) / len(selected_features)) > 0 and (len(pvalues) / len(
                    selected_features)) >= 0.5:  # srishti - changed just for testing, revert later VALID MODEL CRITERIA
                mape = mean_absolute_percentage_error(y_train, predicted_values)
                adjr2 = model.rsquared_adj
                r2 = model.rsquared

                _store_valid_model(model, i, fet, mape, r2, adjr2, len(model_positive))

        _report_progress(i + 1)

    st.write(
        f'Out of {st.session_state["iterations"]} iterations : {len(st.session_state["Model_results"]["Model_object"])} valid models')

    pd.DataFrame(st.session_state['Model_results']).to_csv('model_output.csv')


    def to_percentage(value):
        """Format a fraction as a percentage with one decimal place."""
        return f'{value * 100:.1f}%'

## Section 5 - Select Model
st.title('2. Select Models')
# 'tick' stores the state of the "show results" checkbox below so it can be
# passed back in as the checkbox's default value.
if 'tick' not in st.session_state:
    st.session_state['tick'] = False
if st.checkbox('Show results of top 10 models (based on MAPE and Adj. R2)', value=st.session_state['tick']):
    st.session_state['tick'] = True
    st.write('Select one model iteration to generate performance metrics for it:')


    def _format_percentage(value):
        """Format a fraction as a percentage with one decimal place."""
        return f'{value * 100:.1f}%'


    # The formatter is defined here because this section also runs on reruns
    # in which the 'Build Model' button branch (which defines its own
    # formatter) is not executed.
    data = pd.DataFrame(st.session_state['Model_results'])
    # Sprint4 -- Srishti -- only show models with the lowest num of neg coeffs
    data = data[data['pos_count'] == data['pos_count'].max()].reset_index(drop=True)
    data.sort_values(by=['ADJR2'], ascending=False, inplace=True)
    data.drop_duplicates(subset='Model_iteration', inplace=True)

    # Work on a copy so the formatted columns do not write into a slice of `data`.
    top_10 = data.head(10).copy()
    top_10['Rank'] = np.arange(1, len(top_10) + 1, 1)
    top_10[['MAPE', 'R2', 'ADJR2']] = np.round(top_10[['MAPE', 'R2', 'ADJR2']], 4).applymap(_format_percentage)
    top_10_table = top_10[['Rank', 'Model_iteration', 'MAPE', 'ADJR2', 'R2']]

    gd = GridOptionsBuilder.from_dataframe(top_10_table)
    gd.configure_pagination(enabled=True)

    # NOTE(review): pre_selected_rows=[1] refers to the second row, not the
    # top-ranked one — confirm that this is intended.
    gd.configure_selection(
        use_checkbox=True,
        selection_mode="single",
        pre_select_all_rows=False,
        pre_selected_rows=[1],
    )

    gridoptions = gd.build()

    table = AgGrid(top_10, gridOptions=gridoptions, update_mode=GridUpdateMode.SELECTION_CHANGED)

    selected_rows = table.selected_rows
    st.session_state["selected_rows"] = selected_rows
    if 'Model' not in st.session_state:
        st.session_state['Model'] = {}

    # Section 6 - Display Results

    # Section 6 - Display Results

    if len(selected_rows) > 0:
        st.header('2.1 Results Summary')

        # Rows of the results table that belong to the iteration ticked in the grid.
        chosen_iteration = selected_rows[0]['Model_iteration']
        chosen_rows = data[data['Model_iteration'] == chosen_iteration]
        model_object = chosen_rows['Model_object']
        features_set = chosen_rows['Feature_set']

        # 'Model_object' holds the path of the pickle written when the model was built.
        model_path = str(model_object.values[0])
        with open(model_path, 'rb') as file:
            model = pickle.load(file)
        st.write(model.summary())
        st.header('2.2 Actual vs. Predicted Plot')
        st.header('2.2 Actual vs. Predicted Plot')

        # Rebuild the design matrix the same way it was built for fitting:
        # selected features, min-max scaled over all rows.
        chosen_features = features_set.values[0]
        df = st.session_state['media_data']
        X = df[chosen_features]
        y = df[target_col]

        ss = MinMaxScaler()
        X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)

        if is_panel:
            # Sprint2 changes
            X[target_col] = y  # new
            X[panel_col] = df[panel_col]
            split_row = 8000
        else:
            X = sm.add_constant(X)
            split_row = 130
        X[date_col] = date

        # Same fixed row-count split that is used when the models are built.
        X_train = X.iloc[:split_row]
        X_test = X.iloc[split_row:].reset_index(drop=True)
        y_train = y.iloc[:split_row]
        y_test = y.iloc[split_row:].reset_index(drop=True)

        test_spends = spends_data[split_row:]  # Sprint3 - test spends for resp curves

        if is_panel:
            random_eff_df = get_random_effects(media_data, panel_col, model)
            train_pred = model.fittedvalues
            test_pred = mdf_predict(X_test, model, random_eff_df)
            print("__" * 20, test_pred.isna().sum())
        else:
            # The OLS model was fitted on an array-like exog, so predict()
            # applies coefficients by position, not by column name. Select the
            # columns in the order of the fitted parameters (the constant
            # first); the previous features + ['const'] order paired
            # coefficients with the wrong columns.
            exog_columns = list(model.params.index)
            train_pred = model.predict(X_train[exog_columns])
            test_pred = model.predict(X_test[exog_columns])


        # save x test to test - srishti
        x_test_to_save = X_test.copy()
        x_test_to_save['Actuals'] = y_test
        x_test_to_save['Predictions'] = test_pred

        x_train_to_save = X_train.copy()
        x_train_to_save['Actuals'] = y_train
        x_train_to_save['Predictions'] = train_pred

        x_train_to_save.to_csv('Test/x_train_to_save.csv', index=False)
        x_test_to_save.to_csv('Test/x_test_to_save.csv', index=False)

        st.session_state['X'] = X_train
        st.session_state['features_set'] = features_set.values[0]
        print("**" * 20, "selected model features : ", features_set.values[0])
        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train[date_col], y_train, train_pred,
                                                                                 model, target_column=sel_target_col,
                                                                                 is_panel=is_panel)  # Sprint2

        # Show the train-set actual vs. predicted figure built above.
        st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

        # Residual diagnostics on the training split, laid out in two columns.
        # The three plotting helpers are not defined in this file; they
        # presumably come from the Data_prep_functions star import.
        st.markdown('## 2.3 Residual Analysis')
        columns = st.columns(2)
        with columns[0]:
            fig = plot_residual_predicted(y_train, train_pred, X_train)  # Sprint2
            st.plotly_chart(fig)

        with columns[1]:
            st.empty()
            fig = qqplot(y_train, train_pred)  # Sprint2
            st.plotly_chart(fig)

        # Re-entering the first column appends below the residual plot.
        with columns[0]:
            fig = residual_distribution(y_train, train_pred)  # Sprint2
            st.pyplot(fig)

        # Variance inflation factor of every regressor in the training matrix,
        # drawn as a colour-coded horizontal bar chart.
        vif_data = pd.DataFrame()
        # X=X.drop('const',axis=1)
        X_train_orig = X_train.copy()  # Sprint2 -- creating a copy of xtrain. Later deleting panel, target & date from xtrain
        # Only drop the helper columns that are actually present.
        del_col_list = list(set([target_col, panel_col, date_col]).intersection(list(X_train.columns)))
        # NOTE(review): X_train is the same object stored in
        # st.session_state['X'] earlier, so this in-place drop presumably also
        # changes that stored frame — confirm that this is intended.
        X_train.drop(columns=del_col_list, inplace=True)  # Sprint2

        vif_data["Variable"] = X_train.columns
        vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
        vif_data.sort_values(by=['VIF'], ascending=False, inplace=True)
        vif_data = np.round(vif_data)
        vif_data['VIF'] = vif_data['VIF'].astype(float)
        st.header('2.4 Variance Inflation Factor (VIF)')
        # st.dataframe(vif_data)
        # Bar colour by VIF band: below 3, 3 to 10 inclusive, above 10.
        color_mapping = {
            'darkgreen': (vif_data['VIF'] < 3),
            'orange': (vif_data['VIF'] >= 3) & (vif_data['VIF'] <= 10),
            'darkred': (vif_data['VIF'] > 10)
        }

        # Create a horizontal bar plot
        fig, ax = plt.subplots()
        fig.set_figwidth(10)  # Adjust the width of the figure as needed

        # Sort the bars by descending VIF values
        vif_data = vif_data.sort_values(by='VIF', ascending=False)

        # Iterate through the color mapping and plot bars with corresponding colors
        for color, condition in color_mapping.items():
            # The boolean masks were built before the re-sort; pandas aligns
            # them with the frame by index label.
            subset = vif_data[condition]
            bars = ax.barh(subset["Variable"], subset["VIF"], color=color, label=color)

            # Add text annotations on top of the bars
            for bar in bars:
                width = bar.get_width()
                ax.annotate(f'{width:}', xy=(width, bar.get_y() + bar.get_height() / 2), xytext=(5, 0),
                            textcoords='offset points', va='center')

        # Customize the plot
        ax.set_xlabel('VIF Values')
        # ax.set_title('2.4 Variance Inflation Factor (VIF)')
        # ax.legend(loc='upper right')

        # Display the plot in Streamlit
        st.pyplot(fig)

        # Collapsible section showing the same diagnostics as above for the test split.
        with st.expander('Results Summary Test data'):
            st.header('2.2 Actual vs. Predicted Plot')

            # Only the third returned element (the figure) is rendered here.
            metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
                X_test[date_col], y_test, test_pred, model,
                target_column=sel_target_col,
                is_panel=is_panel)  # Sprint2

            st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

            st.markdown('## 2.3 Residual Analysis')
            columns = st.columns(2)
            # The residual plots pair test_pred / X_test with y_test (the test
            # target), mirroring the y_train / train_pred pairing used for the
            # training split.
            with columns[0]:
                fig = plot_residual_predicted(y_test, test_pred, X_test)  # Sprint2
                st.plotly_chart(fig)

            with columns[1]:
                st.empty()
                fig = qqplot(y_test, test_pred)  # Sprint2
                st.plotly_chart(fig)

            with columns[0]:
                fig = residual_distribution(y_test, test_pred)  # Sprint2
                st.pyplot(fig)

        # ---- Save the current model so it can be tuned on the next page ----
        value = False
        save_button_model = st.checkbox('Save this model to tune', key='build_rc_cb')  # , on_click=set_save())

        if save_button_model:
            mod_name = st.text_input('Enter model name')
            if mod_name:
                mod_name = mod_name + "__" + target_col  # Sprint4 - adding target col to model name

                if is_panel:
                    # Sprint3 -- derive the per-panel random effects from the fitted
                    # model *before* they are needed for the test prediction, so
                    # this block does not depend on an earlier binding of
                    # random_eff_df.
                    random_eff_df = get_random_effects(media_data, panel_col, model)
                    st.session_state['random_effects'] = random_eff_df
                    pred_train = model.fittedvalues
                    pred_test = mdf_predict(X_test, model, random_eff_df)
                else:
                    # Streamlit re-executes the script on every interaction, so add
                    # 'const' only once; a duplicated entry would select the column
                    # twice and change the shape of the frame passed to predict().
                    if 'const' not in st.session_state['features_set']:
                        st.session_state['features_set'] = st.session_state['features_set'] + ['const']
                    pred_train = model.predict(X_train_orig[st.session_state['features_set']])
                    pred_test = model.predict(X_test[st.session_state['features_set']])

                # Everything needed to reload / tune this model later, keyed by name.
                st.session_state['Model'][mod_name] = {"Model_object": model,
                                                       'feature_set': st.session_state['features_set'],
                                                       'X_train': X_train_orig,
                                                       'X_test': X_test,
                                                       'y_train': y_train,
                                                       'y_test': y_test,
                                                       'pred_train': pred_train,
                                                       'pred_test': pred_test
                                                       }
                st.session_state['X_train'] = X_train_orig
                st.session_state['X_test_spends'] = test_spends

                # Register the name once, even if this branch runs again on a rerun.
                if mod_name not in st.session_state['saved_model_names']:
                    st.session_state['saved_model_names'].append(mod_name)

                # Persist all saved models; the file is held open only for the dump.
                with open("best_models.pkl", "wb") as f:
                    pickle.dump(st.session_state['Model'], f)

                st.success(mod_name + ' model saved! Proceed to the next page to tune the model')

                # Sprint4 - add the formatted name of the target col to used resp metrics
                urm = st.session_state['used_response_metrics']
                urm.append(sel_target_col)
                st.session_state['used_response_metrics'] = list(set(urm))
                mod_name = ""
                value = False