# Spaces: BlendMMM / Mastercard (Sleeping)
#
# File size: 25,862 Bytes
#
# bd80083

'''
MMO Build Sprint 3
date :
changes : capability to tune MixedLM as well as simple LR in the same page
'''

import streamlit as st
import pandas as pd
from Eda_functions import format_numbers
import pickle
from utilities import set_header, load_local_css
import statsmodels.api as sm
import re
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor

st.set_option('deprecation.showPyplotGlobalUse', False)
import statsmodels.formula.api as smf
from Data_prep_functions import *

# for i in ["model_tuned", "X_train_tuned", "X_test_tuned", "tuned_model_features", "tuned_model", "tuned_model_dict"] :

# Page-level Streamlit configuration, followed by the shared stylesheet and header.
_page_settings = {
    "page_title": "Model Tuning",
    "page_icon": ":shark:",
    "layout": "wide",
    "initial_sidebar_state": 'collapsed',
}
st.set_page_config(**_page_settings)
load_local_css('styles.css')
set_header()

# Sprint3
# Name of the date column used throughout this page.
date_col = 'date'

# Normalise every 'Panel Level 1' column name the same way (lower-case, then
# strip/replace punctuation) and use the first one as the panel column.
_panel_candidates = [
    col.lower()
    .replace('.', '_')
    .replace('@', '_')
    .replace(" ", "_")
    .replace('-', '')
    .replace(':', '')
    .replace("__", "_")
    for col in st.session_state['bin_dict']['Panel Level 1']
]

# An empty 'Panel Level 1' list means the data has no panel dimension. Fall
# back to an empty name instead of indexing [0] (which raised IndexError), so
# the non-panel branches further down the page are actually reachable.
panel_col = _panel_candidates[0] if _panel_candidates else ''
is_panel = bool(panel_col)


# flag indicating there is not tuned model till now

# Sprint4 - container for tuned models; created once per session.
_state = st.session_state
if 'Model_Tuned' not in _state:
    _state['Model_Tuned'] = {}

st.title('1. Model Tuning')

# Nothing can be tuned until a model has been built and saved, so halt the
# page with an explanation when the training data is absent.
_has_saved_model = "X_train" in _state
if not _has_saved_model:
    st.error(
        "Oops! It seems there are no saved models available. Please build and save a model from the previous page to proceed.")
    st.stop()

# Per-target markers recording whether a tuned model exists.
if "is_tuned_model" not in _state:
    _state["is_tuned_model"] = {}
# Sprint4 - if used_response_metrics is not blank, then select one of the used_response_metrics, else target is revenue by default
if "used_response_metrics" in st.session_state and st.session_state['used_response_metrics'] != []:
    sel_target_col = st.selectbox("Select the response metric", st.session_state['used_response_metrics'])
    target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")

else:
    sel_target_col = 'Total Approved Accounts - Revenue'
    target_col = 'total_approved_accounts_revenue'

# Sprint4 - Look through all saved models, only show saved models of the sel resp metric (target_col).
# Saved names have the form "<model>__<target>"; names without the separator are skipped
# rather than crashing the page with an IndexError.
saved_models = st.session_state.get('saved_model_names', [])
required_saved_models = []
for _saved_name in saved_models:
    _name_parts = _saved_name.split("__")
    if len(_name_parts) > 1 and _name_parts[1] == target_col:
        required_saved_models.append(_name_parts[0])

sel_model = st.selectbox("Select the model to tune", required_saved_models)

# With no matching model the selectbox yields None and the key lookup below
# would fail with a TypeError; stop with a readable message instead.
if sel_model is None:
    st.error(
        "No saved models found for the selected response metric. "
        "Please build and save a model from the previous page to proceed.")
    st.stop()

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# best_models.pkl must only ever come from a trusted source.
with open("best_models.pkl", 'rb') as file:
    model_dict = pickle.load(file)

sel_model_dict = model_dict[sel_model + "__" + target_col]  # Sprint4 - get the model obj of the selected model

# Train/test splits stored alongside the selected model.
X_train = sel_model_dict['X_train']
X_test = sel_model_dict['X_test']
y_train = sel_model_dict['y_train']
y_test = sel_model_dict['y_test']
df = st.session_state['media_data']

if 'selected_model' not in st.session_state:
    st.session_state['selected_model'] = 0

st.markdown('### 1.1 Event Flags')
st.markdown('Helps in quantifying the impact of specific occurrences of events')
with st.expander('Apply Event Flags'):
    # Model object, training frame and feature list of the model selected above.
    model = sel_model_dict['Model_object']
    date = st.session_state['date']
    date = pd.to_datetime(date)
    X_train = sel_model_dict['X_train']
    features_set = sel_model_dict["feature_set"]

    # Flag window pickers; both default to, and are bounded by, the date range.
    col = st.columns(3)
    min_date = min(date)
    max_date = max(date)
    with col[0]:
        start_date = st.date_input('Select Start Date', min_date, min_value=min_date, max_value=max_date)
    with col[1]:
        end_date = st.date_input('Select End Date', max_date, min_value=min_date, max_value=max_date)
    with col[2]:
        repeat_choice = st.selectbox('Repeat Annually', ['Yes', 'No'], index=1)
    repeat = repeat_choice == 'Yes'

    if 'Flags' not in st.session_state:
        st.session_state['Flags'] = {}

    # Panel models expose in-sample fitted values and a stored test prediction;
    # non-panel models are asked to predict on the selected features.
    if is_panel:  # Sprint3
        pred_train = model.fittedvalues
        pred_test = sel_model_dict['pred_test']
    else:
        pred_train = model.predict(X_train[features_set])
        pred_test = model.predict(X_test[features_set])

    # target_column is passed in both modes so the plots are labelled the same
    # way as every other actual-vs-predicted call on this page.
    met, line_values, fig_flag = plot_actual_vs_predicted(X_train[date_col], y_train,
                                                          pred_train, model,
                                                          target_column=sel_target_col,
                                                          flag=(start_date, end_date),
                                                          repeat_all_years=repeat, is_panel=is_panel)
    st.plotly_chart(fig_flag, use_container_width=True)

    # create flag on test (only the flag values are kept; the figure is not shown)
    met, test_line_values, fig_flag = plot_actual_vs_predicted(X_test[date_col], y_test,
                                                               pred_test, model,
                                                               target_column=sel_target_col,
                                                               flag=(start_date, end_date),
                                                               repeat_all_years=repeat, is_panel=is_panel)

    flag_name = st.text_input('Enter Flag Name')
    # Sprint4 - add selected target col to flag name
    if st.button('Update flag'):
        flag_name = flag_name.strip()
        # Flags are stored under "<name>__<target>" and split on "__" below, so a
        # blank name or a name containing the separator cannot be stored safely.
        if not flag_name or "__" in flag_name:
            st.error('Please enter a non-empty flag name that does not contain "__".')
        else:
            flag_key = flag_name + '__' + target_col
            st.session_state['Flags'][flag_key] = {
                'train': line_values,
                'test': test_line_values,
            }
            st.success(f'{flag_key} stored')

    # Sprint4 - only show flag created for the particular target col
    st.write(st.session_state['Flags'].keys() )
    target_model_flags = [f.split("__")[0] for f in st.session_state['Flags'].keys() if f.split("__")[1] == target_col]
    options = list(target_model_flags)
    selected_options = []
    # Checkbox grid geometry: 4 columns, rows = ceil(len(options) / 4).
    num_columns = 4
    num_rows = -(-len(options) // num_columns)

# 'Select all' pre-ticks every flag checkbox rendered below.
tick = bool(st.checkbox('Select all'))

# Lay the flags out row by row, num_columns per row, and collect the ticked ones.
selected_options = []
for _row_start in range(0, len(options), num_columns):
    _row_cells = st.columns(num_columns)
    _row_flags = options[_row_start:_row_start + num_columns]
    for _cell, _flag_label in zip(_row_cells, _row_flags):
        if _cell.checkbox(_flag_label, value=tick):
            selected_options.append(_flag_label)

st.markdown('### 1.2 Select Parameters to Apply')
# One column per optional engineered feature; each holds a checkbox and a short description.
parameters = st.columns(3)
_trend_slot, _week_slot, _wave_slot = parameters

with _trend_slot:
    Trend = st.checkbox("**Trend**")
    st.markdown('Helps account for long-term trends or seasonality that could influence advertising effectiveness')

with _week_slot:
    week_number = st.checkbox('**Week_number**')
    st.markdown('Assists in detecting and incorporating weekly patterns or seasonality')

with _wave_slot:
    sine_cosine = st.checkbox('**Sine and Cosine Waves**')
    st.markdown('Helps in capturing cyclical patterns or seasonality in the data')
#
# def get_tuned_model():
#     st.session_state['build_tuned_model']=True

def _stack_panels(parts, template):
    """Concatenate per-panel frames; return an empty copy of *template* when there are none."""
    if parts:
        return pd.concat(parts)
    return template.iloc[0:0].copy()


def _add_panel_trend(train_df, test_df, panel_column, date_column):
    """Add a per-panel 'Trend' counter to both frames.

    Within each panel the training rows are sorted by date and numbered
    1..n; the test rows of the same panel continue at n+1. The returned
    frames are re-ordered panel by panel and keep their original index labels.
    """
    train_parts = []
    train_lengths = {}
    for panel, group in train_df.groupby(panel_column):
        group = group.sort_values(date_column)
        train_lengths[panel] = len(group)
        train_parts.append(group.assign(Trend=np.arange(1, len(group) + 1, 1)))

    test_parts = []
    for panel, group in test_df.groupby(panel_column):
        group = group.sort_values(date_column)
        # A panel that only exists in the test split starts counting at 1.
        start = train_lengths.get(panel, 0) + 1
        test_parts.append(group.assign(Trend=np.arange(start, start + len(group), 1)))

    return _stack_panels(train_parts, train_df), _stack_panels(test_parts, test_df)


def _add_panel_waves(train_df, test_df, panel_column, frequency):
    """Add per-panel 'sine_wave' / 'cosine_wave' columns to both frames.

    Rows are numbered in their existing order within each panel, starting at
    0 for training; the test rows of a panel continue where its training
    rows stopped. The returned frames are re-ordered panel by panel.
    """
    train_parts = []
    train_lengths = {}
    for panel, group in train_df.groupby(panel_column):
        steps = np.arange(len(group))
        train_lengths[panel] = len(group)
        train_parts.append(group.assign(sine_wave=np.sin(frequency * steps),
                                        cosine_wave=np.cos(frequency * steps)))

    test_parts = []
    for panel, group in test_df.groupby(panel_column):
        start = train_lengths.get(panel, 0)
        steps = np.arange(start, start + len(group), 1)
        test_parts.append(group.assign(sine_wave=np.sin(frequency * steps),
                                       cosine_wave=np.cos(frequency * steps)))

    return _stack_panels(train_parts, train_df), _stack_panels(test_parts, test_df)


if st.button('Build model with Selected Parameters and Flags', key='build_tuned_model'):
    # Start from a fresh list so the saved model's feature_set is never mutated.
    new_features = list(features_set)
    st.header('2.1 Results Summary')

    # .copy(): the frames below get new columns, which must not be written
    # through to (or warn about) the stored X_train / X_test slices.
    X_train_tuned = X_train[features_set].copy()
    X_test_tuned = X_test[features_set].copy()
    if is_panel:
        # The mixed model needs the target, and the panel-wise features need
        # the date and panel columns, inside the tuned frames.
        for _extra_col in (target_col, date_col, panel_col):
            X_train_tuned[_extra_col] = X_train[_extra_col]
        for _extra_col in (target_col, date_col, panel_col):
            X_test_tuned[_extra_col] = X_test[_extra_col]

    for flag in selected_options:
        # Sprint4 - flags are stored under "<flag>__<target>"
        flag_values = st.session_state['Flags'][flag + "__" + target_col]
        X_train_tuned[flag] = flag_values['train']
        X_test_tuned[flag] = flag_values['test']

    if Trend:
        # Sprint3 - group by panel, calculate trend of each panel separately. Add trend to new feature set
        if is_panel:
            X_train_tuned, X_test_tuned = _add_panel_trend(X_train_tuned, X_test_tuned, panel_col, date_col)
        else:
            n_train = len(X_train_tuned)
            X_train_tuned['Trend'] = np.arange(1, n_train + 1, 1)
            X_test_tuned['Trend'] = np.arange(n_train + 1, n_train + len(X_test_tuned) + 1, 1)
        new_features = new_features + ['Trend']

    if week_number:
        # Sprint3 - 'Week_number' holds the day of the week of each date.
        # Panel frames carry their own (possibly re-ordered) date column.
        train_dates = X_train_tuned[date_col] if is_panel else X_train[date_col]
        test_dates = X_test_tuned[date_col] if is_panel else X_test[date_col]
        train_weekday = pd.to_datetime(train_dates).dt.day_of_week
        if is_panel and train_weekday.nunique() == 1:
            # Nothing is added to either frame, so train and test stay consistent.
            st.write("All dates in the data are of the same week day. Hence Week number can't be used.")
        else:
            X_train_tuned['Week_number'] = train_weekday
            X_test_tuned['Week_number'] = pd.to_datetime(test_dates).dt.day_of_week
            new_features = new_features + ['Week_number']

    if sine_cosine:
        # Sprint3 - create sine cosine waves (panel wise for panel data). add to new feature set
        frequency = 2 * np.pi / 365  # Adjust the frequency as needed
        if is_panel:
            X_train_tuned, X_test_tuned = _add_panel_waves(X_train_tuned, X_test_tuned, panel_col, frequency)
        else:
            n_train = len(X_train_tuned)
            train_steps = np.arange(n_train)
            test_steps = np.arange(n_train, n_train + len(X_test_tuned), 1)
            # Assign the arrays positionally. pd.concat(axis=1) aligns on the
            # index and pads NaN rows whenever a frame's index is not 0..n-1,
            # which is the usual case for the test split.
            X_train_tuned['sine_wave'] = np.sin(frequency * train_steps)
            X_train_tuned['cosine_wave'] = np.cos(frequency * train_steps)
            X_test_tuned['sine_wave'] = np.sin(frequency * test_steps)
            X_test_tuned['cosine_wave'] = np.cos(frequency * test_steps)
        new_features = new_features + ['sine_wave', 'cosine_wave']

    # model
    if selected_options:
        new_features = new_features + selected_options
    # De-duplicate while keeping a deterministic order; list(set(...)) shuffled
    # the columns, which matters wherever the design matrix is positional.
    new_features = list(dict.fromkeys(new_features))

    if is_panel:
        md_str = target_col + " ~ " + " + ".join(new_features)
        md_tuned = smf.mixedlm(md_str,
                               data=X_train_tuned[[target_col] + new_features],
                               groups=X_train_tuned[panel_col])
        model_tuned = md_tuned.fit()

        base_pred_train = model.fittedvalues
        tuned_pred_train = model_tuned.fittedvalues
        # The tuned frame may have been re-ordered panel by panel, so take the
        # dates and actuals from it to stay row-aligned with fittedvalues.
        tuned_dates = X_train_tuned[date_col]
        tuned_actual = X_train_tuned[target_col]
    else:
        model_tuned = sm.OLS(y_train, X_train_tuned[new_features]).fit()

        base_pred_train = model.predict(X_train[features_set])
        # Predict on exactly the columns (and column order) used for fitting.
        tuned_pred_train = model_tuned.predict(X_train_tuned[new_features])
        # Training dates come from the training frame, not a fixed-length slice.
        tuned_dates = X_train[date_col]
        tuned_actual = y_train

    # plot act v pred for original model and tuned model
    metrics_table, _, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train[date_col], y_train,
                                                                          base_pred_train, model,
                                                                          target_column=sel_target_col,
                                                                          is_panel=is_panel)
    metrics_table_tuned, _, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(tuned_dates, tuned_actual,
                                                                                      tuned_pred_train, model_tuned,
                                                                                      target_column=sel_target_col,
                                                                                      is_panel=is_panel)

    # Rows 0, 1, 2 of the second column are read as MAPE, R2 and adjusted R2.
    mape = np.round(metrics_table.iloc[0, 1], 2)
    r2 = np.round(metrics_table.iloc[1, 1], 2)
    adjr2 = np.round(metrics_table.iloc[2, 1], 2)

    mape_tuned = np.round(metrics_table_tuned.iloc[0, 1], 2)
    r2_tuned = np.round(metrics_table_tuned.iloc[1, 1], 2)
    adjr2_tuned = np.round(metrics_table_tuned.iloc[2, 1], 2)

    # Tuned metrics, each with its delta against the original model.
    parameters_ = st.columns(3)
    with parameters_[0]:
        st.metric('R2', r2_tuned, np.round(r2_tuned - r2, 2))
    with parameters_[1]:
        st.metric('Adjusted R2', adjr2_tuned, np.round(adjr2_tuned - adjr2, 2))
    with parameters_[2]:
        st.metric('MAPE', mape_tuned, np.round(mape_tuned - mape, 2), 'inverse')
    st.write(model_tuned.summary())

    # Store date and target with the tuned frames (index-aligned assignment).
    X_train_tuned[date_col] = X_train[date_col]
    X_test_tuned[date_col] = X_test[date_col]
    X_train_tuned[target_col] = y_train
    X_test_tuned[target_col] = y_test

    st.header('2.2 Actual vs. Predicted Plot')
    # The tuned actual-vs-predicted figure was already built for the metrics
    # above from the same dates, actuals and predictions, so it is reused.
    st.plotly_chart(actual_vs_predicted_plot_tuned, use_container_width=True)

    st.markdown('## 2.3 Residual Analysis')
    # Panel mode: use the actuals stored in the tuned frame so they share the
    # row order of the fitted values.
    residual_actual = X_train_tuned[target_col] if is_panel else y_train
    residual_frame = X_train_tuned if is_panel else X_train

    columns = st.columns(2)
    with columns[0]:
        fig = plot_residual_predicted(residual_actual, tuned_pred_train, residual_frame)
        st.plotly_chart(fig)

    with columns[1]:
        st.empty()
        fig = qqplot(residual_actual, tuned_pred_train)
        st.plotly_chart(fig)

    with columns[0]:
        fig = residual_distribution(residual_actual, tuned_pred_train)
        st.pyplot(fig)

    st.session_state['is_tuned_model'][target_col] = True
    # Sprint4 - saved tuned model in a dict
    st.session_state['Model_Tuned'][sel_model + "__" + target_col] = {
        "Model_object": model_tuned,
        'feature_set': new_features,
        'X_train_tuned': X_train_tuned,
        'X_test_tuned': X_test_tuned,
    }

# Offer to persist the tuned models only once a tuned model exists for the
# current selection. 'Model_Tuned' is initialised to an empty dict, so the
# previous "is not None" test was always true and the page could report a
# tuned model as saved when none had been built.
_tuned_key = sel_model + "__" + target_col
if _tuned_key in st.session_state['Model_Tuned']:
    if st.checkbox('Use this model to build response curves', key='save_model'):
        st.session_state["is_tuned_model"][target_col] = True
        # The whole tuned-model dict (every model/target tuned this session) is written.
        with open("tuned_model.pkl", "wb") as f:
            pickle.dump(st.session_state['Model_Tuned'], f)  # Sprint4

        st.success(sel_model + "__" + target_col + ' Tuned saved!')


    # if is_panel:
    #     # st.session_state["tuned_model_features"] = new_features
    #     with open("tuned_model.pkl", "wb") as f:
    #         # pickle.dump(st.session_state['tuned_model'], f)
    #         pickle.dump(st.session_state['Model_Tuned'], f)  # Sprint4
    #     st.success(sel_model + "__" + target_col + ' Tuned saved!')

#   raw_data=df[features_set]
#   columns_raw=[re.split(r"(_lag|_adst)",col)[0] for col in raw_data.columns]
#   raw_data.columns=columns_raw
#   columns_media=[col for col in columns_raw if Categorised_data[col]['BB']=='Media']
#   raw_data=raw_data[columns_media]

#   raw_data['Date']=list(df.index)

#   spends_var=[col for col in df.columns if "spends" in col.lower() and 'adst' not in col.lower() and 'lag' not in col.lower()]
#   spends_df=df[spends_var]
#   spends_df['Week']=list(df.index)


#   j=0
#   X1=X.copy()
#   col=X1.columns
#   for i in model.params.values:
#       X1[col[j]]=X1.iloc[:,j]*i
#       j+=1
#   contribution_df=X1
#   contribution_df['Date']=list(df.index)
#   excel_file='Overview_data.xlsx'

#   with pd.ExcelWriter(excel_file,engine='xlsxwriter') as writer:
#      raw_data.to_excel(writer,sheet_name='RAW DATA MMM',index=False)
#      spends_df.to_excel(writer,sheet_name='SPEND INPUT',index=False)
#      contribution_df.to_excel(writer,sheet_name='CONTRIBUTION MMM')