Spaces:

BlendMMM
/

Mastercard

Sleeping

File size: 19,414 Bytes

'''
MMO Build Sprint 3
date :
changes : adds the capability to tune a MixedLM (panel) model as well as a simple linear regression (LR) on the same page
'''

import streamlit as st
import pandas as pd
from Eda_functions import format_numbers
import pickle
from utilities import set_header,load_local_css
import statsmodels.api as sm
import re
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
st.set_option('deprecation.showPyplotGlobalUse', False)
import statsmodels.formula.api as smf
from Data_prep_functions import *

# Make sure every session-state slot this page may publish exists up front
# (None means "nothing tuned yet").
_tuning_state_slots = ("model_tuned", "X_train_tuned", "X_test_tuned", "tuned_model_features")
for state_key in _tuning_state_slots:
    if state_key in st.session_state:
        continue
    st.session_state[state_key] = None

# Page chrome: browser-tab config, then the shared stylesheet and header.
_page_settings = {
    "page_title": "Model Tuning",
    "page_icon": ":shark:",
    "layout": "wide",
    "initial_sidebar_state": 'collapsed',
}
st.set_page_config(**_page_settings)
load_local_css('styles.css')
set_header()

# Sprint3
# Page-level configuration for the panel (MixedLM) flow.
is_panel = True
panel_col = 'markets'  # set the panel column
date_col = 'date'
target_col = 'total_approved_accounts_revenue'

st.title('1. Model Tuning')

# Everything read from session state below is produced by the model-build page.
# Check all of the keys (not just "X_train") so that a partially populated
# session shows the friendly message instead of a raw KeyError.
_required_state_keys = ("X_train", "X_test", "y_train", "y_test", "media_data")
if any(state_key not in st.session_state for state_key in _required_state_keys):
    st.error(
        "Oops! It seems there are no saved models available. Please build and save a model from the previous page to proceed.")
    st.stop()

X_train = st.session_state['X_train']
X_test = st.session_state['X_test']
y_train = st.session_state['y_train']
y_test = st.session_state['y_test']
df = st.session_state['media_data']

# st.write(X_train.columns)
# st.write(X_test.columns)

# Load the candidate models saved by the model-build page.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# "best_models.pkl" must only ever be a file written by this application.
try:
    with open("best_models.pkl", 'rb') as file:
        model_dict = pickle.load(file)
except FileNotFoundError:
    # Without the pickle there is nothing to tune; stop cleanly instead of crashing.
    st.error("No saved models file (best_models.pkl) was found. Please build and save a model from the previous page to proceed.")
    st.stop()

if not model_dict:
    # An empty dict would make the model selectbox return None and fail on lookup.
    st.error("The saved models file is empty. Please build and save a model from the previous page to proceed.")
    st.stop()

if 'selected_model' not in st.session_state:
    st.session_state['selected_model'] = 0

# st.write(model_dict[st.session_state["selected_model"]]['X_train'].columns)

st.markdown('### 1.1 Event Flags')
st.markdown('Helps in quantifying the impact of specific occurrences of events')
with st.expander('Apply Event Flags'):
    # Pick the saved model to work on and pull its artefacts out of model_dict.
    st.session_state["selected_model"] = st.selectbox('Select Model to apply flags', model_dict.keys())
    _selected_entry = model_dict[st.session_state["selected_model"]]
    model = _selected_entry['Model_object']
    date = pd.to_datetime(st.session_state['date'])
    # From here on X_train is the selected model's training frame, not the session-wide one.
    X_train = _selected_entry['X_train']
    features_set = _selected_entry['feature_set']

    # Flag window selection, bounded by the available date range.
    col = st.columns(3)
    min_date = min(date)
    max_date = max(date)
    with col[0]:
        start_date = st.date_input('Select Start Date', min_date, min_value=min_date, max_value=max_date)
    with col[1]:
        end_date = st.date_input('Select End Date', max_date, min_value=min_date, max_value=max_date)
    with col[2]:
        repeat = st.selectbox('Repeat Annually', ['Yes', 'No'], index=1)
    repeat = repeat == 'Yes'

    if 'Flags' not in st.session_state:
        st.session_state['Flags'] = {}

    # plot_actual_vs_predicted returns (metrics, flag line values, figure); the line
    # values for the chosen window are what gets stored as the flag column.
    if is_panel:  # Sprint3
        met, line_values, fig_flag = plot_actual_vs_predicted(X_train[date_col], y_train,
                                                              model.fittedvalues, model,
                                                              target_column='Revenue',
                                                              flag=(start_date, end_date),
                                                              repeat_all_years=repeat, is_panel=True)
        st.plotly_chart(fig_flag, use_container_width=True)

        # create flag on test (figure is not displayed, only the line values are kept)
        met, test_line_values, fig_flag = plot_actual_vs_predicted(X_test[date_col], y_test,
                                                                   st.session_state['pred_test'], model,
                                                                   target_column='Revenue',
                                                                   flag=(start_date, end_date),
                                                                   repeat_all_years=repeat, is_panel=True)
    else:
        # NOTE(review): the 150-row train/test split is hard-coded here — confirm against the build page.
        met, line_values, fig_flag = plot_actual_vs_predicted(date[:150], y_train, model.predict(X_train), model,
                                                              flag=(start_date, end_date), repeat_all_years=repeat)
        st.plotly_chart(fig_flag, use_container_width=True)

        met, test_line_values, fig_flag = plot_actual_vs_predicted(date[150:], y_test, model.predict(X_test), model,
                                                                   flag=(start_date, end_date), repeat_all_years=repeat)

    flag_name = st.text_input('Enter Flag Name')
    if st.button('Update flag'):
        flag_name = flag_name.strip()
        if not flag_name:
            # An empty name would be stored under '' and rendered as an unlabeled checkbox.
            st.warning('Please enter a flag name before updating.')
        elif is_panel and not flag_name.isidentifier():
            # In panel mode flag names are joined into the model formula string,
            # so they have to be plain identifiers (letters, digits, underscores).
            st.warning('Flag name must contain only letters, digits and underscores, and must not start with a digit.')
        else:
            st.session_state['Flags'][flag_name] = {
                'train': line_values,
                'test': test_line_values,
            }
            st.success(f'{flag_name} stored')

    # Layout inputs for the flag checkbox grid rendered below the expander.
    options = list(st.session_state['Flags'].keys())
    num_columns = 4
    num_rows = -(-len(options) // num_columns)  # ceiling division

# Grid of checkboxes, one per stored flag; "Select all" pre-ticks every box.
tick = bool(st.checkbox('Select all'))
selected_options = []
for _grid_row in range(num_rows):
    for grid_cell in st.columns(num_columns):
        if not options:
            # Last row may be only partially filled; remaining cells stay empty.
            break
        option = options.pop(0)
        if grid_cell.checkbox(option, value=tick):
            selected_options.append(option)

st.markdown('### 1.2 Select Parameters to Apply')
# Three side-by-side toggles, each with a one-line explanation underneath.
parameters = st.columns(3)
trend_slot, week_slot, wave_slot = parameters

with trend_slot:
    Trend = st.checkbox("**Trend**")
    st.markdown('Helps account for long-term trends or seasonality that could influence advertising effectiveness')

with week_slot:
    week_number = st.checkbox('**Week_number**')
    st.markdown('Assists in detecting and incorporating weekly patterns or seasonality')

with wave_slot:
    sine_cosine = st.checkbox('**Sine and Cosine Waves**')
    st.markdown('Helps in capturing cyclical patterns or seasonality in the data')

# Build the tuned model: scale features, add the selected flags / Trend / Week_number /
# sine-cosine terms, fit (MixedLM for panel data, OLS otherwise), show metrics and plots,
# and stash the result so it can be confirmed and saved below.
if st.button('Build model with Selected Parameters and Flags'):
    import os  # local import: only used to create the debug-dump directory in the Trend branch

    st.header('2.1 Results Summary')

    # Scale features to [0, 1]; the scaler is fitted on train only and re-used on test.
    ss = MinMaxScaler()
    if is_panel:
        X = X_train[features_set]
        X_train_tuned = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
        X = X_test[features_set]
        X_test_tuned = pd.DataFrame(ss.transform(X), columns=X.columns)
        # The scaled frames carry a fresh 0..n-1 index, so copy the pass-through columns
        # positionally; a label-aligned assignment yields NaN when the source index differs.
        for passthrough_col in (target_col, date_col, panel_col):
            X_train_tuned[passthrough_col] = X_train[passthrough_col].values
            X_test_tuned[passthrough_col] = X_test[passthrough_col].values
    else:
        X_train_tuned = pd.DataFrame(ss.fit_transform(X_train), columns=X_train.columns)
        X_train_tuned = sm.add_constant(X_train_tuned)

        X_test_tuned = pd.DataFrame(ss.transform(X_test), columns=X_test.columns)
        X_test_tuned = sm.add_constant(X_test_tuned)

    # Add the event flags ticked above as extra columns.
    for flag in selected_options:
        X_train_tuned[flag] = st.session_state['Flags'][flag]['train']
        X_test_tuned[flag] = st.session_state['Flags'][flag]['test']

    # Copy so that the selected model's stored feature list is never aliased.
    new_features = list(features_set)

    if Trend:
        # Sprint3 - group by panel, calculate trend of each panel separately. Add trend to new feature set
        if is_panel:
            panel_wise_end_point_train = {}
            train_parts = []
            for panel, groupdf in X_train_tuned.groupby(panel_col):
                groupdf = groupdf.sort_values(date_col)  # returns a new frame, safe to modify
                groupdf['Trend'] = np.arange(1, len(groupdf) + 1, 1)
                train_parts.append(groupdf)
                panel_wise_end_point_train[panel] = len(groupdf)
            X_train_tuned = pd.concat(train_parts) if train_parts else pd.DataFrame()

            test_parts = []
            for panel, test_groupdf in X_test_tuned.groupby(panel_col):
                test_groupdf = test_groupdf.sort_values(date_col)
                # Test trend continues where the same panel's train trend stopped
                # (a panel absent from train starts at 1).
                start = panel_wise_end_point_train.get(panel, 0) + 1
                end = start + len(test_groupdf)
                test_groupdf['Trend'] = np.arange(start, end, 1)
                test_parts.append(test_groupdf)
            X_test_tuned = pd.concat(test_parts) if test_parts else pd.DataFrame()

            new_features = new_features + ['Trend']

            # test (debug dumps); make sure the target directory exists first
            os.makedirs("Test", exist_ok=True)
            X_test_tuned.to_csv("Test/X_test_tuned_trend.csv", index=False)
            X_train_tuned.to_csv("Test/X_train_tuned_trend.csv", index=False)
            pd.concat([X_train_tuned, X_test_tuned]).sort_values([panel_col, date_col]).to_csv(
                "Test/X_train_test_tuned_trend.csv", index=False)
        else:
            n_train_rows = len(X_train_tuned)
            X_train_tuned['Trend'] = np.arange(1, n_train_rows + 1, 1)
            # Upper bound is exclusive, hence the +1: one value per test row.
            X_test_tuned['Trend'] = np.arange(n_train_rows + 1, n_train_rows + len(X_test_tuned) + 1, 1)

    if week_number:
        # Sprint3 - create weeknumber from date column in xtrain tuned. add week num to new feature set
        # NOTE(review): despite the name, the column holds the day of the week (dt.day_of_week).
        if is_panel:
            X_train_tuned[date_col] = pd.to_datetime(X_train_tuned[date_col])
            X_train_tuned['Week_number'] = X_train_tuned[date_col].dt.day_of_week
            if X_train_tuned['Week_number'].nunique() == 1:
                st.write("All dates in the data are of the same week day. Hence Week number can't be used.")
            else:
                X_test_tuned[date_col] = pd.to_datetime(X_test_tuned[date_col])
                X_test_tuned['Week_number'] = X_test_tuned[date_col].dt.day_of_week
                new_features = new_features + ['Week_number']
        else:
            # pd.to_datetime on an array returns a DatetimeIndex, which exposes
            # day_of_week directly (it has no .dt accessor).
            date = pd.to_datetime(date.values)
            weekday_values = date.day_of_week.to_numpy()
            # NOTE(review): 150-row train/test split is hard-coded, as elsewhere on this page.
            X_train_tuned['Week_number'] = weekday_values[:150]
            X_test_tuned['Week_number'] = weekday_values[150:]

    if sine_cosine:
        frequency = 2 * np.pi / 365  # Adjust the frequency as needed
        # Sprint3 - create panel wise sine cosine waves in xtrain tuned. add to new feature set
        if is_panel:
            new_features = new_features + ['sine_wave', 'cosine_wave']

            train_panel_wise_end_point = {}
            train_parts = []
            for panel, groupdf in X_train_tuned.groupby(panel_col):
                groupdf = groupdf.copy()
                num_samples = len(groupdf)
                train_panel_wise_end_point[panel] = num_samples
                days_since_start = np.arange(num_samples)
                groupdf['sine_wave'] = np.sin(frequency * days_since_start)
                groupdf['cosine_wave'] = np.cos(frequency * days_since_start)
                train_parts.append(groupdf)

            # Test waves continue from each panel's train end point. They are collected
            # separately so that test rows never end up in the training frame.
            test_parts = []
            for panel, test_groupdf in X_test_tuned.groupby(panel_col):
                test_groupdf = test_groupdf.copy()
                num_samples = len(test_groupdf)
                start = train_panel_wise_end_point.get(panel, 0)
                days_since_start = np.arange(start, start + num_samples, 1)
                test_groupdf['sine_wave'] = np.sin(frequency * days_since_start)
                test_groupdf['cosine_wave'] = np.cos(frequency * days_since_start)
                test_parts.append(test_groupdf)

            X_train_tuned = pd.concat(train_parts) if train_parts else pd.DataFrame()
            X_test_tuned = pd.concat(test_parts) if test_parts else pd.DataFrame()
        else:
            num_samples = len(X_train_tuned)
            days_since_start = np.arange(num_samples)
            X_train_tuned['sine_wave'] = np.sin(frequency * days_since_start)
            X_train_tuned['cosine_wave'] = np.cos(frequency * days_since_start)

            test_num_samples = len(X_test_tuned)
            days_since_start = np.arange(num_samples, num_samples + test_num_samples, 1)
            X_test_tuned['sine_wave'] = np.sin(frequency * days_since_start)
            X_test_tuned['cosine_wave'] = np.cos(frequency * days_since_start)

    # model
    if is_panel:
        if selected_options:
            new_features = new_features + selected_options

        inp_vars_str = " + ".join(new_features)

        # Mixed-effects model with the panel column as the grouping variable.
        md_tuned = smf.mixedlm("{} ~ {}".format(target_col, inp_vars_str),
                               data=X_train_tuned[[target_col] + new_features],
                               groups=X_train_tuned[panel_col])
        model_tuned = md_tuned.fit()

        # plot act v pred for original model and tuned model
        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
            X_train[date_col], y_train,
            model.fittedvalues, model,
            target_column='Revenue',
            is_panel=True)
        metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(
            X_train_tuned[date_col],
            X_train_tuned[target_col],
            model_tuned.fittedvalues,
            model_tuned,
            target_column='Revenue',
            is_panel=True)
    else:
        model_tuned = sm.OLS(y_train, X_train_tuned).fit()

        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
            date[:150], y_train,
            model.predict(X_train), model,
            target_column='Revenue')
        metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(
            date[:150], y_train,
            model_tuned.predict(X_train_tuned),
            model_tuned,
            target_column='Revenue')

    # Metrics are read positionally from column 1: row 0 = MAPE, row 1 = R2, row 2 = adjusted R2.
    mape = np.round(metrics_table.iloc[0, 1], 2)
    r2 = np.round(metrics_table.iloc[1, 1], 2)
    adjr2 = np.round(metrics_table.iloc[2, 1], 2)

    mape_tuned = np.round(metrics_table_tuned.iloc[0, 1], 2)
    r2_tuned = np.round(metrics_table_tuned.iloc[1, 1], 2)
    adjr2_tuned = np.round(metrics_table_tuned.iloc[2, 1], 2)

    # Tuned value with the delta against the original model.
    parameters_ = st.columns(3)
    with parameters_[0]:
        st.metric('R2', r2_tuned, np.round(r2_tuned - r2, 2))
    with parameters_[1]:
        st.metric('Adjusted R2', adjr2_tuned, np.round(adjr2_tuned - adjr2, 2))
    with parameters_[2]:
        st.metric('MAPE', mape_tuned, np.round(mape_tuned - mape, 2), 'inverse')

    st.header('2.2 Actual vs. Predicted Plot')
    # Re-use the tuned-model figure built above; it is valid for both the panel
    # and the non-panel branch.
    st.plotly_chart(actual_vs_predicted_plot_tuned, use_container_width=True)

    st.markdown('## 2.3 Residual Analysis')
    # NOTE(review): these residual plots use the ORIGINAL model (model.predict(X_train)),
    # not model_tuned — confirm whether the tuned model was intended here.
    columns = st.columns(2)
    with columns[0]:
        fig = plot_residual_predicted(y_train, model.predict(X_train), X_train)
        st.plotly_chart(fig)

    with columns[1]:
        st.empty()
        fig = qqplot(y_train, model.predict(X_train))
        st.plotly_chart(fig)

    with columns[0]:
        fig = residual_distribution(y_train, model.predict(X_train))
        st.pyplot(fig)

    # Stash the freshly built model so it survives the rerun triggered by the
    # confirmation checkbox below (st.button is True only on the run in which it was clicked).
    st.session_state['_tuned_model_candidate'] = {
        'model': model_tuned,
        'X_train': X_train_tuned,
        'X_test': X_test_tuned,
        'features': new_features,
    }
    # A new build starts unconfirmed; the checkbox has not been created yet in this
    # run, so its state may still be set through session state.
    st.session_state['123'] = False

# Confirmation lives outside the button branch: a widget nested under st.button
# disappears on the rerun it triggers, so its body would never execute.
if '_tuned_model_candidate' in st.session_state and st.checkbox('Use this model to build response curves', key='123'):
    _candidate = st.session_state['_tuned_model_candidate']
    st.session_state["tuned_model"] = _candidate['model']
    # NOTE(review): the page initialises "model_tuned" but originally only wrote
    # "tuned_model"; both are set here until the key read downstream is confirmed.
    st.session_state["model_tuned"] = _candidate['model']
    st.session_state["X_train_tuned"] = _candidate['X_train']
    st.session_state["X_test_tuned"] = _candidate['X_test']
    if is_panel:
        st.session_state["tuned_model_features"] = _candidate['features']
    with open("tuned_model.pkl", "wb") as f:
        pickle.dump(st.session_state['tuned_model'], f)
    st.success('Model saved!')

#   raw_data=df[features_set]
#   columns_raw=[re.split(r"(_lag|_adst)",col)[0] for col in raw_data.columns]
#   raw_data.columns=columns_raw
#   columns_media=[col for col in columns_raw if Categorised_data[col]['BB']=='Media']
#   raw_data=raw_data[columns_media]

#   raw_data['Date']=list(df.index)

#   spends_var=[col for col in df.columns if "spends" in col.lower() and 'adst' not in col.lower() and 'lag' not in col.lower()]
#   spends_df=df[spends_var]
#   spends_df['Week']=list(df.index)


#   j=0
#   X1=X.copy()
#   col=X1.columns
#   for i in model.params.values:
#       X1[col[j]]=X1.iloc[:,j]*i
#       j+=1
#   contribution_df=X1
#   contribution_df['Date']=list(df.index)
#   excel_file='Overview_data.xlsx'

#   with pd.ExcelWriter(excel_file,engine='xlsxwriter') as writer:
#      raw_data.to_excel(writer,sheet_name='RAW DATA MMM',index=False)
#      spends_df.to_excel(writer,sheet_name='SPEND INPUT',index=False)
#      contribution_df.to_excel(writer,sheet_name='CONTRIBUTION MMM')