"""
MMO Build Sprint 3
date    :
changes : capability to tune MixedLM as well as simple LR on the same page
"""

import streamlit as st
import pandas as pd
import numpy as np  # np.arange / np.sin / np.round are used throughout this page
from Eda_functions import format_numbers
import pickle
from utilities import set_header, load_local_css
import statsmodels.api as sm
import re
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor

st.set_option('deprecation.showPyplotGlobalUse', False)

import statsmodels.formula.api as smf
from Data_prep_functions import *

# for i in ["model_tuned", "X_train_tuned", "X_test_tuned", "tuned_model_features", "tuned_model", "tuned_model_dict"] :

st.set_page_config(
    page_title="Model Tuning",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state='collapsed'
)

load_local_css('styles.css')
set_header()

# Sprint3
# is_panel = st.session_state['is_panel']
# panel_col = 'markets'  # set the panel column
date_col = 'date'

# set the panel column: first entry of 'Panel Level 1', cleaned the same way as the model features
panel_cols = [
    col.lower().replace('.', '_').replace('@', '_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
    for col in st.session_state['bin_dict']['Panel Level 1']
]
panel_col = panel_cols[0] if panel_cols else None
is_panel = len(panel_cols) > 0

# flag indicating that no model has been tuned yet
# Sprint4 - dict of tuned models
if 'Model_Tuned' not in st.session_state:
    st.session_state['Model_Tuned'] = {}

st.title('1. Model Tuning')
# st.write(st.session_state['base_model_feature_set'])

if "X_train" not in st.session_state:
    st.error("Oops! It seems there are no saved models available. Please build and save a model from the previous page to proceed.")
    st.stop()

# X_train=st.session_state['X_train']
# X_test=st.session_state['X_test']
# y_train=st.session_state['y_train']
# y_test=st.session_state['y_test']
# df=st.session_state['media_data']
# st.write(X_train.columns)
# st.write(X_test.columns)

if "is_tuned_model" not in st.session_state:
    st.session_state["is_tuned_model"] = {}

# Sprint4 - if used_response_metrics is not blank, let the user pick one of them; otherwise the target defaults to revenue
if "used_response_metrics" in st.session_state and st.session_state['used_response_metrics'] != []:
    sel_target_col = st.selectbox("Select the response metric", st.session_state['used_response_metrics'])
    target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
else:
    sel_target_col = 'Total Approved Accounts - Revenue'
    target_col = 'total_approved_accounts_revenue'

# Sprint4 - look through all saved models and only show the ones built for the selected response metric (target_col)
saved_models = st.session_state['saved_model_names']
required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
sel_model = st.selectbox("Select the model to tune", required_saved_models)

with open("best_models.pkl", 'rb') as file:
    model_dict = pickle.load(file)

sel_model_dict = model_dict[sel_model + "__" + target_col]  # Sprint4 - get the model obj of the selected model
# st.write(sel_model_dict)

X_train = sel_model_dict['X_train']
X_test = sel_model_dict['X_test']
y_train = sel_model_dict['y_train']
y_test = sel_model_dict['y_test']
df = st.session_state['media_data']

if 'selected_model' not in st.session_state:
    st.session_state['selected_model'] = 0
# st.write(model_dict[st.session_state["selected_model"]]['X_train'].columns)

st.markdown('### 1.1 Event Flags')
st.markdown('Helps in quantifying the impact of specific occurrences of events')
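# Note (descriptive, based on the code below): an event flag is the per-row series (line_values)
# returned by plot_actual_vs_predicted for the chosen date window, optionally repeated annually.
# The train and test series are stored in st.session_state['Flags'] under '<flag_name>__<target_col>'
# and can later be appended to the tuned model's feature set as extra regressors.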
with st.expander('Apply Event Flags'):
    # st.session_state["selected_model"] = st.selectbox('Select Model to apply flags', model_dict.keys())
    model = sel_model_dict['Model_object']
    date = st.session_state['date']
    date = pd.to_datetime(date)
    X_train = sel_model_dict['X_train']

    # features_set = model_dict[st.session_state["selected_model"]]['feature_set']
    features_set = sel_model_dict["feature_set"]

    col = st.columns(3)
    min_date = min(date)
    max_date = max(date)
    with col[0]:
        start_date = st.date_input('Select Start Date', min_date, min_value=min_date, max_value=max_date)
    with col[1]:
        end_date = st.date_input('Select End Date', max_date, min_value=min_date, max_value=max_date)
    with col[2]:
        repeat = st.selectbox('Repeat Annually', ['Yes', 'No'], index=1)
    repeat = True if repeat == 'Yes' else False

    if 'Flags' not in st.session_state:
        st.session_state['Flags'] = {}

    # print("**"*50)
    # print(y_train)
    # print("**"*50)
    # print(model.fittedvalues)

    if is_panel:  # Sprint3
        met, line_values, fig_flag = plot_actual_vs_predicted(X_train[date_col], y_train,
                                                              model.fittedvalues, model,
                                                              target_column=sel_target_col,
                                                              flag=(start_date, end_date),
                                                              repeat_all_years=repeat, is_panel=True)
        st.plotly_chart(fig_flag, use_container_width=True)

        # create the flag on the test set as well
        met, test_line_values, fig_flag = plot_actual_vs_predicted(X_test[date_col], y_test,
                                                                   sel_model_dict['pred_test'], model,
                                                                   target_column=sel_target_col,
                                                                   flag=(start_date, end_date),
                                                                   repeat_all_years=repeat, is_panel=True)
    else:
        pred_train = model.predict(X_train[features_set])
        met, line_values, fig_flag = plot_actual_vs_predicted(X_train[date_col], y_train, pred_train, model,
                                                              flag=(start_date, end_date),
                                                              repeat_all_years=repeat, is_panel=False)
        st.plotly_chart(fig_flag, use_container_width=True)

        pred_test = model.predict(X_test[features_set])
        met, test_line_values, fig_flag = plot_actual_vs_predicted(X_test[date_col], y_test, pred_test, model,
                                                                   flag=(start_date, end_date),
                                                                   repeat_all_years=repeat, is_panel=False)

    flag_name = st.text_input('Enter Flag Name', value='f1_flag')

    # Sprint4 - add the selected target col to the flag name
    if st.button('Update flag'):
        st.session_state['Flags'][flag_name + '__' + target_col] = {}
        st.session_state['Flags'][flag_name + '__' + target_col]['train'] = line_values
        st.session_state['Flags'][flag_name + '__' + target_col]['test'] = test_line_values
        # st.write(st.session_state['Flags'][flag_name])
        st.success(f'{flag_name + "__" + target_col} stored')

    # Sprint4 - only show flags created for the selected target col
    st.write(st.session_state['Flags'].keys())
    target_model_flags = [f.split("__")[0] for f in st.session_state['Flags'].keys() if f.split("__")[1] == target_col]
    options = list(target_model_flags)
    selected_options = []
    num_columns = 4
    num_rows = -(-len(options) // num_columns)  # ceiling division

    tick = False
    if st.checkbox('Select all'):
        tick = True
    for row in range(num_rows):
        cols = st.columns(num_columns)
        for col in cols:
            if options:
                option = options.pop(0)
                selected = col.checkbox(option, value=tick)
                if selected:
                    selected_options.append(option)

st.markdown('### 1.2 Select Parameters to Apply')
parameters = st.columns(3)
with parameters[0]:
    Trend = st.checkbox("**Trend**")
    st.markdown('Helps account for long-term trends or seasonality that could influence advertising effectiveness')
with parameters[1]:
    week_number = st.checkbox('**Week_number**')
    st.markdown('Assists in detecting and incorporating weekly patterns or seasonality')
with parameters[2]:
    sine_cosine = st.checkbox('**Sine and Cosine Waves**')
    st.markdown('Helps in capturing cyclical patterns or seasonality in the data')
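# Illustrative sketch only (an assumption for documentation purposes; this helper is not called
# anywhere on this page): the 'Sine and Cosine Waves' option above adds sin(2*pi*t/365) and
# cos(2*pi*t/365) features, where t is the number of periods since the start of each panel's
# (or the full) training window. The feature construction itself happens inside the button
# handler below; this standalone function just mirrors that computation for reference.
def _annual_sine_cosine_example(n_periods: int, period: int = 365):
    """Return example sine/cosine seasonality features for t = 0 .. n_periods - 1."""
    t = np.arange(n_periods)
    frequency = 2 * np.pi / period
    return np.sin(frequency * t), np.cos(frequency * t)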
# def get_tuned_model():
#     st.session_state['build_tuned_model'] = True

if st.button('Build model with Selected Parameters and Flags', key='build_tuned_model'):
    new_features = features_set

    st.header('2.1 Results Summary')
    # date=list(df.index)
    # df = df.reset_index(drop=True)
    # st.write(df.head(2))
    # X_train=df[features_set]
    ss = MinMaxScaler()  # kept for the commented-out scaling below

    if is_panel:
        X_train_tuned = X_train[features_set].copy()
        # X_train_tuned = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
        X_train_tuned[target_col] = X_train[target_col]
        X_train_tuned[date_col] = X_train[date_col]
        X_train_tuned[panel_col] = X_train[panel_col]

        X_test_tuned = X_test[features_set].copy()
        # X_test_tuned = pd.DataFrame(ss.transform(X), columns=X.columns)
        X_test_tuned[target_col] = X_test[target_col]
        X_test_tuned[date_col] = X_test[date_col]
        X_test_tuned[panel_col] = X_test[panel_col]
    else:
        X_train_tuned = X_train[features_set].copy()
        # X_train_tuned = pd.DataFrame(ss.fit_transform(X_train_tuned), columns=X_train_tuned.columns)
        X_test_tuned = X_test[features_set].copy()
        # X_test_tuned = pd.DataFrame(ss.transform(X_test_tuned), columns=X_test_tuned.columns)

    for flag in selected_options:
        # Sprint4 - added target_col in flag name
        X_train_tuned[flag] = st.session_state['Flags'][flag + "__" + target_col]['train']
        X_test_tuned[flag] = st.session_state['Flags'][flag + "__" + target_col]['test']
        # test
        # X_train_tuned.to_csv("Test/X_train_tuned_flag.csv",index=False)
        # X_test_tuned.to_csv("Test/X_test_tuned_flag.csv",index=False)
        # print("()()"*20,flag, len(st.session_state['Flags'][flag]))

    if Trend:
        # Sprint3 - group by panel, calculate the trend of each panel separately, then add 'Trend' to the new feature set
        if is_panel:
            newdata = pd.DataFrame()
            panel_wise_end_point_train = {}
            for panel, groupdf in X_train_tuned.groupby(panel_col):
                groupdf.sort_values(date_col, inplace=True)
                groupdf['Trend'] = np.arange(1, len(groupdf) + 1, 1)
                newdata = pd.concat([newdata, groupdf])
                panel_wise_end_point_train[panel] = len(groupdf)
            X_train_tuned = newdata.copy()

            test_newdata = pd.DataFrame()
            for panel, test_groupdf in X_test_tuned.groupby(panel_col):
                test_groupdf.sort_values(date_col, inplace=True)
                start = panel_wise_end_point_train[panel] + 1
                end = start + len(test_groupdf)  # should be + 1? - Sprint4
                # print("??"*20, panel, len(test_groupdf), len(np.arange(start, end, 1)), start)
                test_groupdf['Trend'] = np.arange(start, end, 1)
                test_newdata = pd.concat([test_newdata, test_groupdf])
            X_test_tuned = test_newdata.copy()

            new_features = new_features + ['Trend']
        else:
            X_train_tuned['Trend'] = np.arange(1, len(X_train_tuned) + 1, 1)
            X_test_tuned['Trend'] = np.arange(len(X_train_tuned) + 1, len(X_train_tuned) + len(X_test_tuned) + 1, 1)
            new_features = new_features + ['Trend']
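    # Note: the 'Week_number' feature below is derived from dt.day_of_week (0 = Monday ... 6 = Sunday),
    # so it captures day-of-week seasonality rather than the calendar week number; for weekly data every
    # row shares the same value, which is why the panel branch checks nunique() before adding the feature.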
    if week_number:
        # Sprint3 - create the week-number feature from the date column in X_train_tuned and add it to the new feature set
        if is_panel:
            X_train_tuned[date_col] = pd.to_datetime(X_train_tuned[date_col])
            X_train_tuned['Week_number'] = X_train_tuned[date_col].dt.day_of_week
            if X_train_tuned['Week_number'].nunique() == 1:
                st.write("All dates in the data are of the same week day. Hence Week number can't be used.")
            else:
                X_test_tuned[date_col] = pd.to_datetime(X_test_tuned[date_col])
                X_test_tuned['Week_number'] = X_test_tuned[date_col].dt.day_of_week
                new_features = new_features + ['Week_number']
        else:
            date = pd.to_datetime(date.values)
            X_train_tuned['Week_number'] = pd.to_datetime(X_train[date_col]).dt.day_of_week
            X_test_tuned['Week_number'] = pd.to_datetime(X_test[date_col]).dt.day_of_week
            new_features = new_features + ['Week_number']

    if sine_cosine:
        # Sprint3 - create panel-wise sine and cosine waves in X_train_tuned and add them to the new feature set
        if is_panel:
            new_features = new_features + ['sine_wave', 'cosine_wave']
            newdata = pd.DataFrame()
            newdata_test = pd.DataFrame()
            groups = X_train_tuned.groupby(panel_col)
            frequency = 2 * np.pi / 365  # annual cycle; adjust the frequency as needed
            train_panel_wise_end_point = {}
            for panel, groupdf in groups:
                num_samples = len(groupdf)
                train_panel_wise_end_point[panel] = num_samples
                days_since_start = np.arange(num_samples)
                sine_wave = np.sin(frequency * days_since_start)
                cosine_wave = np.cos(frequency * days_since_start)
                sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
                assert len(sine_cosine_df) == len(groupdf)
                # groupdf = pd.concat([groupdf, sine_cosine_df], axis=1)
                groupdf['sine_wave'] = sine_wave
                groupdf['cosine_wave'] = cosine_wave
                newdata = pd.concat([newdata, groupdf])
            X_train_tuned = newdata.copy()

            test_groups = X_test_tuned.groupby(panel_col)
            for panel, test_groupdf in test_groups:
                num_samples = len(test_groupdf)
                start = train_panel_wise_end_point[panel]
                days_since_start = np.arange(start, start + num_samples, 1)
                # print("##", panel, num_samples, start, len(np.arange(start, start+num_samples, 1)))
                sine_wave = np.sin(frequency * days_since_start)
                cosine_wave = np.cos(frequency * days_since_start)
                sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
                assert len(sine_cosine_df) == len(test_groupdf)
                # groupdf = pd.concat([groupdf, sine_cosine_df], axis=1)
                test_groupdf['sine_wave'] = sine_wave
                test_groupdf['cosine_wave'] = cosine_wave
                newdata_test = pd.concat([newdata_test, test_groupdf])
            X_test_tuned = newdata_test.copy()
        else:
            new_features = new_features + ['sine_wave', 'cosine_wave']
            num_samples = len(X_train_tuned)
            frequency = 2 * np.pi / 365  # annual cycle; adjust the frequency as needed
            days_since_start = np.arange(num_samples)
            sine_wave = np.sin(frequency * days_since_start)
            cosine_wave = np.cos(frequency * days_since_start)
            # assign as arrays (position-based) so the new columns line up even if the index is not 0..n-1
            X_train_tuned['sine_wave'] = sine_wave
            X_train_tuned['cosine_wave'] = cosine_wave

            test_num_samples = len(X_test_tuned)
            start = num_samples
            days_since_start = np.arange(start, start + test_num_samples, 1)
            sine_wave = np.sin(frequency * days_since_start)
            cosine_wave = np.cos(frequency * days_since_start)
            X_test_tuned['sine_wave'] = sine_wave
            X_test_tuned['cosine_wave'] = cosine_wave

    # model
    if selected_options:
        new_features = new_features + selected_options

    if is_panel:
        inp_vars_str = " + ".join(new_features)
        new_features = list(set(new_features))
        # X_train_tuned.to_csv("Test/X_train_tuned.csv",index=False)
        # st.write(X_train_tuned[['total_approved_accounts_revenue'] + new_features].dtypes)
        # st.write(X_train_tuned[['total_approved_accounts_revenue', panel_col] + new_features].isna().sum())
        md_str = target_col + " ~ " + inp_vars_str
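        # The tuned model below is a mixed-effects (random-intercept) regression: each panel gets its own
        # intercept while the feature coefficients are shared, i.e. roughly
        #   y_it = beta0 + X_it * beta + u_i + e_it,  with groups=panel_col defining the panel index i.
        # The formula string passed to smf.mixedlm looks like, for example,
        #   'total_approved_accounts_revenue ~ feature_1 + feature_2 + Trend + f1_flag'
        # (the feature names in this example are illustrative only).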
        md_tuned = smf.mixedlm(md_str,
                               data=X_train_tuned[[target_col] + new_features],
                               groups=X_train_tuned[panel_col])
        model_tuned = md_tuned.fit()

        # plot actual vs predicted for the original model and the tuned model
        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
            X_train[date_col], y_train, model.fittedvalues, model,
            target_column=sel_target_col, is_panel=True)
        metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(
            X_train_tuned[date_col], X_train_tuned[target_col], model_tuned.fittedvalues, model_tuned,
            target_column=sel_target_col, is_panel=True)
    else:
        new_features = list(set(new_features))
        # st.write(new_features)
        model_tuned = sm.OLS(y_train, X_train_tuned[new_features]).fit()
        # st.write(X_train_tuned.columns)
        # NOTE: 130 is a hard-coded training-window length
        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
            date[:130], y_train, model.predict(X_train[features_set]), model,
            target_column=sel_target_col)
        metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(
            date[:130], y_train, model_tuned.predict(X_train_tuned[new_features]), model_tuned,
            target_column=sel_target_col)

    # st.write(metrics_table_tuned)
    mape = np.round(metrics_table.iloc[0, 1], 2)
    r2 = np.round(metrics_table.iloc[1, 1], 2)
    adjr2 = np.round(metrics_table.iloc[2, 1], 2)
    mape_tuned = np.round(metrics_table_tuned.iloc[0, 1], 2)
    r2_tuned = np.round(metrics_table_tuned.iloc[1, 1], 2)
    adjr2_tuned = np.round(metrics_table_tuned.iloc[2, 1], 2)

    parameters_ = st.columns(3)
    with parameters_[0]:
        st.metric('R2', r2_tuned, np.round(r2_tuned - r2, 2))
    with parameters_[1]:
        st.metric('Adjusted R2', adjr2_tuned, np.round(adjr2_tuned - adjr2, 2))
    with parameters_[2]:
        st.metric('MAPE', mape_tuned, np.round(mape_tuned - mape, 2), 'inverse')
    st.write(model_tuned.summary())

    X_train_tuned[date_col] = X_train[date_col]
    X_test_tuned[date_col] = X_test[date_col]
    X_train_tuned[target_col] = y_train
    X_test_tuned[target_col] = y_test

    st.header('2.2 Actual vs. Predicted Plot')
    # if is_panel:
    #     metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_train, model.predict(X_train),
    #                                                                              model, target_column='Revenue', is_panel=True)
    # else:
    #     metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_train, model.predict(X_train),
    #                                                                              model, target_column='Revenue')
    if is_panel:
        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
            X_train_tuned[date_col], X_train_tuned[target_col], model_tuned.fittedvalues, model_tuned,
            target_column=sel_target_col, is_panel=True)
    else:
        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
            X_train_tuned[date_col], X_train_tuned[target_col], model_tuned.predict(X_train_tuned[new_features]), model_tuned,
            target_column=sel_target_col, is_panel=False)
    # plot_actual_vs_predicted(X_train[date_col], y_train, model.fittedvalues, model,
    #                          target_column='Revenue', is_panel=is_panel)

    st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

    st.markdown('## 2.3 Residual Analysis')
    if is_panel:
        columns = st.columns(2)
        with columns[0]:
            fig = plot_residual_predicted(y_train, model_tuned.fittedvalues, X_train_tuned)
            st.plotly_chart(fig)
        with columns[1]:
            st.empty()
            fig = qqplot(y_train, model_tuned.fittedvalues)
            st.plotly_chart(fig)
        with columns[0]:
            fig = residual_distribution(y_train, model_tuned.fittedvalues)
            st.pyplot(fig)
    else:
        columns = st.columns(2)
        with columns[0]:
            fig = plot_residual_predicted(y_train, model_tuned.predict(X_train_tuned[new_features]), X_train)
            st.plotly_chart(fig)
        with columns[1]:
            st.empty()
            fig = qqplot(y_train, model_tuned.predict(X_train_tuned[new_features]))
            st.plotly_chart(fig)
        with columns[0]:
            fig = residual_distribution(y_train, model_tuned.predict(X_train_tuned[new_features]))
            st.pyplot(fig)

    st.session_state['is_tuned_model'][target_col] = True

    # Sprint4 - save the tuned model in a dict
    st.session_state['Model_Tuned'][sel_model + "__" + target_col] = {
        "Model_object": model_tuned,
        'feature_set': new_features,
        'X_train_tuned': X_train_tuned,
        'X_test_tuned': X_test_tuned
    }

# Pending
# if st.session_state['build_tuned_model']==True:
if st.session_state['Model_Tuned']:  # at least one tuned model exists
    if st.checkbox('Use this model to build response curves', key='save_model'):
        # save_model = st.button('Use this model to build response curves', key='saved_tuned_model')
        # if save_model:
        st.session_state["is_tuned_model"][target_col] = True
        with open("tuned_model.pkl", "wb") as f:
            # pickle.dump(st.session_state['tuned_model'], f)
            pickle.dump(st.session_state['Model_Tuned'], f)  # Sprint4
        # X_test_tuned.to_csv("Test/X_test_tuned_final.csv", index=False)
        # X_train_tuned.to_csv("Test/X_train_tuned.csv", index=False)
        st.success(sel_model + "__" + target_col + ' tuned model saved!')

# if is_panel:
#     # st.session_state["tuned_model_features"] = new_features
#     with open("tuned_model.pkl", "wb") as f:
#         # pickle.dump(st.session_state['tuned_model'], f)
#         pickle.dump(st.session_state['Model_Tuned'], f)  # Sprint4
#     st.success(sel_model + "__" + target_col + ' Tuned saved!')

# raw_data=df[features_set]
# columns_raw=[re.split(r"(_lag|_adst)",col)[0] for col in raw_data.columns]
# raw_data.columns=columns_raw
# columns_media=[col for col in columns_raw if Categorised_data[col]['BB']=='Media']
# raw_data=raw_data[columns_media]
# raw_data['Date']=list(df.index)
# spends_var=[col for col in df.columns if "spends" in col.lower() and 'adst' not in col.lower() and 'lag' not in col.lower()]
# spends_df=df[spends_var]
# spends_df['Week']=list(df.index)
# j=0
# X1=X.copy()
# col=X1.columns
# for i in model.params.values:
#     X1[col[j]]=X1.iloc[:,j]*i
#     j+=1
# contribution_df=X1
# contribution_df['Date']=list(df.index)
# excel_file='Overview_data.xlsx'
# with pd.ExcelWriter(excel_file,engine='xlsxwriter') as writer:
#     raw_data.to_excel(writer,sheet_name='RAW DATA MMM',index=False)
#     spends_df.to_excel(writer,sheet_name='SPEND INPUT',index=False)
#     contribution_df.to_excel(writer,sheet_name='CONTRIBUTION MMM')