'''
MMO Build Sprint 3 additions : adding more variables to session state for saved model : random effect, predicted train & test

MMO Build Sprint 4 additions : ability to run models for different response metrics
'''

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from Eda_functions import format_numbers
import numpy as np
import pickle
from st_aggrid import AgGrid
from st_aggrid import GridOptionsBuilder, GridUpdateMode
from utilities import set_header, load_local_css
import time
import itertools
import statsmodels.api as sm
import re
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
import os
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor

st.set_option('deprecation.showPyplotGlobalUse', False)
import statsmodels.formula.api as smf
from datetime import datetime
import seaborn as sns
from Data_prep_functions import *


def get_random_effects(media_data, panel_col, mdf):
    """Collect the fitted random intercept of every panel into a DataFrame."""
    random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"])
    for i, market in enumerate(media_data[panel_col].unique()):
        print(i, end='\r')
        intercept = mdf.random_effects[market].values[0]
        random_eff_df.loc[i, 'random_effect'] = intercept
        random_eff_df.loc[i, panel_col] = market
    return random_eff_df


def mdf_predict(X_df, mdf, random_eff_df):
    """Predict with a MixedLM: the fixed-effect prediction plus each panel's random intercept.

    Note: relies on the module-level `panel_col` defined further down this script.
    """
    X = X_df.copy()
    X['fixed_effect'] = mdf.predict(X)
    X = pd.merge(X, random_eff_df, on=panel_col, how='left')
    X['pred'] = X['fixed_effect'] + X['random_effect']
    # X.to_csv('Test/megred_df.csv',index=False)
    X.drop(columns=['fixed_effect', 'random_effect'], inplace=True)
    return X['pred']
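# A minimal, hypothetical sketch of what mdf_predict() does, on toy data
# (names like `toy_df` are illustrative only and not used by this page):
#
#   toy_df = pd.DataFrame({'y': [1., 2., 2., 3.],
#                          'x': [0., 1., 0., 1.],
#                          'panel': ['a', 'a', 'b', 'b']})
#   mdf = smf.mixedlm("y ~ x", data=toy_df, groups=toy_df['panel']).fit()
#   fixed = mdf.predict(toy_df)                        # fixed effects only
#   rand = toy_df['panel'].map({g: eff.values[0]       # per-panel intercept
#                               for g, eff in mdf.random_effects.items()})
#   pred = fixed + rand                                # == mdf_predict() output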
st.set_page_config(
    page_title="Model Build",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state='collapsed'
)

load_local_css('styles.css')
set_header()

st.title('1. Build Your Model')

with open("data_import.pkl", "rb") as f:
    data = pickle.load(f)

st.session_state['bin_dict'] = data["bin_dict"]
# st.write(data["bin_dict"])

with open("final_df_transformed.pkl", "rb") as f:
    data = pickle.load(f)

# Accessing the loaded objects
media_data = data["final_df_transformed"]

# Sprint4 - available response metrics is a list of all response metrics in the data
## these will be put in a drop down
st.session_state['media_data'] = media_data

if 'available_response_metrics' not in st.session_state:
    # st.session_state['available_response_metrics'] = ['Total Approved Accounts - Revenue',
    #                                                   'Total Approved Accounts - Appsflyer',
    #                                                   'Account Requests - Appsflyer',
    #                                                   'App Installs - Appsflyer']
    st.session_state['available_response_metrics'] = st.session_state['bin_dict']["Response Metrics"]

# Sprint4
if "is_tuned_model" not in st.session_state:
    st.session_state["is_tuned_model"] = {}
    for resp_metric in st.session_state['available_response_metrics']:
        resp_metric = resp_metric.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
        st.session_state["is_tuned_model"][resp_metric] = False

# Sprint4 - used_response_metrics is a list of resp metrics for which user has created & saved a model
if 'used_response_metrics' not in st.session_state:
    st.session_state['used_response_metrics'] = []

# Sprint4 - saved_model_names
if 'saved_model_names' not in st.session_state:
    st.session_state['saved_model_names'] = []

# if "model_save_flag" not in st.session_state:
#     st.session_state["model_save_flag"] = False
# def reset_save():
#     st.session_state["model_save_flag"] = False
# def set_save():
#     st.session_state["model_save_flag"] = True

# Sprint4 - select a response metric
sel_target_col = st.selectbox("Select the response metric", st.session_state['available_response_metrics'])
# , on_change=reset_save())
target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")

new_name_dct = {col: col.lower().replace('.', '_').replace('@', '_').replace(" ", "_").replace('-', '')
                .replace(':', '').replace("__", "_") for col in media_data.columns}
media_data.columns = [col.lower().replace('.', '_').replace('@', '_').replace(" ", "_").replace('-', '')
                      .replace(':', '').replace("__", "_") for col in media_data.columns]
# st.write(st.session_state['bin_dict'])

# set the panel column
panel_col = [col.lower().replace('.', '_').replace('@', '_').replace(" ", "_").replace('-', '')
             .replace(':', '').replace("__", "_")
             for col in st.session_state['bin_dict']['Panel Level 1']][0]
date_col = 'date'
# st.write(media_data)

# panel_col is a single column name; a non-empty name means the data is panel-level
is_panel = True if len(panel_col) > 0 else False

if 'is_panel' not in st.session_state:
    st.session_state['is_panel'] = is_panel  # keep session state in sync with the panel check above
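# The same lower()/replace() normalization chain is applied to the response metric,
# the media columns and the panel column above. A hypothetical helper that would
# keep those rules in one place (sketch only, not wired into this page):
#
#   def clean_name(col: str) -> str:
#       for old, new in [('.', '_'), ('@', '_'), (' ', '_'),
#                        ('-', ''), (':', ''), ('__', '_')]:
#           col = col.lower().replace(old, new)
#       return col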
# if st.toggle('Apply Transformations on DMA/Panel Level'):
#     media_data = pd.read_csv(r'C:\Users\SrishtiVerma\Mastercard\Sprint2\upf_data_converted_randomized_resp_metrics.csv')
#     media_data.columns = [i.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for i in
#                           media_data.columns]
#     dma = st.selectbox('Select the Level of data ',
#                        [col for col in media_data.columns if col.lower() in ['dma', 'panel', 'markets']])
#     is_panel = True
#     st.session_state['is_panel'] = True
# else:
#     ''' code to aggregate data on date '''
#     media_data = pd.read_excel(r'C:\Users\SrishtiVerma\Mastercard\Sprint1\Tactic Level Models\Tactic_level_data_imp_clicks_spends.xlsx')
#     media_data.columns = [i.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for i in
#                           media_data.columns]
#     dma = None
#     is_panel = False
#     st.session_state['is_panel'] = False

# media_data = st.session_state["final_df"]
# st.write(media_data.columns)

media_data.sort_values(date_col, inplace=True)
media_data.reset_index(drop=True, inplace=True)

date = media_data[date_col]
st.session_state['date'] = date
# revenue = media_data[target_col]
y = media_data[target_col]

if is_panel:
    spends_data = media_data[
        [c for c in media_data.columns if "_cost" in c.lower() or "_spend" in c.lower()]
        + [date_col, panel_col]]  # Sprint3 - spends for resp curves
else:
    spends_data = media_data[
        [c for c in media_data.columns if "_cost" in c.lower() or "_spend" in c.lower()]
        + [date_col]]

# media_data.drop([target_col], axis=1, inplace=True)
media_data.drop([date_col], axis=1, inplace=True)
media_data.reset_index(drop=True, inplace=True)
# dma_dict = {dm: media_data[media_data[dma] == dm] for dm in media_data[dma].unique()}
# st.markdown('## Select the Range of Transformations')

columns = st.columns(2)
old_shape = media_data.shape
if "old_shape" not in st.session_state:
    st.session_state['old_shape'] = old_shape

# with columns[0]:
#     slider_value_adstock = st.slider('Select Adstock Range (only applied to media)', 0.0, 1.0, (0.2, 0.4), step=0.1,
#                                      format="%.2f")
# with columns[1]:
#     slider_value_lag = st.slider('Select Lag Range (applied to media, seasonal, macroeconomic variables)', 1, 7,
#                                  (1, 3), step=1)
# with columns[2]:
#     slider_value_power = st.slider('Select Power range (only applied to media )', 0, 4, (1, 2), step=1)
# with columns[1]:
#     st.number_input('Select the range of half saturation point ', min_value=1, max_value=5)
#     st.number_input('Select the range of ')

# Section 1 - Transformations Functions

# def lag(data, features, lags, dma=None):
#     if dma:
#         transformed_data = pd.concat(
#             [data.groupby([dma])[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
#         # transformed_data = transformed_data.fillna(method='bfill')
#         transformed_data = transformed_data.bfill()  # Sprint4 - fillna getting deprecated
#         return pd.concat([transformed_data, data], axis=1)
#     else:
#         ''' data should be aggregated on date '''
#         transformed_data = pd.concat([data[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
#         # transformed_data = transformed_data.fillna(method='bfill')
#         transformed_data = transformed_data.bfill()
#         return pd.concat([transformed_data, data], axis=1)


# adstock
# def adstock(df, alphas, cutoff, features, dma=None):
#     if dma:
#         transformed_data = pd.DataFrame()
#         for d in df[dma].unique():
#             dma_sub_df = df[df[dma] == d]
#             n = len(dma_sub_df)
#             weights = np.array(
#                 [[[alpha ** (i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)]
#                  for alpha in alphas])
#             X = dma_sub_df[features].to_numpy()
#             res = pd.DataFrame(np.hstack(weights @ X),
#                                columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
#             transformed_data = pd.concat([transformed_data, res], axis=0)
#         transformed_data.reset_index(drop=True, inplace=True)
#         return pd.concat([transformed_data, df], axis=1)
#     else:
#         n = len(df)
#         weights = np.array(
#             [[[alpha ** (i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)]
#              for alpha in alphas])
#         X = df[features].to_numpy()
#         res = pd.DataFrame(np.hstack(weights @ X),
#                            columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
#         return pd.concat([res, df], axis=1)
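# The commented-out adstock above builds, for each alpha, an n x n lower-triangular
# weight matrix W with W[i, j] = alpha ** (i - j) for j <= i <= j + cutoff, so that
# (W @ x)[i] is a geometrically decaying sum of the current and previous `cutoff`
# observations. A tiny worked example (n = 3, alpha = 0.5, cutoff >= 2):
#
#   W = [[1.  , 0. , 0.],
#        [0.5 , 1. , 0.],
#        [0.25, 0.5, 1.]]
#   x = [10, 0, 0]  ->  W @ x = [10, 5, 2.5]   # day-1 spend decays over time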
# Section 2 - Begin Transformations

if 'media_data' not in st.session_state:
    st.session_state['media_data'] = pd.DataFrame()

# Sprint3
if "orig_media_data" not in st.session_state:
    st.session_state['orig_media_data'] = pd.DataFrame()

# Sprint3 additions
if 'random_effects' not in st.session_state:
    st.session_state['random_effects'] = pd.DataFrame()
if 'pred_train' not in st.session_state:
    st.session_state['pred_train'] = []
if 'pred_test' not in st.session_state:
    st.session_state['pred_test'] = []
# end of Sprint3 additions

# variables_to_be_transformed = [col for col in media_data.columns if col.lower() not in ['dma', 'panel']]  # change for buckets
# variables_to_be_transformed = [col for col in media_data.columns if
#                                '_clicks' in col.lower() or '_impress' in col.lower()]  # srishti - change

# with columns[0]:
#     if st.button('Apply Transformations'):
#         with st.spinner('Applying Transformations'):
#             transformed_data_lag = lag(media_data, features=variables_to_be_transformed,
#                                        lags=np.arange(slider_value_lag[0], slider_value_lag[1] + 1, 1), dma=dma)
#             # variables_to_be_transformed = [col for col in list(transformed_data_lag.columns) if
#             #                                col not in ['Date', 'DMA', 'Panel']]  # change for buckets
#             variables_to_be_transformed = [col for col in media_data.columns if
#                                            '_clicks' in col.lower() or '_impress' in col.lower()]  # srishti - change
#             transformed_data_adstock = adstock(df=transformed_data_lag,
#                                                alphas=np.arange(slider_value_adstock[0], slider_value_adstock[1], 0.1),
#                                                cutoff=8, features=variables_to_be_transformed, dma=dma)
#
#         # st.success('Done')
#         st.success("Transformations complete!")
#         # st.write(f'old shape {old_shape}, new shape {transformed_data_adstock.shape}')
#
#         transformed_data_adstock.columns = [c.replace(".", "_") for c in transformed_data_adstock.columns]  # srishti
#         st.session_state['media_data'] = transformed_data_adstock  # srishti
#
#         # Sprint3
#         orig_media_data = media_data.copy()
#         orig_media_data[date_col] = date
#         orig_media_data[target_col] = y
#         st.session_state['orig_media_data'] = orig_media_data  # srishti
#
#         # with st.spinner('Applying Transformations'):
#         #     time.sleep(2)
#         #     st.success("Transformations complete!")

# if st.session_state['media_data'].shape[1] > old_shape[1]:
#     with columns[0]:
#         st.write(f'Total no.of variables before transformation: {old_shape[1]}, Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')
# st.write(f'Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')

# Section 3 - Create combinations

# bucket = ['paid_search', 'kwai', 'indicacao', 'infleux', 'influencer', 'FB: Level Achieved - Tier 1 Impressions',
#           'FB: Level Achieved - Tier 2 Impressions', 'paid_social_others',
#           'GA App: Will And Cid Pequena Baixo Risco Clicks',
#           'digital_tactic_others', "programmatic"
#           ]  # srishti - bucket names changed

bucket = ['paid_search', 'kwai', 'indicacao', 'infleux', 'influencer', 'fb_level_achieved_tier_2',
          'fb_level_achieved_tier_1', 'paid_social_others',
          'ga_app',
          'digital_tactic_others', "programmatic"
          ]
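# The button below picks, per bucket, the transformed variant(s) most correlated
# with the target, then takes the cartesian product across buckets, so each
# combination holds exactly one candidate per channel. Hypothetical toy example:
#
#   channels_all = [['tv_adstock_0_2', 'tv_adstock_0_3'], ['search_lag_1']]
#   list(itertools.product(*channels_all))
#   # -> [('tv_adstock_0_2', 'search_lag_1'), ('tv_adstock_0_3', 'search_lag_1')]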
with columns[0]:
    if st.button('Create Combinations of Variables'):
        top_3_correlated_features = []
        # for col in st.session_state['media_data'].columns[:19]:
        # original_cols = [c for c in st.session_state['media_data'].columns if
        #                  "_clicks" in c.lower() or "_impressions" in c.lower()]
        # original_cols = [c for c in original_cols if "_lag" not in c.lower() and "_adstock" not in c.lower()]

        original_cols = st.session_state['bin_dict']['Media'] + st.session_state['bin_dict']['Internal']
        original_cols = [col.lower().replace('.', '_').replace('@', '_').replace(" ", "_").replace('-', '')
                         .replace(':', '').replace("__", "_") for col in original_cols]
        # st.write(original_cols)

        # for col in st.session_state['media_data'].columns[:19]:
        for col in original_cols:  # srishti - new
            corr_df = pd.concat([st.session_state['media_data'].filter(regex=col), y],
                                axis=1).corr()[target_col].iloc[:-1]
            # keep the 2 best-correlated transformations of each original variable
            top_3_correlated_features.append(list(corr_df.sort_values(ascending=False).head(2).index))

        flattened_list = [item for sublist in top_3_correlated_features for item in sublist]
        # all_features_set = {var: [col for col in flattened_list if var in col] for var in bucket}
        all_features_set = {var: [col for col in flattened_list if var in col] for var in bucket
                            if len([col for col in flattened_list if var in col]) > 0}  # srishti

        channels_all = [values for values in all_features_set.values()]
        st.session_state['combinations'] = list(itertools.product(*channels_all))
        # if 'combinations' not in st.session_state:
        #     st.session_state['combinations'] = combinations_all

        st.session_state['final_selection'] = st.session_state['combinations']
        st.success('Done')

# revenue.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

if 'Model_results' not in st.session_state:
    st.session_state['Model_results'] = {'Model_object': [],
                                         'Model_iteration': [],
                                         'Feature_set': [],
                                         'MAPE': [],
                                         'R2': [],
                                         'ADJR2': [],
                                         'pos_count': []
                                         }


def reset_model_result_dct():
    st.session_state['Model_results'] = {'Model_object': [],
                                         'Model_iteration': [],
                                         'Feature_set': [],
                                         'MAPE': [],
                                         'R2': [],
                                         'ADJR2': [],
                                         'pos_count': []
                                         }


# if st.button('Build Model'):
if 'iterations' not in st.session_state:
    st.session_state['iterations'] = 0

if 'final_selection' not in st.session_state:
    st.session_state['final_selection'] = False

save_path = r"Model/"
with columns[1]:
    if st.session_state['final_selection']:
        st.write(f'Total combinations created {format_numbers(len(st.session_state["final_selection"]))}')

    if st.checkbox('Build all iterations'):
        iterations = len(st.session_state['final_selection'])
    else:
        iterations = st.number_input('Select the number of iterations to perform', min_value=0, step=100,
                                     value=st.session_state['iterations'], on_change=reset_model_result_dct)
    # st.write("iterations=", iterations)

if st.button('Build Model', on_click=reset_model_result_dct):
    st.session_state['iterations'] = iterations

    # Section 4 - Model
    # st.session_state['media_data'] = st.session_state['media_data'].fillna(method='ffill')
    st.session_state['media_data'] = st.session_state['media_data'].ffill()
    st.markdown(
        'Data Split -- Training Period: May 9th, 2023 - October 5th, 2023 , Testing Period: October 6th, 2023 - November 7th, 2023 ')
    progress_bar = st.progress(0)  # Initialize the progress bar
    # time_remaining_text = st.empty()  # Create an empty space for time remaining text
    start_time = time.time()  # Record the start time
    progress_text = st.empty()
    # time_elapsed_text = st.empty()

    # for i, selected_features in enumerate(st.session_state["final_selection"][40000:40000 + int(iterations)]):
    # st.write(st.session_state["final_selection"])
    # for i, selected_features in enumerate(st.session_state["final_selection"]):
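    # When the data is panel-level, each iteration below fits a random-intercept
    # model through the formula API: fixed effects for the chosen media variables
    # plus one random intercept per panel. Sketch of the call being assembled
    # (feature names are illustrative):
    #
    #   md_str = "total_approved_accounts_revenue ~ tv_adstock_0_3 + search_lag_1"
    #   md = smf.mixedlm(md_str, data=X_train, groups=X_train[panel_col])
    #   mdf = md.fit()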
    if is_panel == True:
        for i, selected_features in enumerate(st.session_state["final_selection"][0:int(iterations)]):  # srishti
            df = st.session_state['media_data']

            fet = [var for var in selected_features if len(var) > 0]
            inp_vars_str = " + ".join(fet)  # new

            X = df[fet]
            y = df[target_col]
            ss = MinMaxScaler()
            X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)

            X[target_col] = y  # Sprint2
            X[panel_col] = df[panel_col]  # Sprint2

            X_train = X.iloc[:8000]
            X_test = X.iloc[8000:]
            y_train = y.iloc[:8000]
            y_test = y.iloc[8000:]

            print(X_train.shape)
            # model = sm.OLS(y_train, X_train).fit()
            md_str = target_col + " ~ " + inp_vars_str
            # md = smf.mixedlm("total_approved_accounts_revenue ~ {}".format(inp_vars_str),
            #                  data=X_train[[target_col] + fet],
            #                  groups=X_train[panel_col])
            md = smf.mixedlm(md_str,
                             data=X_train[[target_col] + fet],
                             groups=X_train[panel_col])
            mdf = md.fit()
            predicted_values = mdf.fittedvalues

            coefficients = mdf.fe_params.to_dict()
            model_positive = [col for col in coefficients.keys() if coefficients[col] > 0]
            pvalues = [var for var in list(mdf.pvalues) if var <= 0.06]

            if (len(model_positive) / len(selected_features)) > 0 and (
                    len(pvalues) / len(selected_features)) >= 0:  # srishti - changed just for testing, revert later
                # predicted_values = model.predict(X_train)
                mape = mean_absolute_percentage_error(y_train, predicted_values)
                r2 = r2_score(y_train, predicted_values)
                # standard adjusted R2: penalize R2 by the number of predictors
                adjr2 = 1 - (1 - r2) * (len(y_train) - 1) / (len(y_train) - len(selected_features) - 1)

                filename = os.path.join(save_path, f"model_{i}.pkl")
                with open(filename, "wb") as f:
                    pickle.dump(mdf, f)
                # with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file:
                #     model = pickle.load(file)

                st.session_state['Model_results']['Model_object'].append(filename)
                st.session_state['Model_results']['Model_iteration'].append(i)
                st.session_state['Model_results']['Feature_set'].append(fet)
                st.session_state['Model_results']['MAPE'].append(mape)
                st.session_state['Model_results']['R2'].append(r2)
                st.session_state['Model_results']['pos_count'].append(len(model_positive))
                st.session_state['Model_results']['ADJR2'].append(adjr2)

            current_time = time.time()
            time_taken = current_time - start_time
            time_elapsed_minutes = time_taken / 60
            completed_iterations_text = f"{i + 1}/{iterations}"
            progress_bar.progress((i + 1) / int(iterations))
            progress_text.text(
                f'Completed iterations: {completed_iterations_text}, Time Elapsed (min): {time_elapsed_minutes:.2f}')

        st.write(
            f'Out of {st.session_state["iterations"]} iterations : {len(st.session_state["Model_results"]["Model_object"])} valid models')

    else:
        for i, selected_features in enumerate(st.session_state["final_selection"][0:int(iterations)]):  # srishti
            df = st.session_state['media_data']

            fet = [var for var in selected_features if len(var) > 0]
            inp_vars_str = " + ".join(fet)

            X = df[fet]
            y = df[target_col]
            ss = MinMaxScaler()
            X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
            X = sm.add_constant(X)
            X_train = X.iloc[:130]
            X_test = X.iloc[130:]
            y_train = y.iloc[:130]
            y_test = y.iloc[130:]

            model = sm.OLS(y_train, X_train).fit()

            coefficients = model.params.to_list()
            model_positive = [coef for coef in coefficients if coef > 0]
            predicted_values = model.predict(X_train)
            pvalues = [var for var in list(model.pvalues) if var <= 0.06]

            # if (len(model_positive) / len(selected_features)) > 0.9 and (len(pvalues) / len(selected_features)) >= 0.8:
            if (len(model_positive) / len(selected_features)) > 0 and (
                    len(pvalues) / len(selected_features)) >= 0.5:  # srishti - changed just for testing, revert later VALID MODEL CRITERIA
                # predicted_values = model.predict(X_train)
                mape = mean_absolute_percentage_error(y_train, predicted_values)
                adjr2 = model.rsquared_adj
                r2 = model.rsquared

                filename = os.path.join(save_path, f"model_{i}.pkl")
                with open(filename, "wb") as f:
                    pickle.dump(model, f)
                # with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file:
                #     model = pickle.load(file)

                st.session_state['Model_results']['Model_object'].append(filename)
                st.session_state['Model_results']['Model_iteration'].append(i)
                st.session_state['Model_results']['Feature_set'].append(fet)
                st.session_state['Model_results']['MAPE'].append(mape)
                st.session_state['Model_results']['R2'].append(r2)
                st.session_state['Model_results']['ADJR2'].append(adjr2)
                st.session_state['Model_results']['pos_count'].append(len(model_positive))

            current_time = time.time()
            time_taken = current_time - start_time
            time_elapsed_minutes = time_taken / 60
            completed_iterations_text = f"{i + 1}/{iterations}"
            progress_bar.progress((i + 1) / int(iterations))
            progress_text.text(
                f'Completed iterations: {completed_iterations_text}, Time Elapsed (min): {time_elapsed_minutes:.2f}')

        st.write(
            f'Out of {st.session_state["iterations"]} iterations : {len(st.session_state["Model_results"]["Model_object"])} valid models')

    pd.DataFrame(st.session_state['Model_results']).to_csv('model_output.csv')


def to_percentage(value):
    return f'{value * 100:.1f}%'


## Section 5 - Select Model
st.title('2. Select Models')

if 'tick' not in st.session_state:
    st.session_state['tick'] = False

if st.checkbox('Show results of top 10 models (based on MAPE and Adj. R2)', value=st.session_state['tick']):
    st.session_state['tick'] = True
    st.write('Select one model iteration to generate performance metrics for it:')

    data = pd.DataFrame(st.session_state['Model_results'])
    # Sprint4 -- Srishti -- only show models with the lowest num of neg coeffs
    data = data[data['pos_count'] == data['pos_count'].max()].reset_index(drop=True)
    data.sort_values(by=['ADJR2'], ascending=False, inplace=True)
    data.drop_duplicates(subset='Model_iteration', inplace=True)

    top_10 = data.head(10).copy()  # copy to avoid a SettingWithCopyWarning when adding Rank below
    top_10['Rank'] = np.arange(1, len(top_10) + 1, 1)
    top_10[['MAPE', 'R2', 'ADJR2']] = np.round(top_10[['MAPE', 'R2', 'ADJR2']], 4).applymap(to_percentage)
    top_10_table = top_10[['Rank', 'Model_iteration', 'MAPE', 'ADJR2', 'R2']]
    # top_10_table.columns = [['Rank', 'Model Iteration Index', 'MAPE', 'Adjusted R2', 'R2']]

    gd = GridOptionsBuilder.from_dataframe(top_10_table)
    gd.configure_pagination(enabled=True)
    gd.configure_selection(
        use_checkbox=True,
        selection_mode="single",
        pre_select_all_rows=False,
        pre_selected_rows=[1],
    )
    gridoptions = gd.build()

    table = AgGrid(top_10, gridOptions=gridoptions, update_mode=GridUpdateMode.SELECTION_CHANGED)

    selected_rows = table.selected_rows
    # if st.session_state["selected_rows"] != selected_rows:
    #     st.session_state["build_rc_cb"] = False
    st.session_state["selected_rows"] = selected_rows

    if 'Model' not in st.session_state:
        st.session_state['Model'] = {}

    # Section 6 - Display Results
    if len(selected_rows) > 0:
        st.header('2.1 Results Summary')

        model_object = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Model_object']
        features_set = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Feature_set']

        with open(str(model_object.values[0]), 'rb') as file:
            # print(file)
            model = pickle.load(file)
        st.write(model.summary())
        st.header('2.2 Actual vs. Predicted Plot')

        if is_panel:
            df = st.session_state['media_data']
            X = df[features_set.values[0]]
            y = df[target_col]
            ss = MinMaxScaler()
            X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
            # Sprint2 changes
            X[target_col] = y  # new
            X[panel_col] = df[panel_col]
            X[date_col] = date

            X_train = X.iloc[:8000]
            X_test = X.iloc[8000:].reset_index(drop=True)
            y_train = y.iloc[:8000]
            y_test = y.iloc[8000:].reset_index(drop=True)

            test_spends = spends_data[8000:]  # Sprint3 - test spends for resp curves
            random_eff_df = get_random_effects(media_data, panel_col, model)
            train_pred = model.fittedvalues
            test_pred = mdf_predict(X_test, model, random_eff_df)
            print("__" * 20, test_pred.isna().sum())

        else:
            df = st.session_state['media_data']
            X = df[features_set.values[0]]
            y = df[target_col]
            ss = MinMaxScaler()
            X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
            X = sm.add_constant(X)
            X[date_col] = date
            X_train = X.iloc[:130]
            X_test = X.iloc[130:].reset_index(drop=True)
            y_train = y.iloc[:130]
            y_test = y.iloc[130:].reset_index(drop=True)

            test_spends = spends_data[130:]  # Sprint3 - test spends for resp curves
            train_pred = model.predict(X_train[features_set.values[0] + ['const']])
            test_pred = model.predict(X_test[features_set.values[0] + ['const']])

        # save x test to test - srishti
        x_test_to_save = X_test.copy()
        x_test_to_save['Actuals'] = y_test
        x_test_to_save['Predictions'] = test_pred

        x_train_to_save = X_train.copy()
        x_train_to_save['Actuals'] = y_train
        x_train_to_save['Predictions'] = train_pred

        x_train_to_save.to_csv('Test/x_train_to_save.csv', index=False)
        x_test_to_save.to_csv('Test/x_test_to_save.csv', index=False)

        st.session_state['X'] = X_train
        st.session_state['features_set'] = features_set.values[0]
        print("**" * 20, "selected model features : ", features_set.values[0])

        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train[date_col], y_train,
                                                                                 train_pred, model,
                                                                                 target_column=sel_target_col,
                                                                                 is_panel=is_panel)  # Sprint2
        st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

        st.markdown('## 2.3 Residual Analysis')
        columns = st.columns(2)
        with columns[0]:
            fig = plot_residual_predicted(y_train, train_pred, X_train)  # Sprint2
            st.plotly_chart(fig)

        with columns[1]:
            st.empty()
            fig = qqplot(y_train, train_pred)  # Sprint2
            st.plotly_chart(fig)

        with columns[0]:
            fig = residual_distribution(y_train, train_pred)  # Sprint2
            st.pyplot(fig)
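        # The VIF table below quantifies multicollinearity: each predictor j is
        # regressed on the remaining predictors and VIF_j = 1 / (1 - R_j^2).
        # Values near 1 mean the predictor is nearly independent of the rest;
        # the bar chart flags < 3 green, 3-10 orange, > 10 red. An illustrative
        # hand computation for one column (sketch only):
        #
        #   others = X_train.drop(columns=[col])
        #   r2_j = sm.OLS(X_train[col], others).fit().rsquared
        #   vif_j = 1 / (1 - r2_j)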
        vif_data = pd.DataFrame()
        # X = X.drop('const', axis=1)
        X_train_orig = X_train.copy()  # Sprint2 -- creating a copy of xtrain. Later deleting panel, target & date from xtrain
        del_col_list = list(set([target_col, panel_col, date_col]).intersection(list(X_train.columns)))
        X_train.drop(columns=del_col_list, inplace=True)  # Sprint2

        vif_data["Variable"] = X_train.columns
        vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
        vif_data.sort_values(by=['VIF'], ascending=False, inplace=True)
        vif_data = np.round(vif_data)
        vif_data['VIF'] = vif_data['VIF'].astype(float)

        st.header('2.4 Variance Inflation Factor (VIF)')
        # st.dataframe(vif_data)
        color_mapping = {
            'darkgreen': (vif_data['VIF'] < 3),
            'orange': (vif_data['VIF'] >= 3) & (vif_data['VIF'] <= 10),
            'darkred': (vif_data['VIF'] > 10)
        }

        # Create a horizontal bar plot
        fig, ax = plt.subplots()
        fig.set_figwidth(10)  # Adjust the width of the figure as needed

        # Sort the bars by descending VIF values
        vif_data = vif_data.sort_values(by='VIF', ascending=False)

        # Iterate through the color mapping and plot bars with corresponding colors
        for color, condition in color_mapping.items():
            subset = vif_data[condition]
            bars = ax.barh(subset["Variable"], subset["VIF"], color=color, label=color)

            # Add text annotations on top of the bars
            for bar in bars:
                width = bar.get_width()
                ax.annotate(f'{width}', xy=(width, bar.get_y() + bar.get_height() / 2), xytext=(5, 0),
                            textcoords='offset points', va='center')

        # Customize the plot
        ax.set_xlabel('VIF Values')
        # ax.set_title('2.4 Variance Inflation Factor (VIF)')
        # ax.legend(loc='upper right')

        # Display the plot in Streamlit
        st.pyplot(fig)

        with st.expander('Results Summary Test data'):
            # ss = MinMaxScaler()
            # X_test = pd.DataFrame(ss.fit_transform(X_test), columns=X_test.columns)
            st.header('2.2 Actual vs. Predicted Plot')

            metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_test[date_col], y_test,
                                                                                     test_pred, model,
                                                                                     target_column=sel_target_col,
                                                                                     is_panel=is_panel)  # Sprint2
            st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

            st.markdown('## 2.3 Residual Analysis')
            columns = st.columns(2)
            with columns[0]:
                fig = plot_residual_predicted(y, test_pred, X_test)  # Sprint2
                st.plotly_chart(fig)

            with columns[1]:
                st.empty()
                fig = qqplot(y, test_pred)  # Sprint2
                st.plotly_chart(fig)

            with columns[0]:
                fig = residual_distribution(y, test_pred)  # Sprint2
                st.pyplot(fig)

        value = False
        save_button_model = st.checkbox('Save this model to tune', key='build_rc_cb')  # , on_click=set_save())
        if save_button_model:
            mod_name = st.text_input('Enter model name')
            if len(mod_name) > 0:
                mod_name = mod_name + "__" + target_col  # Sprint4 - adding target col to model name

                if is_panel:
                    pred_train = model.fittedvalues
                    pred_test = mdf_predict(X_test, model, random_eff_df)
                else:
                    st.session_state['features_set'] = st.session_state['features_set'] + ['const']
                    pred_train = model.predict(X_train_orig[st.session_state['features_set']])
                    pred_test = model.predict(X_test[st.session_state['features_set']])

                st.session_state['Model'][mod_name] = {"Model_object": model,
                                                       'feature_set': st.session_state['features_set'],
                                                       'X_train': X_train_orig,
                                                       'X_test': X_test,
                                                       'y_train': y_train,
                                                       'y_test': y_test,
                                                       'pred_train': pred_train,
                                                       'pred_test': pred_test
                                                       }
                st.session_state['X_train'] = X_train_orig
                # st.session_state['X_test'] = X_test
                # st.session_state['y_train'] = y_train
                # st.session_state['y_test'] = y_test
                st.session_state['X_test_spends'] = test_spends
                # st.session_state['base_model'] = model
                # st.session_state['base_model_feature_set'] = st.session_state['features_set']
                st.session_state['saved_model_names'].append(mod_name)
                # Sprint3 additions
                if is_panel:
                    random_eff_df = get_random_effects(media_data, panel_col, model)
                    st.session_state['random_effects'] = random_eff_df
                # st.session_state['pred_train'] = model.fittedvalues
                # st.session_state['pred_test'] = mdf_predict(X_test, model, random_eff_df)
                # End of Sprint3 additions

                with open("best_models.pkl", "wb") as f:
                    pickle.dump(st.session_state['Model'], f)
                st.success(mod_name + ' model saved! Proceed to the next page to tune the model')

                # Sprint4 - add the formatted name of the target col to used resp metrics
                urm = st.session_state['used_response_metrics']
                urm.append(sel_target_col)
                st.session_state['used_response_metrics'] = list(set(urm))

                mod_name = ""
                value = False
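# The dict saved above is presumably reloaded by the tuning page; a minimal
# sketch of that consumer (hypothetical, for orientation only):
#
#   with open("best_models.pkl", "rb") as f:
#       saved_models = pickle.load(f)
#   entry = saved_models[some_model_name]        # keys look like "<name>__<target_col>"
#   entry["Model_object"], entry["feature_set"]  # model + features to tune from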