Spaces:
Sleeping
Sleeping
| ''' | |
| MMO Build Sprint 3 | |
| additions : adding more variables to session state for saved model : random effect, predicted train & test | |
| MMO Build Sprint 4 | |
| additions : ability to run models for different response metrics | |
| ''' | |
| import streamlit as st | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from Eda_functions import format_numbers | |
| import numpy as np | |
| import pickle | |
| from st_aggrid import AgGrid | |
| from st_aggrid import GridOptionsBuilder, GridUpdateMode | |
| from utilities import set_header, load_local_css | |
| from st_aggrid import GridOptionsBuilder | |
| import time | |
| import itertools | |
| import statsmodels.api as sm | |
| import numpy as npc | |
| import re | |
| import itertools | |
| from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error | |
| from sklearn.preprocessing import MinMaxScaler | |
| import os | |
| import matplotlib.pyplot as plt | |
| from statsmodels.stats.outliers_influence import variance_inflation_factor | |
| st.set_option('deprecation.showPyplotGlobalUse', False) | |
| import statsmodels.api as sm | |
| import statsmodels.formula.api as smf | |
| from datetime import datetime | |
| import seaborn as sns | |
| from Data_prep_functions import * | |
def save_to_pickle(file_path, final_df):
    """Write the transformed dataset to ``file_path`` as a pickle.

    The object is stored wrapped in a one-key dictionary,
    ``{"final_df_transformed": final_df}``.  The file is opened in binary
    write mode, so an existing file at that path is overwritten.
    """
    payload = {"final_df_transformed": final_df}
    with open(file_path, "wb") as handle:
        handle.write(pickle.dumps(payload))
def get_random_effects(media_data, panel_col, mdf):
    """Collect the fitted random intercept of every panel into a lookup table.

    Args:
        media_data: DataFrame containing the ``panel_col`` column.  Its unique
            values, in order of first appearance, define the output rows.
        panel_col: Name of the panel (grouping) column.
        mdf: Fitted model exposing ``random_effects``, a mapping from panel
            value to an object with a ``.values`` array; the first element of
            that array is taken as the panel's intercept.

    Returns:
        DataFrame with the columns ``[panel_col, "random_effect"]``, one row
        per panel and a default ``0..n-1`` index.  The panel column keeps the
        dtype of ``media_data[panel_col]`` and ``random_effect`` is float64,
        so the table can be merged and added to predictions without
        object-dtype surprises.

    Raises:
        KeyError: if a panel present in ``media_data`` has no entry in
            ``mdf.random_effects``.
    """
    markets = media_data[panel_col].unique()
    intercepts = [float(mdf.random_effects[market].values[0]) for market in markets]
    # Build the frame in one go instead of growing it row by row with .loc,
    # which left both columns as dtype=object.
    return pd.DataFrame(
        {
            panel_col: markets,
            "random_effect": pd.Series(intercepts, dtype="float64"),
        },
        columns=[panel_col, "random_effect"],
    )
def mdf_predict(X_df, mdf, random_eff_df):
    """Predict with a fitted mixed model: ``mdf.predict`` output plus the panel intercept.

    Args:
        X_df: Feature DataFrame to score.  It must contain the panel column(s)
            that ``random_eff_df`` is keyed on.  It is not modified.
        mdf: Fitted model; ``mdf.predict(X)`` supplies the ``fixed_effect``
            part of the prediction.
        random_eff_df: Lookup table with a ``random_effect`` column plus the
            panel key column(s), as produced by ``get_random_effects``.

    Returns:
        Float Series named ``pred`` holding ``fixed_effect + random_effect``
        for every row of ``X_df``, in the same order and carrying
        ``X_df``'s index.  Rows whose panel has no entry in ``random_eff_df``
        get NaN.

    Raises:
        ValueError: if ``random_eff_df`` shares no key column with ``X_df``.
        pandas.errors.MergeError: if ``random_eff_df`` holds duplicate keys.
    """
    # Take the join key from the lookup table itself rather than relying on a
    # module-level ``panel_col`` variable being defined by the caller's script.
    key_cols = [
        col for col in random_eff_df.columns
        if col != "random_effect" and col in X_df.columns
    ]
    if not key_cols:
        raise ValueError(
            "random_eff_df shares no key column with X_df; cannot attach random effects"
        )

    X = X_df.copy()
    X["fixed_effect"] = mdf.predict(X)
    # many_to_one makes duplicated panel keys fail loudly instead of silently
    # multiplying rows.
    merged = pd.merge(X, random_eff_df, on=key_cols, how="left", validate="many_to_one")
    # astype(float) guards against an object-dtype random_effect column.
    pred = merged["fixed_effect"] + merged["random_effect"].astype(float)
    # merge() resets the index; restore the caller's so results align with X_df.
    pred.index = X_df.index
    return pred.rename("pred")
# Page chrome: wide layout with the sidebar collapsed, project CSS and the
# shared header, followed by the pandas version banner and the section title.
page_options = {
    "page_title": "Model Build",
    "page_icon": ":shark:",
    "layout": "wide",
    "initial_sidebar_state": 'collapsed',
}
st.set_page_config(**page_options)

load_local_css('styles.css')
set_header()

st.header(pd.__version__)
st.title('1. Build Your Model')
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Variable bins chosen on the data-import page.
st.session_state['bin_dict'] = _read_pickle("data_import.pkl")["bin_dict"]

# Transformed modelling dataset written by the transformation page.
data = _read_pickle("final_df_transformed.pkl")
media_data = data["final_df_transformed"]

# Sprint4 - the same DataFrame object is shared through session state, so the
# in-place changes made to ``media_data`` further down are visible there too.
st.session_state['media_data'] = media_data
# --- Session-state defaults: each key is only created when it is missing ---

# Sprint4 - response metrics offered in the drop-down come from the import bins.
if 'available_response_metrics' not in st.session_state:
    st.session_state['available_response_metrics'] = st.session_state['bin_dict']["Response Metrics"]

# Sprint4 - one "tuned model exists" flag per response metric, keyed by the
# normalised metric name; every flag starts as False.
if "is_tuned_model" not in st.session_state:
    st.session_state["is_tuned_model"] = {
        metric_label.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_"): False
        for metric_label in st.session_state['available_response_metrics']
    }

# Sprint4 - response metrics for which the user has created & saved a model.
if 'used_response_metrics' not in st.session_state:
    st.session_state['used_response_metrics'] = []

# Sprint4 - names of the models saved so far.
if 'saved_model_names' not in st.session_state:
    st.session_state['saved_model_names'] = []

# Saved models: restore them from the project's persisted state when it holds
# a 'Model' entry, otherwise start with an empty mapping.
if 'Model' not in st.session_state:
    persisted_state = st.session_state["project_dct"]["model_build"].get("session_state_saved")
    if persisted_state is not None and 'Model' in persisted_state:
        st.session_state['Model'] = persisted_state['Model']
    else:
        st.session_state['Model'] = {}
def _normalise_column_name(name):
    """Return ``name`` rewritten into the column naming scheme used for modelling.

    The name is lower-cased; '.', '@' and spaces become underscores; '-' and
    ':' are removed; and a double underscore left by those substitutions is
    collapsed once.
    """
    return (name.lower()
            .replace('.', '_').replace('@', '_').replace(" ", "_")
            .replace('-', '').replace(':', '').replace("__", "_"))


# Sprint4 - select a response metric.  The previously chosen metric is
# pre-selected when it is still available; otherwise fall back to the first
# option instead of failing inside list.index().
available_metrics = st.session_state['available_response_metrics']
saved_target = st.session_state["project_dct"]["model_build"].get("sel_target_col", None)
default_target_idx = available_metrics.index(saved_target) if saved_target in available_metrics else 0
sel_target_col = st.selectbox("Select the response metric",
                              available_metrics,
                              index=default_target_idx)
st.session_state["project_dct"]["model_build"]["sel_target_col"] = sel_target_col

# The target key deliberately keeps its own shorter substitution chain (no
# '.'/'@' handling): the same key is built that way for 'is_tuned_model' and
# is embedded in saved model names.
target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")

# Normalise every column name in place (the DataFrame object is shared with
# st.session_state['media_data']).
new_name_dct = {col: _normalise_column_name(col) for col in media_data.columns}
media_data.columns = [_normalise_column_name(col) for col in media_data.columns]

# Panel column: first entry of the 'Panel Level 1' bin, or '' when the bin is
# empty.  Indexing [0] unconditionally raised IndexError for non-panel data,
# which made the non-panel code paths unreachable.
panel_candidates = [_normalise_column_name(col) for col in st.session_state['bin_dict']['Panel Level 1']]
panel_col = panel_candidates[0] if panel_candidates else ''
date_col = 'date'

is_panel = len(panel_col) > 0
if 'is_panel' not in st.session_state:
    st.session_state['is_panel'] = is_panel

# Chronological order (and panel order within a date) is what the row-count
# based train/test split further down relies on.
id_cols = [date_col, panel_col] if is_panel else [date_col]
media_data.sort_values(id_cols, inplace=True)
media_data.reset_index(drop=True, inplace=True)

date = media_data[date_col]
st.session_state['date'] = date
y = media_data[target_col]

# Sprint3 - spends for response curves: every cost/spend column plus the
# identifying date (and panel) columns.
spend_cols = [c for c in media_data.columns if "_cost" in c.lower() or "_spend" in c.lower()]
spends_data = media_data[spend_cols + id_cols]

# The date column is not a model input; ``date`` above keeps the values.
media_data.drop([date_col], axis=1, inplace=True)
media_data.reset_index(drop=True, inplace=True)
# Two-column layout used by the combination / iteration controls below.
columns = st.columns(2)

old_shape = media_data.shape

# Placeholders created once per session.  Each entry is (key, factory); the
# factory is only called when the key is missing.  'random_effects',
# 'pred_train' and 'pred_test' are the Sprint3 additions.
_placeholder_factories = (
    ("old_shape", lambda: old_shape),
    ('media_data', pd.DataFrame),
    ("orig_media_data", pd.DataFrame),
    ('random_effects', pd.DataFrame),
    ('pred_train', list),
    ('pred_test', list),
)
for _state_key, _make_default in _placeholder_factories:
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
# Section 3 - Create combinations
# Channel buckets (srishti - bucket names changed).  A candidate feature
# belongs to a bucket when the bucket name is a substring of the feature name.
bucket = ['paid_search', 'kwai', 'indicacao', 'infleux', 'influencer', 'fb_level_achieved_tier_2',
          'fb_level_achieved_tier_1', 'paid_social_others',
          'ga_app',
          'digital_tactic_others', "programmatic"
          ]

with columns[0]:
    if st.button('Create Combinations of Variables'):
        # Despite the name, the two best-correlated variants per base variable are kept.
        top_3_correlated_features = []

        # Base variables: media + internal bins, normalised like the data
        # columns, with cost columns excluded.
        original_cols = st.session_state['bin_dict']['Media'] + st.session_state['bin_dict']['Internal']
        original_cols = [col.lower().replace('.', '_').replace('@', '_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
                         for col in original_cols]
        original_cols = [col for col in original_cols if "_cost" not in col]

        for col in original_cols:  # srishti - new
            # DataFrame.filter(regex=...) does a regex search; escape the name
            # so characters such as '(' or '+' are matched literally.
            candidates = st.session_state['media_data'].filter(regex=re.escape(col))
            # The target must appear exactly once (appended below); a duplicate
            # would turn corr()[target_col] into a DataFrame.
            candidates = candidates.drop(columns=[target_col], errors='ignore')
            if candidates.shape[1] == 0:
                continue
            corr_df = pd.concat([candidates, y], axis=1).corr()[target_col].iloc[:-1]
            top_3_correlated_features.append(list(corr_df.sort_values(ascending=False).head(2).index))

        flattened_list = [item for sublist in top_3_correlated_features for item in sublist]

        # Group the shortlisted features by bucket, dropping empty buckets (srishti).
        all_features_set = {}
        for var in bucket:
            bucket_features = [col for col in flattened_list if var in col]
            if bucket_features:
                all_features_set[var] = bucket_features
        channels_all = list(all_features_set.values())

        # One feature per bucket in every combination.  itertools.product()
        # with no iterables yields a single empty tuple, which would count as
        # one (unusable) combination, so handle "no features" explicitly.
        if channels_all:
            st.session_state['combinations'] = list(itertools.product(*channels_all))
        else:
            st.session_state['combinations'] = []
        st.session_state['final_selection'] = st.session_state['combinations']

        if channels_all:
            st.success('Done')
        else:
            st.warning('No variables matched any channel bucket, so no combinations were created')
# Give the target series a clean 0..n-1 index.
y.reset_index(drop=True, inplace=True)

# Results table for the model search: one list per field, created once per
# session.  Field order determines the column order of the results DataFrame.
if 'Model_results' not in st.session_state:
    _result_fields = ('Model_object', 'Model_iteration', 'Feature_set',
                      'MAPE', 'R2', 'ADJR2', 'pos_count')
    st.session_state['Model_results'] = {field: [] for field in _result_fields}
def reset_model_result_dct():
    """Replace ``st.session_state['Model_results']`` with a fresh, empty results table.

    The table maps each result field to an empty list.  Used in this file as
    the callback of the iteration-count input and of the 'Build Model' button.
    """
    empty_results = {}
    for field in ('Model_object', 'Model_iteration', 'Feature_set',
                  'MAPE', 'R2', 'ADJR2', 'pos_count'):
        empty_results[field] = []
    st.session_state['Model_results'] = empty_results
# Defaults for the iteration controls (created once per session).
if 'iterations' not in st.session_state:
    st.session_state['iterations'] = 0

if 'final_selection' not in st.session_state:
    st.session_state['final_selection'] = False

# Directory the fitted candidate models are pickled into.
save_path = r"Model/"

# 'final_selection' stays False until combinations have been created, so
# len() must not be applied to it blindly (len(False) raises TypeError).
total_combinations = len(st.session_state['final_selection']) if st.session_state['final_selection'] else 0

with columns[1]:
    if total_combinations:
        st.write(f'Total combinations created {format_numbers(total_combinations)}')

# Restore the last "build all" choice from the project state (None/missing -> unchecked).
checkbox_default = st.session_state["project_dct"]["model_build"].get("all_iters_check")
if checkbox_default is None:
    checkbox_default = False

if st.checkbox('Build all iterations', value=checkbox_default):
    iterations = total_combinations
    st.session_state["project_dct"]["model_build"]["all_iters_check"] = True
else:
    # Changing the requested count invalidates previously collected results.
    iterations = st.number_input('Select the number of iterations to perform', min_value=0, step=100,
                                 value=st.session_state['iterations'], on_change=reset_model_result_dct)
    st.session_state["project_dct"]["model_build"]["all_iters_check"] = False

st.session_state["project_dct"]["model_build"]["iterations"] = iterations

# Informational only: the page keeps rendering so the user can correct the input.
if iterations < 1:
    st.error('Please enter a number greater than 0')
# Number of leading rows (the data is sorted by date) used to fit each
# candidate model; the remaining rows form the hold-out period.
PANEL_TRAIN_ROWS = 8000
NON_PANEL_TRAIN_ROWS = 130

if st.button('Build Model', on_click=reset_model_result_dct):
    # 'final_selection' is False until combinations exist; treat that like an
    # empty list so neither len() nor slicing fails on a bool.
    candidate_sets = st.session_state["final_selection"] or []
    if len(candidate_sets) < 1:
        st.error('Please create combinations')

    st.session_state["project_dct"]["model_build"]["build_button"] = True
    st.session_state['iterations'] = iterations

    # Section 4 - Model
    st.session_state['media_data'] = st.session_state['media_data'].ffill()
    # NOTE(review): the period text below is hard-coded and is not derived
    # from the data or from the row-count split - confirm it is still accurate.
    st.markdown(
        'Data Split -- Training Period: May 9th, 2023 - October 5th,2023 , Testing Period: October 6th, 2023 - November 7th, 2023 ')

    progress_bar = st.progress(0)  # Initialize the progress bar
    start_time = time.time()  # Record the start time
    progress_text = st.empty()

    # The pickles below are written into save_path; make sure it exists.
    os.makedirs(save_path, exist_ok=True)

    total_iterations = int(iterations)
    train_rows = PANEL_TRAIN_ROWS if is_panel else NON_PANEL_TRAIN_ROWS
    # VALID MODEL CRITERIA (srishti - relaxed just for testing, revert later):
    # at least one positive coefficient, plus a minimum share of p-values <= 0.06
    # relative to the number of selected features.
    min_significant_share = 0 if is_panel else 0.5

    for i, selected_features in enumerate(candidate_sets[0:total_iterations]):  # srishti
        df = st.session_state['media_data']
        fet = [var for var in selected_features if len(var) > 0]

        # A combination without any usable feature cannot be fitted; skip the
        # modelling but still report progress for it.
        if fet:
            y = df[target_col]
            X = df[fet]
            X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)
            y_train = y.iloc[:train_rows]

            if is_panel:
                # Mixed model: fixed effects for the features, random
                # intercept per panel.
                X[target_col] = y  # Sprint2
                X[panel_col] = df[panel_col]  # Sprint2
                X_train = X.iloc[:train_rows]
                md_str = target_col + " ~ " + " + ".join(fet)
                fitted_model = smf.mixedlm(md_str,
                                           data=X_train[[target_col] + fet],
                                           groups=X_train[panel_col]).fit()
                predicted_values = fitted_model.fittedvalues
                coefficients = fitted_model.fe_params.to_dict()
                model_positive = [name for name, coef in coefficients.items() if coef > 0]
            else:
                # Plain OLS with an intercept column.
                X = sm.add_constant(X)
                X_train = X.iloc[:train_rows]
                fitted_model = sm.OLS(y_train, X_train).fit()
                predicted_values = fitted_model.predict(X_train)
                model_positive = [coef for coef in fitted_model.params.to_list() if coef > 0]

            pvalues = [p for p in list(fitted_model.pvalues) if p <= 0.06]
            positive_share = len(model_positive) / len(selected_features)
            significant_share = len(pvalues) / len(selected_features)

            if positive_share > 0 and significant_share >= min_significant_share:
                mape = mean_absolute_percentage_error(y_train, predicted_values)
                if is_panel:
                    r2 = r2_score(y_train, predicted_values)
                    adjr2 = 1 - (1 - r2) * (len(y_train) - 1) / (len(y_train) - len(selected_features) - 1)
                else:
                    r2 = fitted_model.rsquared
                    adjr2 = fitted_model.rsquared_adj

                filename = os.path.join(save_path, f"model_{i}.pkl")
                with open(filename, "wb") as f:
                    pickle.dump(fitted_model, f)

                results = st.session_state['Model_results']
                results['Model_object'].append(filename)
                results['Model_iteration'].append(i)
                results['Feature_set'].append(fet)
                results['MAPE'].append(mape)
                results['R2'].append(r2)
                results['ADJR2'].append(adjr2)
                results['pos_count'].append(len(model_positive))

        time_elapsed_minutes = (time.time() - start_time) / 60
        completed_iterations_text = f"{i + 1}/{iterations}"
        progress_bar.progress((i + 1) / total_iterations)
        progress_text.text(
            f'Completed iterations: {completed_iterations_text},Time Elapsed (min): {time_elapsed_minutes:.2f}')

    st.write(
        f'Out of {st.session_state["iterations"]} iterations : {len(st.session_state["Model_results"]["Model_object"])} valid models')

    pd.DataFrame(st.session_state['Model_results']).to_csv('model_output.csv')
def to_percentage(value):
    """Format a fraction as a percentage string with one decimal place.

    ``value`` is multiplied by 100 and rendered with a trailing ``%``,
    e.g. ``0.1234`` becomes ``'12.3%'``.
    """
    scaled = value * 100
    return "{:.1f}%".format(scaled)
# Number of leading rows (the data is sorted by date) treated as the training
# window; must match the split used when the candidate models were fitted.
PANEL_TRAIN_ROWS = 8000
NON_PANEL_TRAIN_ROWS = 130

## Section 5 - Select Model
st.title('2. Select Models')
show_results_defualt = st.session_state["project_dct"]["model_build"]["show_results_check"] if st.session_state["project_dct"]["model_build"]['show_results_check'] is not None else False
if 'tick' not in st.session_state:
    st.session_state['tick'] = False

if st.checkbox('Show results of top 10 models (based on MAPE and Adj. R2)', value=show_results_defualt):
    st.session_state["project_dct"]["model_build"]["show_results_check"] = True
    st.session_state['tick'] = True
    st.write('Select one model iteration to generate performance metrics for it:')

    data = pd.DataFrame(st.session_state['Model_results'])
    # Sprint4 -- Srishti -- only keep models with the highest number of
    # positive coefficients, then rank them by adjusted R2.
    data = data[data['pos_count'] == data['pos_count'].max()].reset_index(drop=True)
    data.sort_values(by=['ADJR2'], ascending=False, inplace=True)
    data.drop_duplicates(subset='Model_iteration', inplace=True)

    # Work on a copy: head() returns a slice of ``data`` and the columns
    # below are overwritten with formatted strings.
    top_10 = data.head(10).copy()
    top_10['Rank'] = np.arange(1, len(top_10) + 1, 1)
    pct_cols = ['MAPE', 'R2', 'ADJR2']
    rounded = np.round(top_10[pct_cols], 4)
    for pct_col in pct_cols:
        # Series.map replaces the deprecated DataFrame.applymap.
        top_10[pct_col] = rounded[pct_col].map(to_percentage)
    top_10_table = top_10[['Rank', 'Model_iteration', 'MAPE', 'ADJR2', 'R2']]

    gd = GridOptionsBuilder.from_dataframe(top_10_table)
    gd.configure_pagination(enabled=True)
    gd.configure_selection(
        use_checkbox=True,
        selection_mode="single",
        pre_select_all_rows=False,
        pre_selected_rows=[1],
    )
    gridoptions = gd.build()

    table = AgGrid(top_10, gridOptions=gridoptions, update_mode=GridUpdateMode.SELECTION_CHANGED)

    selected_rows = table.selected_rows
    st.session_state["selected_rows"] = selected_rows

    # Section 6 - Display Results
    if len(selected_rows) > 0:
        st.header('2.1 Results Summary')

        chosen = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]
        model_object = chosen['Model_object']
        features_set = chosen['Feature_set']

        # The pickle was written by the 'Build Model' step of this page.
        with open(str(model_object.values[0]), 'rb') as file:
            model = pickle.load(file)
        st.write(model.summary())
        st.header('2.2 Actual vs. Predicted Plot')

        # Rebuild the design matrix exactly as it was built for fitting:
        # min-max scaled features, split by row count into train / test.
        df = st.session_state['media_data']
        X = df[features_set.values[0]]
        y = df[target_col]
        X = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)

        if is_panel:
            train_rows = PANEL_TRAIN_ROWS
            # Sprint2 changes
            X[target_col] = y
            X[panel_col] = df[panel_col]
            X[date_col] = date
        else:
            train_rows = NON_PANEL_TRAIN_ROWS
            X = sm.add_constant(X)
            X[date_col] = date

        X_train = X.iloc[:train_rows]
        X_test = X.iloc[train_rows:].reset_index(drop=True)
        y_train = y.iloc[:train_rows]
        y_test = y.iloc[train_rows:].reset_index(drop=True)
        test_spends = spends_data[train_rows:]  # Sprint3 - test spends for resp curves

        if is_panel:
            random_eff_df = get_random_effects(media_data, panel_col, model)
            train_pred = model.fittedvalues
            test_pred = mdf_predict(X_test, model, random_eff_df)
        else:
            # A model fitted without a formula treats prediction input
            # positionally, so the columns must be passed in the order used
            # for fitting (taken from the fitted parameters), not as
            # features + ['const'].
            exog_cols = list(model.params.index)
            train_pred = model.predict(X_train[exog_cols])
            test_pred = model.predict(X_test[exog_cols])

        st.session_state['X'] = X_train
        st.session_state['features_set'] = features_set.values[0]

        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train[date_col], y_train, train_pred,
                                                                                 model, target_column=sel_target_col,
                                                                                 is_panel=is_panel)  # Sprint2

        st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

        st.markdown('## 2.3 Residual Analysis')
        columns = st.columns(2)
        with columns[0]:
            fig = plot_residual_predicted(y_train, train_pred, X_train)  # Sprint2
            st.plotly_chart(fig)

        with columns[1]:
            st.empty()
            fig = qqplot(y_train, train_pred)  # Sprint2
            st.plotly_chart(fig)

        with columns[0]:
            fig = residual_distribution(y_train, train_pred)  # Sprint2
            st.pyplot(fig)

        vif_data = pd.DataFrame()
        # Sprint2 -- keep a full copy of X_train, then strip target, panel and
        # date in place.  The in-place drop is deliberate: it also affects
        # the object stored in st.session_state['X'] above.
        X_train_orig = X_train.copy()
        del_col_list = list(set([target_col, panel_col, date_col]).intersection(set(X_train.columns)))
        X_train.drop(columns=del_col_list, inplace=True)  # Sprint2

        vif_data["Variable"] = X_train.columns
        vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
        vif_data.sort_values(by=['VIF'], ascending=False, inplace=True)
        vif_data = np.round(vif_data)
        vif_data['VIF'] = vif_data['VIF'].astype(float)
        st.header('2.4 Variance Inflation Factor (VIF)')

        # Traffic-light colouring: < 3 green, 3..10 orange, > 10 red.
        color_mapping = {
            'darkgreen': (vif_data['VIF'] < 3),
            'orange': (vif_data['VIF'] >= 3) & (vif_data['VIF'] <= 10),
            'darkred': (vif_data['VIF'] > 10)
        }

        # Create a horizontal bar plot
        fig, ax = plt.subplots()
        fig.set_figwidth(10)  # Adjust the width of the figure as needed

        # Sort the bars by descending VIF values
        vif_data = vif_data.sort_values(by='VIF', ascending=False)

        # Plot each colour group separately and label every bar with its value.
        for color, condition in color_mapping.items():
            subset = vif_data[condition]
            bars = ax.barh(subset["Variable"], subset["VIF"], color=color, label=color)

            for bar in bars:
                width = bar.get_width()
                ax.annotate(f'{width:}', xy=(width, bar.get_y() + bar.get_height() / 2), xytext=(5, 0),
                            textcoords='offset points', va='center')

        ax.set_xlabel('VIF Values')

        # Display the plot in Streamlit
        st.pyplot(fig)

        with st.expander('Results Summary Test data'):
            st.header('2.2 Actual vs. Predicted Plot')

            metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_test[date_col], y_test,
                                                                                     test_pred, model,
                                                                                     target_column=sel_target_col,
                                                                                     is_panel=is_panel)  # Sprint2

            st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

            st.markdown('## 2.3 Residual Analysis')
            columns = st.columns(2)
            # Residuals of the hold-out period must be computed against
            # y_test; the full series ``y`` is not aligned with test_pred.
            with columns[0]:
                fig = plot_residual_predicted(y_test, test_pred, X_test)  # Sprint2
                st.plotly_chart(fig)

            with columns[1]:
                st.empty()
                fig = qqplot(y_test, test_pred)  # Sprint2
                st.plotly_chart(fig)

            with columns[0]:
                fig = residual_distribution(y_test, test_pred)  # Sprint2
                st.pyplot(fig)

        save_button_model = st.checkbox('Save this model to tune', key='build_rc_cb')

        if save_button_model:
            mod_name = st.text_input('Enter model name')
            if len(mod_name) > 0:
                mod_name = mod_name + "__" + target_col  # Sprint4 - adding target col to model name

                if not is_panel:
                    # The stored feature list of an OLS model includes its intercept column.
                    st.session_state['features_set'] = st.session_state['features_set'] + ['const']

                # The train / test predictions computed above are exactly
                # what gets saved with the model.
                st.session_state['Model'][mod_name] = {"Model_object": model,
                                                       'feature_set': st.session_state['features_set'],
                                                       'X_train': X_train_orig,
                                                       'X_test': X_test,
                                                       'y_train': y_train,
                                                       'y_test': y_test,
                                                       'pred_train': train_pred,
                                                       'pred_test': test_pred
                                                       }
                st.session_state['X_train'] = X_train_orig
                st.session_state['X_test_spends'] = test_spends
                st.session_state['saved_model_names'].append(mod_name)

                # Sprint3 additions
                if is_panel:
                    st.session_state['random_effects'] = random_eff_df

                with open("best_models.pkl", "wb") as f:
                    pickle.dump(st.session_state['Model'], f)
                st.success(mod_name + ' model saved! Proceed to the next page to tune the model')

                # Sprint4 - remember that this response metric now has a saved model.
                urm = st.session_state['used_response_metrics']
                urm.append(sel_target_col)
                st.session_state['used_response_metrics'] = list(set(urm))

                # Persist the pieces of session state that later pages need
                # into the project file.
                st.session_state["project_dct"]["model_build"]["session_state_saved"] = {}
                for key in ['Model', 'bin_dict', 'used_response_metrics', 'date', 'saved_model_names', 'media_data', 'X_test_spends']:
                    st.session_state["project_dct"]["model_build"]["session_state_saved"][key] = st.session_state[key]

                project_dct_path = os.path.join(st.session_state['project_path'], "project_dct.pkl")
                with open(project_dct_path, 'wb') as f:
                    pickle.dump(st.session_state["project_dct"], f)

                st.toast("💾 Saved Successfully!")
else:
    st.session_state["project_dct"]["model_build"]["show_results_check"] = False