Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	| import plotly.express as px | |
| import numpy as np | |
| import plotly.graph_objects as go | |
| import streamlit as st | |
| import pandas as pd | |
| import statsmodels.api as sm | |
| from sklearn.metrics import mean_absolute_percentage_error | |
| import sys | |
| import os | |
| from utilities import set_header, load_local_css, load_authenticator | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| import sweetviz as sv | |
| import tempfile | |
| from sklearn.preprocessing import MinMaxScaler | |
| from st_aggrid import AgGrid | |
| from st_aggrid import GridOptionsBuilder, GridUpdateMode | |
| from st_aggrid import GridOptionsBuilder | |
| import sys | |
| import re | |
# Raise the interpreter's recursion ceiling before any heavy library work runs.
sys.setrecursionlimit(10**6)

# Point stdout at a scratch file, close that file straight away and restore
# the real stream. The visible net effect is that "temp_stdout.txt" is
# created (or truncated) in the working directory.
original_stdout = sys.stdout
scratch_stdout = open("temp_stdout.txt", "w")
sys.stdout = scratch_stdout
scratch_stdout.close()
sys.stdout = original_stdout

# Page chrome: wide layout, project stylesheet, shared header.
st.set_page_config(layout="wide")
load_local_css("styles.css")
set_header()

# Write every stored session value back onto itself, skipping the auth keys
# and form-submitter entries.
# NOTE(review): presumably this keeps widget state alive between pages —
# confirm the intent before removing.
for k, v in st.session_state.items():
    if k in ("logout", "login", "config") or k.startswith("FormSubmitter"):
        continue
    st.session_state[k] = v

# Reuse the authenticator cached in the session, or build one on first visit.
authenticator = st.session_state.get("authenticator")
authenticator = load_authenticator() if authenticator is None else authenticator

# The login widget's return values are unpacked here, but the gate that
# follows reads the status from session state instead.
name, authentication_status, username = authenticator.login("Login", "main")
auth_status = st.session_state.get("authentication_status")
| # Authenticated branch of the page. | |
| # NOTE(review): indentation was lost in this copy of the file; this `if` presumably encloses | |
| # everything down to the matching `elif auth_status == False:` near the end - confirm against the original. | |
| if auth_status == True: | |
| # Reads an "initialized" flag from session state (defaults to False when absent). | |
| is_state_initiaized = st.session_state.get("initialized", False) | |
| if not is_state_initiaized: | |
| # Placeholder: `a` is not referenced anywhere else in the visible code. | |
| a = 1 | |
def plot_residual_predicted(actual, predicted, df_):
    """Build a scatter plot of standardized residuals against predictions.

    Args:
        actual: Observed target values. ``predicted`` (wrapped in a
            ``pd.Series``) is subtracted from it.
        predicted: Model predictions; also used as the x-axis values.
        df_: DataFrame handed to ``px.scatter`` as the plotting frame. It is
            copied first, so the caller's frame is no longer modified.

    Returns:
        A 600x400 Plotly figure with a dashed line at 0 and solid red lines
        at +2 and -2 standardized residuals.
    """
    # Work on a private copy: the helper columns below are only needed for
    # plotting and must not leak into the caller's DataFrame.
    plot_df = df_.copy()

    plot_df["Residuals"] = actual - pd.Series(predicted)
    residuals = plot_df["Residuals"]
    plot_df["StdResidual"] = (residuals - residuals.mean()) / residuals.std()

    fig = px.scatter(
        plot_df,
        x=predicted,
        y="StdResidual",
        opacity=0.5,
        color_discrete_sequence=["#11B6BD"],
    )

    # Reference lines: zero residual plus the conventional +/-2 band.
    fig.add_hline(y=0, line_dash="dash", line_color="darkorange")
    fig.add_hline(y=2, line_color="red")
    fig.add_hline(y=-2, line_color="red")

    fig.update_xaxes(title="Predicted")
    fig.update_yaxes(title="Standardized Residuals (Actual - Predicted)")

    # Fixed size so this figure matches the QQ plot shown beside it.
    fig.update_layout(
        title="Residuals over Predicted Values",
        autosize=False,
        width=600,
        height=400,
    )
    return fig
def residual_distribution(actual, predicted):
    """Draw a histogram (with KDE overlay) of the raw residuals.

    Args:
        actual: Observed target values.
        predicted: Model predictions; wrapped in a ``pd.Series`` before being
            subtracted from ``actual``.

    Returns:
        The ``matplotlib.pyplot`` module itself (not a Figure object), with a
        new 6x4 figure current and the histogram drawn on it.
    """
    residual_values = actual - pd.Series(predicted)

    sns.set(style="whitegrid")
    plt.figure(figsize=(6, 4))

    # histplot draws on the current axes and hands them back; label them there.
    axes = sns.histplot(residual_values, kde=True, color="#11B6BD")
    axes.set_title(" Distribution of Residuals")
    axes.set_xlabel("Residuals")
    axes.set_ylabel("Probability Density")
    return plt
def qqplot(actual, predicted):
    """Build a QQ plot of the standardized residuals.

    Args:
        actual: Observed target values.
        predicted: Model predictions; wrapped in a ``pd.Series`` before being
            subtracted from ``actual``.

    Returns:
        A 600x400 Plotly figure holding the sample-vs-theoretical quantile
        markers and a red reference line from (-2, -2) to (2, 2).
    """
    residuals = pd.Series(actual - pd.Series(predicted))
    standardized = (residuals - residuals.mean()) / residuals.std()

    # Build the probability plot once and read both quantile arrays from it,
    # rather than constructing it separately for each axis.
    prob_plot = sm.ProbPlot(standardized)

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=prob_plot.theoretical_quantiles,
            y=prob_plot.sample_quantiles,
            mode="markers",
            marker=dict(size=5, color="#11B6BD"),
            name="QQ Plot",
        )
    )

    # 45-degree reference line. The span is fixed at [-2, 2]; widen it if the
    # quantiles routinely fall outside that range.
    fig.add_trace(
        go.Scatter(
            x=[-2, 2],
            y=[-2, 2],
            mode="lines",
            line=dict(color="red"),
            name=" ",
        )
    )

    fig.update_layout(
        title="QQ Plot of Residuals",
        title_x=0.5,
        autosize=False,
        width=600,
        height=400,
        xaxis_title="Theoretical Quantiles",
        yaxis_title="Sample Quantiles",
    )
    return fig
def plot_actual_vs_predicted(date, y, predicted_values, model):
    """Plot actual vs predicted values and tabulate fit metrics.

    Args:
        date: X-axis values shared by both lines.
        y: Actual target values.
        predicted_values: Model predictions for the same rows.
        model: Fitted model; only its ``df_model`` attribute is read here
            (used as the number of predictors).

    Returns:
        ``(metrics_table, fig)`` where ``metrics_table`` is a two-column
        DataFrame ("Metric", "Value") holding MAPE (in percent), R-squared
        and adjusted R-squared, and ``fig`` is the Plotly line chart.
    """
    # --- fit metrics ------------------------------------------------------
    mape = mean_absolute_percentage_error(y, predicted_values) * 100

    residual_ss = np.sum((y - predicted_values) ** 2)
    total_ss = np.sum((y - np.mean(y)) ** 2)
    r_squared = 1 - (residual_ss / total_ss)

    n_obs = len(y)
    n_predictors = model.df_model
    dof_ratio = (n_obs - 1) / (n_obs - n_predictors - 1)
    adj_r_squared = 1 - ((1 - r_squared) * dof_ratio)

    metrics_table = pd.DataFrame(
        {
            "Metric": ["MAPE", "R-squared", "AdjR-squared"],
            "Value": [mape, r_squared, adj_r_squared],
        }
    )

    # --- figure -----------------------------------------------------------
    fig = go.Figure()
    line_specs = (
        ("Actual", y, "blue"),
        ("Predicted", predicted_values, "orange"),
    )
    for label, series, colour in line_specs:
        fig.add_trace(
            go.Scatter(
                x=date,
                y=series,
                mode="lines",
                name=label,
                line=dict(color=colour),
            )
        )

    fig.update_layout(
        xaxis=dict(title="Date"),
        yaxis=dict(title="Value"),
        title=f"MAPE : {mape:.2f}%, AdjR2: {adj_r_squared:.2f}",
        xaxis_tickangle=-30,
    )
    return metrics_table, fig
def contributions(X, model):
    """Return each column's percentage share of the summed model contribution.

    Every column of ``X`` is multiplied by the model parameter at the same
    position, the products are summed per column, and each column total is
    expressed as a percentage of the grand total.

    NOTE: a three-argument ``contributions(X, model, target)`` is defined
    later in this file and replaces this name once it has been executed.

    Args:
        X: Design matrix whose column order matches the order of
            ``model.params``.
        model: Object exposing ``params`` (array-like of coefficients).

    Returns:
        A Series indexed by column name, sorted descending and rounded to two
        decimals. ``X`` itself is not modified.

    Raises:
        IndexError: If the model has fewer parameters than ``X`` has columns.
    """
    coefficients = np.asarray(model.params, dtype=float)
    n_columns = X.shape[1]
    if coefficients.size < n_columns:
        raise IndexError("model has fewer parameters than X has columns")

    # One broadcast multiply instead of a Python loop over the columns;
    # coefficients are still matched to columns by position.
    weighted = X * coefficients[:n_columns]
    column_totals = weighted.sum()
    shares = column_totals / column_totals.sum() * 100
    return np.round(shares.sort_values(ascending=False), 2)
| # Load the modelling dataset from the working directory; the model-fitting code below | |
| # selects its feature and target columns from this frame. | |
| transformed_data = pd.read_csv("transformed_data.csv") | |
| # Hard-coded for now: maps each response metric (a target column of transformed_data) | |
| # to the list of feature columns used to model it. | |
| # NOTE(review): the "_lagN" / "_adst.N" suffixes presumably denote lag and adstock transforms - confirm. | |
| feature_set_dct = { | |
| "app_installs_-_appsflyer": [ | |
| "paid_search_clicks", | |
| "fb:_level_achieved_-_tier_1_impressions_lag2", | |
| "fb:_level_achieved_-_tier_2_clicks_lag2", | |
| "paid_social_others_impressions_adst.1", | |
| "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag2", | |
| "digital_tactic_others_clicks", | |
| "kwai_clicks_adst.3", | |
| "programmaticclicks", | |
| "indicacao_clicks_adst.1", | |
| "infleux_clicks_adst.4", | |
| "influencer_clicks", | |
| ], | |
| "account_requests_-_appsflyer": [ | |
| "paid_search_impressions", | |
| "fb:_level_achieved_-_tier_1_clicks_adst.1", | |
| "fb:_level_achieved_-_tier_2_clicks_adst.1", | |
| "paid_social_others_clicks_lag2", | |
| "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag5_adst.1", | |
| "digital_tactic_others_clicks_adst.1", | |
| "kwai_clicks_adst.2", | |
| "programmaticimpressions_lag4_adst.1", | |
| "indicacao_clicks", | |
| "infleux_clicks_adst.2", | |
| "influencer_clicks", | |
| ], | |
| "total_approved_accounts_-_appsflyer": [ | |
| "paid_search_clicks", | |
| "fb:_level_achieved_-_tier_1_impressions_lag2_adst.1", | |
| "fb:_level_achieved_-_tier_2_impressions_lag2", | |
| "paid_social_others_clicks_lag2_adst.2", | |
| "ga_app:_will_and_cid_pequena_baixo_risco_impressions_lag4", | |
| "digital_tactic_others_clicks", | |
| "kwai_impressions_adst.2", | |
| "programmaticclicks_adst.5", | |
| "indicacao_clicks_adst.1", | |
| "infleux_clicks_adst.3", | |
| "influencer_clicks", | |
| ], | |
| "total_approved_accounts_-_revenue": [ | |
| "paid_search_impressions_adst.5", | |
| "kwai_impressions_lag2_adst.3", | |
| "indicacao_clicks_adst.3", | |
| "infleux_clicks_adst.3", | |
| "programmaticclicks_adst.4", | |
| "influencer_clicks_adst.3", | |
| "fb:_level_achieved_-_tier_1_impressions_adst.2", | |
| "fb:_level_achieved_-_tier_2_impressions_lag3_adst.5", | |
| "paid_social_others_impressions_adst.3", | |
| "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag3_adst.5", | |
| "digital_tactic_others_clicks_adst.2", | |
| ], | |
| } | |
| # TODO: fetch the feature sets from the saved model instead of hard-coding them above. | |
def contributions(X, model, target):
    """Return per-channel percentage contributions as a tidy DataFrame.

    Every column of ``X`` is multiplied by the model parameter at the same
    position, summed, and expressed as a percentage of the grand total
    (rounded to two decimals, sorted descending). Feature names are then
    reduced to a channel label by cutting at the metric suffix.

    Args:
        X: Design matrix whose column order matches ``model.params``.
        model: Object exposing ``params`` (array-like of coefficients).
        target: Name for the value column, either a one-element list (as the
            existing caller passes) or a plain string.

    Returns:
        DataFrame with a "Channel" column followed by the ``target`` column.
        ``X`` itself is not modified.

    Raises:
        IndexError: If the model has fewer parameters than ``X`` has columns.
    """
    value_columns = [target] if isinstance(target, str) else list(target)

    coefficients = np.asarray(model.params, dtype=float)
    n_columns = X.shape[1]
    if coefficients.size < n_columns:
        raise IndexError("model has fewer parameters than X has columns")

    # One broadcast multiply instead of a Python loop over the columns;
    # coefficients are still matched to columns by position.
    column_totals = (X * coefficients[:n_columns]).sum()
    shares = (column_totals / column_totals.sum() * 100).sort_values(
        ascending=False
    )
    shares = shares.round(2)

    result = (
        pd.DataFrame(shares, columns=value_columns)
        .rename_axis("Channel")
        .reset_index()
    )

    # Cut each feature name at its metric suffix to obtain the channel label.
    # The underscore is optional so that names written without one (e.g.
    # "programmaticclicks_adst.5", "programmaticimpressions_lag4") reduce to
    # the same channel; otherwise the label differs between models and the
    # row is lost when per-model tables are merged on "Channel". The original
    # "_imp" / "_cli" prefixes are still honoured.
    suffix_pattern = r"_?(?:impressions|clicks)|_imp|_cli"
    result["Channel"] = [
        re.split(suffix_pattern, feature)[0] for feature in result["Channel"]
    ]
    return result
def model_fit(features_set, target, train_size=150):
    """Fit an OLS model for one response metric and summarise it.

    Reads the module-level ``transformed_data`` frame, min-max scales the
    selected features, adds an intercept column, splits the rows by position
    and fits ``sm.OLS`` on the first ``train_size`` rows.

    Args:
        features_set: Feature column names to select from ``transformed_data``.
        target: Target column name in ``transformed_data``.
        train_size: Number of leading rows used for training; the remainder
            is the test set. Defaults to 150, the previously hard-coded split.

    Returns:
        ``(summary_df, train_contributions)`` where ``summary_df`` is a
        one-row DataFrame with columns "Model", "R2", "ADJr2", "Train Mape",
        "Test Mape", "Summary" and "Model_object", and
        ``train_contributions`` is the output of
        ``contributions(X_train, model, [target])``.
    """
    X = transformed_data[features_set]
    y = transformed_data[target]

    # NOTE(review): the scaler is fit on all rows, train and test alike,
    # before the split. Kept as is because the diagnostics code further down
    # rebuilds X the same way and must stay consistent with the fitted model.
    ss = MinMaxScaler()
    X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
    X = sm.add_constant(X)

    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

    model = sm.OLS(y_train, X_train).fit()

    predicted_values_train = model.predict(X_train)
    predicted_values_test = model.predict(X_test)

    # MAPE values are stored as fractions here (not multiplied by 100).
    train_mape = mean_absolute_percentage_error(y_train, predicted_values_train)
    test_mape = mean_absolute_percentage_error(y_test, predicted_values_test)

    train_contributions = contributions(X_train, model, [target])

    summary_df = pd.DataFrame(
        {
            "Model": target,
            "R2": np.round(model.rsquared, 2),
            "ADJr2": np.round(model.rsquared_adj, 2),
            "Train Mape": np.round(train_mape, 2),
            "Test Mape": np.round(test_mape, 2),
            "Summary": model.summary(),
            "Model_object": model,
        },
        index=[0],
    )
    return summary_df, train_contributions
# Fit one model per response metric, collecting the per-model summary rows
# and merging the per-channel contribution tables side by side.
# model_fit is called exactly once per target, so each OLS is fitted a single
# time, and the contribution table is rebuilt from scratch on every script
# run instead of being merged into whatever an earlier run left in session
# state.
metrics_frames = []
merged_contributions = pd.DataFrame()

for target, feature_set in feature_set_dct.items():
    model_metrics, model_contributions = model_fit(
        features_set=feature_set, target=target
    )
    metrics_frames.append(model_metrics)

    if merged_contributions.empty:
        merged_contributions = model_contributions
    else:
        # Inner join on the channel label: only channels present for every
        # model survive.
        merged_contributions = pd.merge(
            merged_contributions, model_contributions, on="Channel"
        )

st.session_state["contribution_df"] = merged_contributions

# One row per model, re-indexed 0..n-1.
metrics_table = (
    pd.concat(metrics_frames).reset_index(drop=True)
    if metrics_frames
    else pd.DataFrame()
)
# Right-hand column hosts the EDA button; its click is handled further down,
# once a model has been selected in the results table.
eda_columns = st.columns(2)
with eda_columns[1]:
    eda = st.button(
        "Generate EDA Report",
        help="Click to generate a bivariate report for the selected response metric from the table below.",
    )

st.title("Contribution Overview")

# Every column except the channel label is one model's contribution series.
contribution_frame = st.session_state["contribution_df"]
model_columns = [
    col for col in contribution_frame.columns if col.lower() != "channel"
]

contribution_selections = st.multiselect(
    "Select the models to compare contributions",
    model_columns,
    # Pre-select the last model. Slicing (not indexing) keeps this safe when
    # there are no model columns at all.
    default=model_columns[-1:],
)

# One grouped bar series per selected model, labelled with whole percentages.
trace_data = [
    go.Bar(
        x=contribution_frame["Channel"],
        y=contribution_frame[selection],
        name=selection,
        text=np.round(contribution_frame[selection], 0).astype(int).astype(str)
        + "%",
        textposition="outside",
    )
    for selection in contribution_selections
]

layout = go.Layout(
    title="Metrics Contribution by Channel",
    xaxis=dict(title="Channel Name"),
    yaxis=dict(title="Metrics Contribution"),
    barmode="group",
)
fig = go.Figure(data=trace_data, layout=layout)
st.plotly_chart(fig, use_container_width=True)
############################################ Waterfall Chart ############################################
# One waterfall trace per selected model. The intercept ("const") is pulled
# out of the channel list and shown first as "Base Sales".
fig = go.Figure()
waterfall_source = st.session_state["contribution_df"]

for selection in contribution_selections:
    # Named so that it does not rebind the module-level `contributions`
    # function, which model_fit calls.
    contribution_values = waterfall_source[selection].astype(float).tolist()
    channel_names = waterfall_source["Channel"].tolist()

    base_contribution = 0
    display_name, display_contribution = [], []
    for channel_name, contribution in zip(channel_names, contribution_values):
        if channel_name == "const":
            base_contribution = contribution
        else:
            display_name.append(channel_name)
            display_contribution.append(contribution)

    display_name = ["Base Sales"] + display_name
    display_contribution = [base_contribution] + display_contribution

    # Whole-percent labels shown above each bar.
    text_values = [
        f"{val}%" for val in np.round(display_contribution, 0).astype(int)
    ]

    fig.add_trace(
        go.Waterfall(
            name=selection,  # lets traces be told apart when several models are selected
            orientation="v",
            # Every bar is a relative step; none are absolute totals.
            measure=["relative"] * len(display_contribution),
            x=display_name,
            text=text_values,
            textposition="outside",
            y=display_contribution,
            increasing={"marker": {"color": "green"}},
            decreasing={"marker": {"color": "red"}},
            totals={"marker": {"color": "blue"}},
        )
    )

fig.update_layout(
    title="Metrics Contribution by Channel",
    xaxis={"title": "Channel Name"},
    yaxis={"title": "Metrics Contribution"},
    height=600,
)

# Displaying the waterfall chart in Streamlit
st.plotly_chart(fig, use_container_width=True)
############################################ Waterfall Chart ############################################
st.title("Analysis of Models Result")

# Show every metrics column except the last two (the summary object and the
# fitted model object), which cannot be rendered in the grid.
gd_table = metrics_table.iloc[:, :-2]

# Single-row checkbox selection, with the row at index 1 ticked on load.
selection_options = {
    "use_checkbox": True,
    "selection_mode": "single",
    "pre_select_all_rows": False,
    "pre_selected_rows": [1],
}
gd = GridOptionsBuilder.from_dataframe(gd_table)
gd.configure_selection(**selection_options)
gridoptions = gd.build()

table = AgGrid(
    gd_table,
    gridOptions=gridoptions,
    fit_columns_on_grid_load=True,
    height=200,
)

# Nothing ticked: prompt the user and halt this script run here.
if len(table.selected_rows) == 0:
    st.warning(
        "Click on the checkbox to view comprehensive results of the selected model."
    )
    st.stop()
else:
    # The selected row's model name doubles as the target column name and as
    # the key into the feature-set mapping.
    target_column = table.selected_rows[0]["Model"]
    feature_set = feature_set_dct[target_column]
with eda_columns[1]:
    if eda:

        def generate_report_with_target(channel_data, target_feature, output_dir=None):
            """Write a Sweetviz report for ``channel_data`` and return its path.

            Args:
                channel_data: DataFrame to analyse.
                target_feature: Column passed to Sweetviz as ``target_feat``.
                output_dir: Directory to write "report.html" into. When
                    omitted, a fresh temporary directory is created and it is
                    the caller's job to remove it.

            Returns:
                Path of the generated HTML file.
            """
            report = sv.analyze(
                [channel_data, "Dataset"], target_feat=target_feature, verbose=False
            )
            if output_dir is None:
                output_dir = tempfile.mkdtemp()
            report_path = os.path.join(output_dir, "report.html")
            # Generate the report as an HTML file without opening a browser.
            report.show_html(filepath=report_path, open_browser=False)
            return report_path

        # Copy before adding the target column so the assignment lands on an
        # independent frame rather than on a slice of transformed_data.
        report_data = transformed_data[feature_set].copy()
        report_data[target_column] = transformed_data[target_column]

        # Generate into a self-cleaning temporary directory and keep only the
        # bytes, so no temp directory is left behind on each click.
        report_bytes = None
        with tempfile.TemporaryDirectory() as report_dir:
            report_file = generate_report_with_target(
                report_data, target_column, report_dir
            )
            if os.path.exists(report_file):
                with open(report_file, "rb") as f:
                    report_bytes = f.read()

        if report_bytes is not None:
            st.download_button(
                label="Download EDA Report",
                data=report_bytes,
                file_name="report.html",
                mime="text/html",
            )
        else:
            st.warning("Report generation failed. Unable to find the report file.")
# Fetch the fitted model object stored for the selected target.
model = metrics_table.loc[
    metrics_table["Model"] == target_column, "Model_object"
].iloc[0]

st.header("Model Summary")
st.write(model.summary())

# Rebuild the design matrix the same way model_fit does (min-max scaling over
# all rows, then an intercept column) so predictions line up with the model.
X = transformed_data[feature_set]
ss = MinMaxScaler()
X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
X = sm.add_constant(X)
# Copy so that re-indexing below cannot touch transformed_data's own column.
y = transformed_data[target_column].copy()

# Attach the date index BEFORE slicing. Assigning it after the split leaves
# the train/test slices with their positional index, so the "Date" axis of
# the plots below would show row numbers instead of dates.
date_index = pd.Index(transformed_data["date"])
X.index = date_index
y.index = date_index

# Same positional split as model_fit's default (first 150 rows train).
X_train = X.iloc[:150]
X_test = X.iloc[150:]
y_train = y.iloc[:150]
y_test = y.iloc[150:]

# Predict once per split and reuse the results for every plot.
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

metrics_table_train, fig_train = plot_actual_vs_predicted(
    X_train.index, y_train, train_predictions, model
)
metrics_table_test, fig_test = plot_actual_vs_predicted(
    X_test.index, y_test, test_predictions, model
)

# Reshape each metrics table into a single labelled row and stack them.
metrics_table_train = metrics_table_train.set_index("Metric").transpose()
metrics_table_train.index = ["Train"]
metrics_table_test = metrics_table_test.set_index("Metric").transpose()
metrics_table_test.index = ["test"]
metrics_table = np.round(pd.concat([metrics_table_train, metrics_table_test]), 2)

st.markdown("Result Overview")
st.dataframe(metrics_table, use_container_width=True)

st.subheader("Actual vs Predicted Plot Train")
st.plotly_chart(fig_train, use_container_width=True)
st.subheader("Actual vs Predicted Plot Test")
st.plotly_chart(fig_test, use_container_width=True)

st.markdown("## Residual Analysis")
columns = st.columns(2)

# The residual plot helper is handed a copy, so columns it may add to its
# plotting frame never reach X_train.
Xtrain1 = X_train.copy()
with columns[0]:
    fig = plot_residual_predicted(y_train, model.predict(Xtrain1), Xtrain1)
    st.plotly_chart(fig)

with columns[1]:
    st.empty()
    fig = qqplot(y_train, train_predictions)
    st.plotly_chart(fig)

with columns[0]:
    fig = residual_distribution(y_train, train_predictions)
    st.pyplot(fig)
| # Failed-login branch: show the error and offer the forgot-password widget. | |
| # NOTE(review): indentation was lost in this copy of the file; this `elif` presumably pairs with | |
| # the `if auth_status == True:` near the top - confirm against the original. | |
| elif auth_status == False: | |
| st.error("Username/Password is incorrect") | |
| try: | |
| # forgot_password returns the username, the e-mail and a freshly generated password. | |
| username_forgot_pw, email_forgot_password, random_password = ( | |
| authenticator.forgot_password("Forgot password") | |
| ) | |
| if username_forgot_pw: | |
| st.success("New password sent securely") | |
| # NOTE(review): nothing in this block actually delivers random_password to the user; | |
| # the success message above is shown regardless - the secure transfer is still to be implemented. | |
| elif username_forgot_pw == False: | |
| st.error("Username not found") | |
| # Any failure raised by the widget is surfaced to the user as an error message. | |
| except Exception as e: | |
| st.error(e) | |