Spaces:
Sleeping
Sleeping
import plotly.express as px | |
import numpy as np | |
import plotly.graph_objects as go | |
import streamlit as st | |
import pandas as pd | |
import statsmodels.api as sm | |
from sklearn.metrics import mean_absolute_percentage_error | |
import sys | |
import os | |
from utilities import set_header, load_local_css, load_authenticator | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
import sweetviz as sv | |
import tempfile | |
from sklearn.preprocessing import MinMaxScaler | |
from st_aggrid import AgGrid | |
from st_aggrid import GridOptionsBuilder, GridUpdateMode | |
from st_aggrid import GridOptionsBuilder | |
import sys | |
import re | |
# Raise the interpreter recursion limit to 10**6.
sys.setrecursionlimit(10**6)
# Point sys.stdout at a file, close that file immediately and restore the
# original stream. The only lasting effect visible here is that
# "temp_stdout.txt" is created (or truncated) in the working directory.
original_stdout = sys.stdout
sys.stdout = open("temp_stdout.txt", "w")
sys.stdout.close()
sys.stdout = original_stdout
st.set_page_config(layout="wide")
load_local_css("styles.css")
set_header()
# Write every session-state value back under its own key, skipping the
# "logout"/"login"/"config" keys and any key starting with "FormSubmitter".
# NOTE(review): presumably this keeps widget state alive across pages - confirm.
for k, v in st.session_state.items():
    if k not in ["logout", "login", "config"] and not k.startswith("FormSubmitter"):
        st.session_state[k] = v
# Reuse the authenticator kept in session state; build one only if it is absent.
authenticator = st.session_state.get("authenticator")
if authenticator is None:
    authenticator = load_authenticator()
name, authentication_status, username = authenticator.login("Login", "main")
# The status is read with .get(), so it can be None as well as True/False;
# that is why it is compared with True explicitly. The matching
# `elif auth_status == False` branch appears later in the file.
auth_status = st.session_state.get("authentication_status")
if auth_status == True:
    is_state_initiaized = st.session_state.get("initialized", False)
    if not is_state_initiaized:
        # Placeholder assignment; `a` is not read anywhere in the visible file.
        a = 1
def plot_residual_predicted(actual, predicted, df_):
    """Build a scatter plot of standardized residuals against predictions.

    Args:
        actual: Observed values; ``pd.Series(predicted)`` is subtracted from it.
        predicted: Predicted values, also used directly as the x coordinates.
        df_: DataFrame that receives two new columns, "Residuals" and
            "StdResidual". It is modified in place, so pass a copy if the
            caller's frame must stay untouched.

    Returns:
        A Plotly figure with dashed/solid guide lines at 0, +2 and -2.
    """
    # Store the raw residuals first; the standardization below reads them back
    # from the frame so it sees exactly what the assignment aligned.
    df_["Residuals"] = actual - pd.Series(predicted)
    residual_column = df_["Residuals"]
    centred = residual_column - residual_column.mean()
    df_["StdResidual"] = centred / df_["Residuals"].std()

    scatter_options = dict(
        x=predicted,
        y="StdResidual",
        opacity=0.5,
        color_discrete_sequence=["#11B6BD"],
    )
    fig = px.scatter(df_, **scatter_options)

    # Zero line plus the +/-2 standard-deviation guides.
    fig.add_hline(y=0, line_dash="dash", line_color="darkorange")
    for level in (2, -2):
        fig.add_hline(y=level, line_color="red")

    fig.update_xaxes(title="Predicted")
    fig.update_yaxes(title="Standardized Residuals (Actual - Predicted)")

    # Fixed size so this figure matches the neighbouring diagnostic plots.
    layout_options = {
        "title": "Residuals over Predicted Values",
        "autosize": False,
        "width": 600,
        "height": 400,
    }
    fig.update_layout(**layout_options)
    return fig
def residual_distribution(actual, predicted):
    """Draw a histogram with a KDE overlay of the residuals.

    Args:
        actual: Observed values.
        predicted: Predicted values; wrapped in ``pd.Series`` before the
            subtraction, exactly as the other residual helpers do.

    Returns:
        A Matplotlib ``Figure``. It exposes ``savefig`` like the ``pyplot``
        module that was returned previously, so ``st.pyplot(...)`` keeps
        working with it.
    """
    residuals = actual - pd.Series(predicted)

    sns.set(style="whitegrid")
    # Build the figure without registering it in pyplot's global figure list:
    # the page script reruns on every interaction and figures created through
    # plt.figure() would pile up because nothing ever closes them.
    fig = plt.Figure(figsize=(6, 4))
    ax = fig.subplots()
    # stat="density" makes the bars (and the scaled KDE curve) match the
    # "Probability Density" axis label; the default stat is a raw count.
    sns.histplot(residuals, kde=True, stat="density", color="#11B6BD", ax=ax)
    ax.set_title(" Distribution of Residuals")
    ax.set_xlabel("Residuals")
    ax.set_ylabel("Probability Density")
    return fig
def qqplot(actual, predicted):
    """Build a QQ plot of the standardized residuals.

    Args:
        actual: Observed values.
        predicted: Predicted values; wrapped in ``pd.Series`` before the
            subtraction.

    Returns:
        A Plotly figure with the sample-vs-theoretical quantile markers and a
        red 45-degree reference line.
    """
    residuals = pd.Series(actual - pd.Series(predicted))
    standardized = (residuals - residuals.mean()) / residuals.std()

    # Compute the probability plot once and reuse both quantile arrays.
    prob_plot = sm.ProbPlot(standardized)
    theoretical = prob_plot.theoretical_quantiles
    observed = prob_plot.sample_quantiles

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=theoretical,
            y=observed,
            mode="markers",
            marker=dict(size=5, color="#11B6BD"),
            name="QQ Plot",
        )
    )

    # The reference line always covers [-2, 2] and is stretched further when
    # the theoretical quantiles reach beyond that range, so it spans all points.
    line_start, line_end = -2.0, 2.0
    if len(theoretical) > 0:
        line_start = min(line_start, float(np.min(theoretical)))
        line_end = max(line_end, float(np.max(theoretical)))
    fig.add_trace(
        go.Scatter(
            x=[line_start, line_end],
            y=[line_start, line_end],
            mode="lines",
            line=dict(color="red"),
            name=" ",
        )
    )

    fig.update_layout(
        title="QQ Plot of Residuals",
        title_x=0.5,
        autosize=False,
        width=600,
        height=400,
        xaxis_title="Theoretical Quantiles",
        yaxis_title="Sample Quantiles",
    )
    return fig
def plot_actual_vs_predicted(date, y, predicted_values, model):
    """Plot actual against predicted values and compute fit metrics.

    Args:
        date: x-axis values for both line traces.
        y: Observed values.
        predicted_values: Model predictions, in the same order as ``y``.
        model: Fitted model; only its ``df_model`` attribute (number of
            predictors) is read.

    Returns:
        A ``(metrics_table, fig)`` tuple. ``metrics_table`` is a DataFrame with
        "Metric" and "Value" columns holding MAPE (in percent), R-squared and
        adjusted R-squared. A metric that is undefined for the given data is
        reported as NaN instead of raising or returning a misleading number.
    """
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(x=date, y=y, mode="lines", name="Actual", line=dict(color="blue"))
    )
    fig.add_trace(
        go.Scatter(
            x=date,
            y=predicted_values,
            mode="lines",
            name="Predicted",
            line=dict(color="orange"),
        )
    )

    mape = mean_absolute_percentage_error(y, predicted_values) * 100

    # Compare the values by position, the same way the MAPE call does, so the
    # metrics stay consistent even if the two inputs carry different indexes.
    actual_values = np.asarray(y, dtype=float)
    fitted_values = np.asarray(predicted_values, dtype=float)
    rss = np.sum((actual_values - fitted_values) ** 2)
    tss = np.sum((actual_values - np.mean(actual_values)) ** 2)
    # A constant response has no variance to explain; R-squared is undefined.
    r_squared = 1 - (rss / tss) if tss > 0 else float("nan")

    num_predictors = model.df_model
    num_samples = len(actual_values)
    # Adjusted R-squared needs positive residual degrees of freedom. With a
    # short hold-out slice this can be zero (division by zero) or negative
    # (meaningless value), so report NaN in those cases.
    residual_dof = num_samples - num_predictors - 1
    if residual_dof > 0:
        adj_r_squared = 1 - ((1 - r_squared) * ((num_samples - 1) / residual_dof))
    else:
        adj_r_squared = float("nan")

    metrics_table = pd.DataFrame(
        {
            "Metric": ["MAPE", "R-squared", "AdjR-squared"],
            "Value": [mape, r_squared, adj_r_squared],
        }
    )
    fig.update_layout(
        xaxis=dict(title="Date"),
        yaxis=dict(title="Value"),
        title=f"MAPE : {mape:.2f}%, AdjR2: {adj_r_squared:.2f}",
        xaxis_tickangle=-30,
    )
    return metrics_table, fig
def contributions(X, model):
    """Return each column's percentage share of the total weighted sum.

    Every column of ``X`` is multiplied by the coefficient found at the same
    position in ``model.params``; the column totals are then expressed as a
    percentage of the grand total.

    NOTE: a three-argument function with the same name is defined further
    down in this file and replaces this one once that definition executes.

    Args:
        X: DataFrame of model inputs. It is not modified.
        model: Object exposing ``params`` (a pandas Series or any array-like)
            whose order matches the columns of ``X``.

    Returns:
        A Series indexed by column name, rounded to two decimals and sorted
        in descending order.

    Raises:
        IndexError: If ``model.params`` has fewer entries than ``X`` has columns.
    """
    coefficients = np.asarray(model.params)
    column_count = X.shape[1]
    if len(coefficients) < column_count:
        raise IndexError(
            "model.params has fewer entries than X has columns"
        )
    # One vectorised multiply instead of a Python loop over the columns.
    weighted_totals = X.mul(coefficients[:column_count], axis="columns").sum()
    shares = weighted_totals / weighted_totals.sum() * 100
    return np.round(shares.sort_values(ascending=False), 2)
# Modelling dataset, read from a CSV in the working directory.
transformed_data = pd.read_csv("transformed_data.csv")
# Maps each response column of `transformed_data` (the dictionary key) to the
# list of feature columns used to model it.
# NOTE(review): the "_lagN" / "_adst.N" suffixes presumably denote lag and
# adstock transformations applied upstream - confirm.
# hard coded for now, need to get features set from model
feature_set_dct = {
    "app_installs_-_appsflyer": [
        "paid_search_clicks",
        "fb:_level_achieved_-_tier_1_impressions_lag2",
        "fb:_level_achieved_-_tier_2_clicks_lag2",
        "paid_social_others_impressions_adst.1",
        "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag2",
        "digital_tactic_others_clicks",
        "kwai_clicks_adst.3",
        "programmaticclicks",
        "indicacao_clicks_adst.1",
        "infleux_clicks_adst.4",
        "influencer_clicks",
    ],
    "account_requests_-_appsflyer": [
        "paid_search_impressions",
        "fb:_level_achieved_-_tier_1_clicks_adst.1",
        "fb:_level_achieved_-_tier_2_clicks_adst.1",
        "paid_social_others_clicks_lag2",
        "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag5_adst.1",
        "digital_tactic_others_clicks_adst.1",
        "kwai_clicks_adst.2",
        "programmaticimpressions_lag4_adst.1",
        "indicacao_clicks",
        "infleux_clicks_adst.2",
        "influencer_clicks",
    ],
    "total_approved_accounts_-_appsflyer": [
        "paid_search_clicks",
        "fb:_level_achieved_-_tier_1_impressions_lag2_adst.1",
        "fb:_level_achieved_-_tier_2_impressions_lag2",
        "paid_social_others_clicks_lag2_adst.2",
        "ga_app:_will_and_cid_pequena_baixo_risco_impressions_lag4",
        "digital_tactic_others_clicks",
        "kwai_impressions_adst.2",
        "programmaticclicks_adst.5",
        "indicacao_clicks_adst.1",
        "infleux_clicks_adst.3",
        "influencer_clicks",
    ],
    "total_approved_accounts_-_revenue": [
        "paid_search_impressions_adst.5",
        "kwai_impressions_lag2_adst.3",
        "indicacao_clicks_adst.3",
        "infleux_clicks_adst.3",
        "programmaticclicks_adst.4",
        "influencer_clicks_adst.3",
        "fb:_level_achieved_-_tier_1_impressions_adst.2",
        "fb:_level_achieved_-_tier_2_impressions_lag3_adst.5",
        "paid_social_others_impressions_adst.3",
        "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag3_adst.5",
        "digital_tactic_others_clicks_adst.2",
    ],
}
# TODO: fetch each feature set from the saved model instead of hard-coding it above.
def contributions(X, model, target):
    """Return per-channel contribution percentages as a two-column frame.

    Every column of ``X`` is multiplied by the coefficient found at the same
    position in ``model.params``; the column totals are expressed as a
    percentage of the grand total, rounded to two decimals and sorted in
    descending order.

    Args:
        X: DataFrame of model inputs. It is not modified.
        model: Object exposing ``params`` (a pandas Series or any array-like)
            whose order matches the columns of ``X``.
        target: Name of the value column, given either as a string or as a
            one-element list (the form used by ``model_fit``).

    Returns:
        A DataFrame with a "Channel" column and one column named after
        ``target``. Channel names are the feature names cut at the metric
        part ("_clicks", "_impressions", ...), so differently transformed
        features of one channel share the same label.

    Raises:
        IndexError: If ``model.params`` has fewer entries than ``X`` has columns.
    """
    value_column = target if isinstance(target, str) else list(target)[0]

    coefficients = np.asarray(model.params)
    column_count = X.shape[1]
    if len(coefficients) < column_count:
        raise IndexError(
            "model.params has fewer entries than X has columns"
        )
    weighted_totals = X.mul(coefficients[:column_count], axis="columns").sum()
    shares = np.round(
        (weighted_totals / weighted_totals.sum() * 100).sort_values(ascending=False),
        2,
    )

    # Name the index and the values explicitly instead of relying on the
    # Series being unnamed and on reset_index() producing an "index" column.
    contribution_frame = shares.rename_axis("Channel").reset_index(name=value_column)

    # "_imp" / "_cli" alone miss features written without an underscore before
    # the metric (e.g. "programmaticclicks_adst.5"); those kept their full,
    # model-specific name and were silently dropped when the per-model frames
    # are inner-joined on "Channel". Also cut at a bare "clicks"/"impressions"
    # word that ends the name or is followed by "_".
    metric_suffix = re.compile(r"_imp|_cli|impressions(?=_|$)|clicks(?=_|$)")
    contribution_frame["Channel"] = [
        metric_suffix.split(feature)[0] for feature in contribution_frame["Channel"]
    ]
    return contribution_frame
def model_fit(features_set, target, train_size=150):
    """Fit an OLS model for one response column and summarise the fit.

    The feature columns of ``transformed_data`` are min-max scaled, a
    constant column is added, and the first ``train_size`` rows are used for
    fitting; the remaining rows form the hold-out set.

    NOTE(review): the scaler is fitted on all rows before the split, so the
    hold-out rows influence the scaling. The model-diagnostics section of this
    page rescales the data the same way; change both together if this is
    revisited.

    Args:
        features_set: List of feature column names in ``transformed_data``.
        target: Name of the response column in ``transformed_data``.
        train_size: Number of leading rows used for training. Defaults to
            150, the split used throughout this page.

    Returns:
        A ``(metrics, train_contributions)`` tuple. ``metrics`` is a one-row
        DataFrame with the columns "Model", "R2", "ADJr2", "Train Mape",
        "Test Mape", "Summary" and "Model_object"; ``train_contributions`` is
        the frame returned by ``contributions`` for the training rows.
    """
    features = transformed_data[features_set]
    response = transformed_data[target]

    scaler = MinMaxScaler()
    scaled = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)
    design = sm.add_constant(scaled)

    X_train, X_test = design.iloc[:train_size], design.iloc[train_size:]
    y_train, y_test = response.iloc[:train_size], response.iloc[train_size:]

    model = sm.OLS(y_train, X_train).fit()
    train_mape = mean_absolute_percentage_error(y_train, model.predict(X_train))
    # With no hold-out rows the metric is undefined; report NaN rather than
    # letting the scorer raise on empty input.
    if len(y_test) > 0:
        test_mape = mean_absolute_percentage_error(y_test, model.predict(X_test))
    else:
        test_mape = float("nan")

    train_contributions = contributions(X_train, model, [target])
    metrics = pd.DataFrame(
        {
            "Model": target,
            "R2": np.round(model.rsquared, 2),
            "ADJr2": np.round(model.rsquared_adj, 2),
            "Train Mape": np.round(train_mape, 2),
            "Test Mape": np.round(test_mape, 2),
            "Summary": model.summary(),
            "Model_object": model,
        },
        index=[0],
    )
    return metrics, train_contributions
# Fit every configured model exactly once and collect both of its outputs.
# (Each model used to be refitted for the metrics and again for the
# contributions.)
metrics_frames = []
contribution_df = pd.DataFrame()
for target, feature_set in feature_set_dct.items():
    model_metrics, model_contributions = model_fit(
        features_set=feature_set, target=target
    )
    metrics_frames.append(model_metrics)
    if contribution_df.empty:
        contribution_df = model_contributions
    else:
        # Join on the channel label only; channels missing from either side
        # are dropped, as with the previous implicit inner merge.
        contribution_df = pd.merge(
            contribution_df, model_contributions, on="Channel", how="inner"
        )

# The table is rebuilt from scratch on every run instead of being merged into
# whatever an earlier run left in session state, so stale values cannot leak in.
st.session_state["contribution_df"] = contribution_df
metrics_table = (
    pd.concat(metrics_frames, ignore_index=True) if metrics_frames else pd.DataFrame()
)
# Two-column header row; the EDA button lives in the right-hand column.
eda_columns = st.columns(2)
with eda_columns[1]:
    eda = st.button(
        "Generate EDA Report",
        help="Click to generate a bivariate report for the selected response metric from the table below.",
    )

st.title("Contribution Overview")

# Every column except the channel label is one model's contribution series.
model_columns = [
    column
    for column in st.session_state["contribution_df"].columns
    if column.lower() != "channel"
]
contribution_selections = st.multiselect(
    "Select the models to compare contributions",
    model_columns,
    default=model_columns[-1],
)

# One grouped bar trace per selected model, labelled with whole percentages.
trace_data = [
    go.Bar(
        x=st.session_state["contribution_df"]["Channel"],
        y=st.session_state["contribution_df"][selection],
        name=selection,
        text=np.round(st.session_state["contribution_df"][selection], 0)
        .astype(int)
        .astype(str)
        + "%",
        textposition="outside",
    )
    for selection in contribution_selections
]

layout = go.Layout(
    title="Metrics Contribution by Channel",
    xaxis={"title": "Channel Name"},
    yaxis={"title": "Metrics Contribution"},
    barmode="group",
)
fig = go.Figure(data=trace_data, layout=layout)
st.plotly_chart(fig, use_container_width=True)
############################################ Waterfall Chart ############################################ | |
# import plotly.graph_objects as go | |
# # Initialize a Plotly figure | |
# fig = go.Figure() | |
# for selection in contribution_selections: | |
# # Ensure y_values are numeric | |
# y_values = st.session_state["contribution_df"][selection].values.astype(float) | |
# # Generating text labels for each bar, ensuring operations are compatible with string formats | |
# text_values = [f"{val}%" for val in np.round(y_values, 0).astype(int)] | |
# fig.add_trace( | |
# go.Waterfall( | |
# name=selection, | |
# orientation="v", | |
# measure=["relative"] | |
# * len(y_values), # Adjust if you have absolute values at certain points | |
# x=st.session_state["contribution_df"]["Channel"].tolist(), | |
# text=text_values, | |
# textposition="outside", | |
# y=y_values, | |
# increasing={"marker": {"color": "green"}}, | |
# decreasing={"marker": {"color": "red"}}, | |
# totals={"marker": {"color": "blue"}}, | |
# ) | |
# ) | |
# fig.update_layout( | |
# title="Metrics Contribution by Channel", | |
# xaxis={"title": "Channel Name"}, | |
# yaxis={"title": "Metrics Contribution"}, | |
# height=600, | |
# ) | |
# # Displaying the waterfall chart in Streamlit | |
# st.plotly_chart(fig, use_container_width=True) | |
import plotly.graph_objects as go

# Waterfall view of the selected models' contributions: the "const" row is
# shown first as "Base Sales", followed by the channels in table order.
fig = go.Figure()
# The channel labels are the same for every selection; read them once.
channel_names = st.session_state["contribution_df"]["Channel"].tolist()
for selection in contribution_selections:
    # Named `selection_values` rather than `contributions` so the module-level
    # contributions() function is not rebound to a list.
    selection_values = (
        st.session_state["contribution_df"][selection].values.astype(float).tolist()
    )
    display_name, display_contribution, base_contribution = [], [], 0
    for channel_name, contribution in zip(channel_names, selection_values):
        if channel_name != "const":
            display_name.append(channel_name)
            display_contribution.append(contribution)
        else:
            base_contribution = contribution
    display_name = ["Base Sales"] + display_name
    display_contribution = [base_contribution] + display_contribution
    # Whole-percent labels drawn above each bar.
    text_values = [
        f"{val}%" for val in np.round(display_contribution, 0).astype(int)
    ]
    fig.add_trace(
        go.Waterfall(
            # Label the trace so several selected models can be told apart
            # in the legend.
            name=selection,
            orientation="v",
            # Every bar is a relative step; there are no absolute/total bars.
            measure=["relative"] * len(display_contribution),
            x=display_name,
            text=text_values,
            textposition="outside",
            y=display_contribution,
            increasing={"marker": {"color": "green"}},
            decreasing={"marker": {"color": "red"}},
            totals={"marker": {"color": "blue"}},
        )
    )
fig.update_layout(
    title="Metrics Contribution by Channel",
    xaxis={"title": "Channel Name"},
    yaxis={"title": "Metrics Contribution"},
    height=600,
)
# Displaying the waterfall chart in Streamlit
st.plotly_chart(fig, use_container_width=True)
############################################ Waterfall Chart ############################################
st.title("Analysis of Models Result")

# Show the metrics without the last two columns (the summary and the fitted
# model object) in a single-selection grid.
gd_table = metrics_table.iloc[:, :-2]
gd = GridOptionsBuilder.from_dataframe(gd_table)
selection_options = {
    "use_checkbox": True,
    "selection_mode": "single",
    "pre_select_all_rows": False,
    "pre_selected_rows": [1],
}
gd.configure_selection(**selection_options)
gridoptions = gd.build()
table = AgGrid(
    gd_table,
    gridOptions=gridoptions,
    fit_columns_on_grid_load=True,
    height=200,
)

# Nothing selected: tell the user and halt this script run here.
if len(table.selected_rows) == 0:
    st.warning(
        "Click on the checkbox to view comprehensive results of the selected model."
    )
    st.stop()

# A row is selected: resolve the response column and its feature list.
target_column = table.selected_rows[0]["Model"]
feature_set = feature_set_dct[target_column]
with eda_columns[1]:
    if eda:

        def generate_report_with_target(channel_data, target_feature, output_dir=None):
            """Write a Sweetviz report for ``channel_data`` and return its path.

            Args:
                channel_data: DataFrame to analyse.
                target_feature: Column passed to Sweetviz as the target feature.
                output_dir: Directory that receives "report.html". When
                    omitted, a fresh temporary directory is created and the
                    caller is responsible for removing it.

            Returns:
                Path of the generated HTML file.
            """
            report = sv.analyze(
                [channel_data, "Dataset"], target_feat=target_feature, verbose=False
            )
            if output_dir is None:
                output_dir = tempfile.mkdtemp()
            report_path = os.path.join(output_dir, "report.html")
            # Generate the report as an HTML file without opening a browser.
            report.show_html(filepath=report_path, open_browser=False)
            return report_path

        # Select features and target in one step and copy, instead of
        # assigning a new column onto a slice of `transformed_data`.
        report_data = transformed_data[[*feature_set, target_column]].copy()

        # Read the report into memory inside a self-cleaning temporary
        # directory so no directory is left behind on each click.
        report_bytes = None
        with tempfile.TemporaryDirectory() as report_dir:
            report_file = generate_report_with_target(
                report_data, target_column, report_dir
            )
            if os.path.exists(report_file):
                with open(report_file, "rb") as f:
                    report_bytes = f.read()

        if report_bytes is not None:
            st.download_button(
                label="Download EDA Report",
                data=report_bytes,
                file_name="report.html",
                mime="text/html",
            )
        else:
            st.warning("Report generation failed. Unable to find the report file.")
# Fitted model object stored for the selected response column.
model = metrics_table[metrics_table["Model"] == target_column]["Model_object"].iloc[
    0
]
st.header("Model Summary")
st.write(model.summary())

# Rebuild the design matrix the same way model_fit does: min-max scale the
# features over all rows, then add the constant column.
X = transformed_data[feature_set]
ss = MinMaxScaler()
X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
X = sm.add_constant(X)
y = transformed_data[target_column]
dates = transformed_data["date"]

X_train, X_test = X.iloc[:150], X.iloc[150:]
y_train, y_test = y.iloc[:150], y.iloc[150:]
# Slice the dates alongside the data and pass them to the plots directly.
# The date index used to be assigned to X and y only after the train/test
# slices had been taken, so the slices kept their integer index and the
# "Date" axis showed row numbers.
dates_train, dates_test = dates.iloc[:150], dates.iloc[150:]

# Predict once per split and reuse the results below.
predicted_train = model.predict(X_train)
predicted_test = model.predict(X_test)

metrics_table_train, fig_train = plot_actual_vs_predicted(
    dates_train, y_train, predicted_train, model
)
metrics_table_test, fig_test = plot_actual_vs_predicted(
    dates_test, y_test, predicted_test, model
)
metrics_table_train = metrics_table_train.set_index("Metric").transpose()
metrics_table_train.index = ["Train"]
metrics_table_test = metrics_table_test.set_index("Metric").transpose()
metrics_table_test.index = ["test"]
metrics_table = np.round(pd.concat([metrics_table_train, metrics_table_test]), 2)

st.markdown("Result Overview")
st.dataframe(metrics_table, use_container_width=True)
st.subheader("Actual vs Predicted Plot Train")
st.plotly_chart(fig_train, use_container_width=True)
st.subheader("Actual vs Predicted Plot Test")
st.plotly_chart(fig_test, use_container_width=True)

st.markdown("## Residual Analysis")
columns = st.columns(2)
# plot_residual_predicted adds columns to the frame it receives, so give it
# a copy of the training design matrix.
Xtrain1 = X_train.copy()
with columns[0]:
    fig = plot_residual_predicted(y_train, predicted_train, Xtrain1)
    st.plotly_chart(fig)
with columns[1]:
    st.empty()
    fig = qqplot(y_train, predicted_train)
    st.plotly_chart(fig)
with columns[0]:
    fig = residual_distribution(y_train, predicted_train)
    st.pyplot(fig)
# Failed login. The status is compared with False explicitly because it is
# read with .get() earlier in the file and can also be None, which must not
# take this branch.
elif auth_status == False:
    st.error("Username/Password is incorrect")
    try:
        # Render the forgot-password form and read back its three results.
        username_forgot_pw, email_forgot_password, random_password = (
            authenticator.forgot_password("Forgot password")
        )
        if username_forgot_pw:
            st.success("New password sent securely")
            # Random password to be transferred to the user securely
        # Only an explicit False reports an unknown user; any other falsy
        # value (for example None) shows nothing.
        elif username_forgot_pw == False:
            st.error("Username not found")
    except Exception as e:
        # Show whatever the authenticator raised in the page itself.
        st.error(e)