# Spaces:
#
# BlendMMM
# /
#
# Mastercard
#
# Sleeping
#
# File size: 20,964 Bytes
#
# bd80083

import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import streamlit as st
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_percentage_error
import sys
import os
from utilities import set_header, load_local_css, load_authenticator
import seaborn as sns
import matplotlib.pyplot as plt
import sweetviz as sv
import tempfile
from sklearn.preprocessing import MinMaxScaler
from st_aggrid import AgGrid
from st_aggrid import GridOptionsBuilder, GridUpdateMode
from st_aggrid import GridOptionsBuilder
import sys
import re

# Raise the interpreter recursion limit far above the default.
sys.setrecursionlimit(10**6)

# Point sys.stdout at a freshly opened file, close that file immediately, then
# restore the original stream. Nothing is printed in between, so the only
# effect visible here is that "temp_stdout.txt" is created/truncated in the
# current working directory.
original_stdout = sys.stdout
sys.stdout = open("temp_stdout.txt", "w")
sys.stdout.close()
sys.stdout = original_stdout

# Page chrome: wide layout, project stylesheet and shared header.
st.set_page_config(layout="wide")
load_local_css("styles.css")
set_header()

# Write every session-state entry back onto itself, skipping the auth/config
# keys and any form-submitter keys.
# NOTE(review): presumably this keeps widget-backed values alive when
# switching between pages — confirm against the other pages of the app.
for k, v in st.session_state.items():
    if k not in ["logout", "login", "config"] and not k.startswith("FormSubmitter"):
        st.session_state[k] = v

# Reuse the authenticator kept in session state; build one only if none exists.
authenticator = st.session_state.get("authenticator")
if authenticator is None:
    authenticator = load_authenticator()

# Render the login widget. The branches below do not use the returned tuple;
# they read "authentication_status" from session state instead.
name, authentication_status, username = authenticator.login("Login", "main")
auth_status = st.session_state.get("authentication_status")

# Authenticated branch: everything indented under this `if` runs only when the
# stored authentication status compares equal to True.
if auth_status == True:
    is_state_initiaized = st.session_state.get("initialized", False)
    if not is_state_initiaized:
        # Placeholder: no initialisation work is performed here.
        a = 1

    def plot_residual_predicted(actual, predicted, df_):
        """Scatter the standardized residuals against the predicted values.

        Side effect: writes "Residuals" and "StdResidual" columns into ``df_``
        (the caller's frame is modified in place).

        Args:
            actual: Observed values.
            predicted: Model predictions; wrapped in a ``pd.Series`` before
                the subtraction.
            df_: DataFrame that receives the residual columns and is handed
                to Plotly as the data source.

        Returns:
            The Plotly figure, with reference lines at 0 and +/-2.
        """
        df_["Residuals"] = actual - pd.Series(predicted)
        residual_column = df_["Residuals"]
        df_["StdResidual"] = (residual_column - residual_column.mean()) / residual_column.std()

        scatter_fig = px.scatter(
            df_,
            x=predicted,
            y="StdResidual",
            opacity=0.5,
            color_discrete_sequence=["#11B6BD"],
        )

        # Zero line (dashed) plus the +/-2 standard-deviation guides.
        reference_lines = (
            (0, {"line_dash": "dash", "line_color": "darkorange"}),
            (2, {"line_color": "red"}),
            (-2, {"line_color": "red"}),
        )
        for level, line_style in reference_lines:
            scatter_fig.add_hline(y=level, **line_style)

        scatter_fig.update_xaxes(title="Predicted")
        scatter_fig.update_yaxes(title="Standardized Residuals (Actual - Predicted)")

        # Fixed size so this figure lines up with the QQ plot next to it.
        scatter_fig.update_layout(
            title="Residuals over Predicted Values",
            autosize=False,
            width=600,
            height=400,
        )

        return scatter_fig

    def residual_distribution(actual, predicted):
        """Draw a histogram (with KDE overlay) of the raw residuals.

        Args:
            actual: Observed values.
            predicted: Model predictions; wrapped in a ``pd.Series`` before
                the subtraction.

        Returns:
            The ``matplotlib.pyplot`` module itself, whose current figure
            holds the plot.
        """
        residual_values = actual - pd.Series(predicted)

        sns.set(style="whitegrid")
        plt.figure(figsize=(6, 4))

        # histplot draws on the current axes and returns them, so the labels
        # can be set on the returned object.
        axes = sns.histplot(residual_values, kde=True, color="#11B6BD")
        axes.set_title(" Distribution of Residuals")
        axes.set_xlabel("Residuals")
        axes.set_ylabel("Probability Density")

        return plt

    def qqplot(actual, predicted):
        """Build a normal QQ plot of the standardized residuals.

        Args:
            actual: Observed values.
            predicted: Model predictions; wrapped in a ``pd.Series`` before
                the subtraction.

        Returns:
            A Plotly figure with the sample-vs-theoretical quantile markers
            and a red 45-degree reference line.
        """
        residuals = actual - pd.Series(predicted)
        standardized = (residuals - residuals.mean()) / residuals.std()

        # Build the probability plot once; both quantile arrays come from it.
        prob_plot = sm.ProbPlot(standardized)
        theoretical = prob_plot.theoretical_quantiles
        sample = prob_plot.sample_quantiles

        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=theoretical,
                y=sample,
                mode="markers",
                marker=dict(size=5, color="#11B6BD"),
                name="QQ Plot",
            )
        )

        # 45-degree reference line. It always covers [-2, 2] and is stretched
        # when the theoretical quantiles reach further, so it spans the points.
        span = 2.0
        if len(theoretical):
            span = max(span, float(np.max(np.abs(theoretical))))
        fig.add_trace(
            go.Scatter(
                x=[-span, span],
                y=[-span, span],
                mode="lines",
                line=dict(color="red"),
                name=" ",
            )
        )

        fig.update_layout(
            title="QQ Plot of Residuals",
            title_x=0.5,
            autosize=False,
            width=600,
            height=400,
            xaxis_title="Theoretical Quantiles",
            yaxis_title="Sample Quantiles",
        )

        return fig

    def plot_actual_vs_predicted(date, y, predicted_values, model):
        """Plot actual and predicted series and compute fit metrics.

        Args:
            date: Values for the x-axis.
            y: Observed values.
            predicted_values: Model predictions for the same rows.
            model: Fitted model; only ``model.df_model`` is read, as the
                number of predictors for the adjusted R-squared.

        Returns:
            Tuple ``(metrics_table, fig)``: a two-column DataFrame
            ("Metric", "Value") holding MAPE (in percent), R-squared and
            adjusted R-squared, and the Plotly line chart.
        """
        # Fit metrics.
        mape = mean_absolute_percentage_error(y, predicted_values) * 100

        residual_ss = np.sum((y - predicted_values) ** 2)
        total_ss = np.sum((y - np.mean(y)) ** 2)
        r_squared = 1 - (residual_ss / total_ss)

        n_samples = len(y)
        n_predictors = model.df_model
        dof_ratio = (n_samples - 1) / (n_samples - n_predictors - 1)
        adj_r_squared = 1 - ((1 - r_squared) * dof_ratio)

        metrics_table = pd.DataFrame(
            {
                "Metric": ["MAPE", "R-squared", "AdjR-squared"],
                "Value": [mape, r_squared, adj_r_squared],
            }
        )

        # Line chart: actual in blue, predicted in orange.
        actual_trace = go.Scatter(
            x=date, y=y, mode="lines", name="Actual", line=dict(color="blue")
        )
        predicted_trace = go.Scatter(
            x=date,
            y=predicted_values,
            mode="lines",
            name="Predicted",
            line=dict(color="orange"),
        )
        fig = go.Figure()
        fig.add_trace(actual_trace)
        fig.add_trace(predicted_trace)
        fig.update_layout(
            xaxis=dict(title="Date"),
            yaxis=dict(title="Value"),
            title=f"MAPE : {mape:.2f}%, AdjR2: {adj_r_squared:.2f}",
            xaxis_tickangle=-30,
        )

        return metrics_table, fig

    def contributions(X, model):
        """Return each column's percentage share of the weighted column sums.

        Every column of ``X`` is multiplied by the coefficient found at the
        same position in ``model.params``; the column totals are then
        expressed as a percentage of their grand total.

        NOTE: a second ``contributions`` with a different signature is defined
        later in the same scope and replaces this one before it is called.

        Args:
            X: Feature DataFrame (left unmodified; a copy is weighted).
            model: Object exposing ``params`` with a ``.values`` array.

        Returns:
            Series indexed by column name, sorted descending, rounded to 2 dp.
        """
        coefficients = model.params.values
        weighted = X.copy()
        for position, column in enumerate(weighted.columns):
            weighted[column] = weighted[column] * coefficients[position]

        column_totals = weighted.sum()
        percent_share = column_totals / sum(column_totals) * 100
        return np.round(percent_share.sort_values(ascending=False), 2)

    # Load the modelling dataset from the current working directory.
    transformed_data = pd.read_csv("transformed_data.csv")

    # Hard-coded for now; the feature sets should eventually come from the model.
    # Maps each response-metric column name to the list of feature columns its
    # model is fitted on (see the loop that calls model_fit for every item).
    # NOTE(review): the "_lagN" / "_adst.N" suffixes presumably denote lag and
    # adstock transforms applied upstream — confirm with the data-prep step.

    feature_set_dct = {
        "app_installs_-_appsflyer": [
            "paid_search_clicks",
            "fb:_level_achieved_-_tier_1_impressions_lag2",
            "fb:_level_achieved_-_tier_2_clicks_lag2",
            "paid_social_others_impressions_adst.1",
            "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag2",
            "digital_tactic_others_clicks",
            "kwai_clicks_adst.3",
            "programmaticclicks",
            "indicacao_clicks_adst.1",
            "infleux_clicks_adst.4",
            "influencer_clicks",
        ],
        "account_requests_-_appsflyer": [
            "paid_search_impressions",
            "fb:_level_achieved_-_tier_1_clicks_adst.1",
            "fb:_level_achieved_-_tier_2_clicks_adst.1",
            "paid_social_others_clicks_lag2",
            "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag5_adst.1",
            "digital_tactic_others_clicks_adst.1",
            "kwai_clicks_adst.2",
            "programmaticimpressions_lag4_adst.1",
            "indicacao_clicks",
            "infleux_clicks_adst.2",
            "influencer_clicks",
        ],
        "total_approved_accounts_-_appsflyer": [
            "paid_search_clicks",
            "fb:_level_achieved_-_tier_1_impressions_lag2_adst.1",
            "fb:_level_achieved_-_tier_2_impressions_lag2",
            "paid_social_others_clicks_lag2_adst.2",
            "ga_app:_will_and_cid_pequena_baixo_risco_impressions_lag4",
            "digital_tactic_others_clicks",
            "kwai_impressions_adst.2",
            "programmaticclicks_adst.5",
            "indicacao_clicks_adst.1",
            "infleux_clicks_adst.3",
            "influencer_clicks",
        ],
        "total_approved_accounts_-_revenue": [
            "paid_search_impressions_adst.5",
            "kwai_impressions_lag2_adst.3",
            "indicacao_clicks_adst.3",
            "infleux_clicks_adst.3",
            "programmaticclicks_adst.4",
            "influencer_clicks_adst.3",
            "fb:_level_achieved_-_tier_1_impressions_adst.2",
            "fb:_level_achieved_-_tier_2_impressions_lag3_adst.5",
            "paid_social_others_impressions_adst.3",
            "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag3_adst.5",
            "digital_tactic_others_clicks_adst.2",
        ],
    }

    # TODO: replace the hard-coded mapping above by reading the feature sets
    # from the saved model.

    def contributions(X, model, target):
        """Return per-channel percentage contributions as a DataFrame.

        Every column of ``X`` is multiplied by the coefficient found at the
        same position in ``model.params``; the column totals are expressed as
        a percentage of their grand total, sorted descending and rounded to
        two decimals. Feature names are then reduced to a channel label by
        cutting them at the metric part of the name.

        Args:
            X: Feature DataFrame (left unmodified; a copy is weighted).
            model: Object exposing ``params`` with a ``.values`` array whose
                order matches ``X.columns``.
            target: One-element list holding the name of the value column.

        Returns:
            DataFrame with a "Channel" column and one column named after
            ``target``.
        """
        coefficients = model.params.values
        weighted = X.copy()
        for position, column in enumerate(weighted.columns):
            weighted[column] = weighted[column] * coefficients[position]

        column_totals = weighted.sum()
        shares = np.round(
            (column_totals / sum(column_totals) * 100).sort_values(ascending=False), 2
        )
        result = (
            pd.DataFrame(shares, columns=target)
            .reset_index()
            .rename(columns={"index": "Channel"})
        )

        # Cut at the metric suffix. Besides "_imp..." / "_cli...", also match
        # "impressions" / "clicks" glued to the channel without an underscore
        # (e.g. "programmaticclicks"). Otherwise such a channel keeps a
        # different label in every model and cannot be matched on "Channel".
        result["Channel"] = [
            re.split(r"_imp|_cli|impressions|clicks", feature)[0]
            for feature in result["Channel"]
        ]

        return result

    def model_fit(features_set, target):
        """Fit an OLS model for ``target`` and summarise it.

        The selected features are min-max scaled over all rows, a constant
        column is added, and the first 150 rows are used for fitting; the
        remaining rows are used for the test MAPE.

        Args:
            features_set: Column names in ``transformed_data`` to use as
                features.
            target: Column name in ``transformed_data`` to model.

        Returns:
            Tuple ``(metrics_row, train_contributions)``: a one-row DataFrame
            (Model, R2, ADJr2, Train Mape, Test Mape, Summary, Model_object)
            and the contribution table computed on the training rows.
        """
        raw_features = transformed_data[features_set]
        response = transformed_data[target]

        scaler = MinMaxScaler()
        design = pd.DataFrame(
            scaler.fit_transform(raw_features), columns=raw_features.columns
        )
        design = sm.add_constant(design)

        # Positional split: first 150 rows train, the rest test.
        design_train, design_test = design.iloc[:150], design.iloc[150:]
        response_train, response_test = response.iloc[:150], response.iloc[150:]

        fitted = sm.OLS(response_train, design_train).fit()

        train_mape = mean_absolute_percentage_error(
            response_train, fitted.predict(design_train)
        )
        test_mape = mean_absolute_percentage_error(
            response_test, fitted.predict(design_test)
        )
        model_summary = fitted.summary()
        train_contributions = contributions(design_train, fitted, [target])

        metrics_row = pd.DataFrame(
            {
                "Model": target,
                "R2": np.round(fitted.rsquared, 2),
                "ADJr2": np.round(fitted.rsquared_adj, 2),
                "Train Mape": np.round(train_mape, 2),
                "Test Mape": np.round(test_mape, 2),
                "Summary": model_summary,
                "Model_object": fitted,
            },
            index=[0],
        )
        return metrics_row, train_contributions

    # Fit every configured model exactly once per script run and collect:
    #   * one metrics row per model (concatenated into ``metrics_table``)
    #   * the per-channel contributions, merged on "Channel" into a single
    #     frame with one column per model.
    # The contribution frame is rebuilt from scratch on each run, so the
    # result does not depend on whatever an earlier run left in session state.
    metrics_frames = []
    merged_contributions = pd.DataFrame()

    for target, feature_set in feature_set_dct.items():
        model_metrics, model_contributions = model_fit(
            features_set=feature_set, target=target
        )
        metrics_frames.append(model_metrics)
        if merged_contributions.empty:
            merged_contributions = model_contributions
        else:
            # Inner join: only channels present in every model are kept.
            merged_contributions = pd.merge(
                merged_contributions, model_contributions, on="Channel"
            )

    st.session_state["contribution_df"] = merged_contributions

    # pd.concat rejects an empty list, so fall back to an empty frame.
    metrics_table = pd.concat(metrics_frames) if metrics_frames else pd.DataFrame()
    metrics_table.reset_index(drop=True, inplace=True)

    # Right-hand column hosts the EDA button; its click state is read further
    # down, once a model has been selected in the grid.
    eda_columns = st.columns(2)
    with eda_columns[1]:
        eda = st.button(
            "Generate EDA Report",
            help="Click to generate a bivariate report for the selected response metric from the table below.",
        )

    st.title("Contribution Overview")

    # Every column except "Channel" is one model's contribution series.
    contribution_frame = st.session_state["contribution_df"]
    model_columns = [
        col for col in contribution_frame.columns if col.lower() != "channel"
    ]

    contribution_selections = st.multiselect(
        "Select the models to compare contributions",
        model_columns,
        # Pre-select the last model; the slice yields an empty default instead
        # of raising IndexError when there are no model columns.
        default=model_columns[-1:],
    )

    # One bar trace per selected model, labelled with the rounded percentage.
    trace_data = [
        go.Bar(
            x=contribution_frame["Channel"],
            y=contribution_frame[selection],
            name=selection,
            text=np.round(contribution_frame[selection], 0).astype(int).astype(str)
            + "%",
            textposition="outside",
        )
        for selection in contribution_selections
    ]

    layout = go.Layout(
        title="Metrics Contribution by Channel",
        xaxis=dict(title="Channel Name"),
        yaxis=dict(title="Metrics Contribution"),
        barmode="group",
    )
    fig = go.Figure(data=trace_data, layout=layout)
    st.plotly_chart(fig, use_container_width=True)

    ############################################ Waterfall Chart ############################################
    # import plotly.graph_objects as go

    # # Initialize a Plotly figure
    # fig = go.Figure()

    # for selection in contribution_selections:
    #     # Ensure y_values are numeric
    #     y_values = st.session_state["contribution_df"][selection].values.astype(float)

    #     # Generating text labels for each bar, ensuring operations are compatible with string formats
    #     text_values = [f"{val}%" for val in np.round(y_values, 0).astype(int)]

    #     fig.add_trace(
    #         go.Waterfall(
    #             name=selection,
    #             orientation="v",
    #             measure=["relative"]
    #             * len(y_values),  # Adjust if you have absolute values at certain points
    #             x=st.session_state["contribution_df"]["Channel"].tolist(),
    #             text=text_values,
    #             textposition="outside",
    #             y=y_values,
    #             increasing={"marker": {"color": "green"}},
    #             decreasing={"marker": {"color": "red"}},
    #             totals={"marker": {"color": "blue"}},
    #         )
    #     )

    # fig.update_layout(
    #     title="Metrics Contribution by Channel",
    #     xaxis={"title": "Channel Name"},
    #     yaxis={"title": "Metrics Contribution"},
    #     height=600,
    # )

    # # Displaying the waterfall chart in Streamlit
    # st.plotly_chart(fig, use_container_width=True)

    # Waterfall view of the selected models' contributions. The "const" row
    # is shown first as "Base Sales", followed by the channels in table order.
    # Uses the ``go`` alias imported at the top of the file.
    fig = go.Figure()
    channel_names = st.session_state["contribution_df"]["Channel"].tolist()

    for selection in contribution_selections:
        # Local name deliberately differs from the contributions() function so
        # the function is not overwritten at this scope.
        selection_values = (
            st.session_state["contribution_df"][selection].values.astype(float).tolist()
        )

        base_contribution = 0
        display_name, display_contribution = [], []
        for channel_name, value in zip(channel_names, selection_values):
            if channel_name == "const":
                base_contribution = value
            else:
                display_name.append(channel_name)
                display_contribution.append(value)

        display_name = ["Base Sales"] + display_name
        display_contribution = [base_contribution] + display_contribution

        # Bar labels: contribution rounded to a whole percent.
        text_values = [
            f"{val}%" for val in np.round(display_contribution, 0).astype(int)
        ]

        fig.add_trace(
            go.Waterfall(
                name=selection,  # legend entry identifies the model
                orientation="v",
                # Every bar is a relative step; there are no total bars.
                measure=["relative"] * len(display_contribution),
                x=display_name,
                text=text_values,
                textposition="outside",
                y=display_contribution,
                increasing={"marker": {"color": "green"}},
                decreasing={"marker": {"color": "red"}},
                totals={"marker": {"color": "blue"}},
            )
        )

    fig.update_layout(
        title="Metrics Contribution by Channel",
        xaxis={"title": "Channel Name"},
        yaxis={"title": "Metrics Contribution"},
        height=600,
    )

    # Displaying the waterfall chart in Streamlit
    st.plotly_chart(fig, use_container_width=True)

    ############################################ Waterfall Chart ############################################

    st.title("Analysis of Models Result")

    # Show every metrics column except the last two.
    gd_table = metrics_table.iloc[:, :-2]

    # Single-row checkbox selection, with the row at position 1 pre-selected.
    gd = GridOptionsBuilder.from_dataframe(gd_table)
    gd.configure_selection(
        selection_mode="single",
        use_checkbox=True,
        pre_selected_rows=[1],
        pre_select_all_rows=False,
    )
    gridoptions = gd.build()

    table = AgGrid(
        gd_table,
        gridOptions=gridoptions,
        fit_columns_on_grid_load=True,
        height=200,
    )

    # Guard clause: without a selected row there is nothing to analyse, so
    # the script run is halted here.
    if len(table.selected_rows) == 0:
        st.warning(
            "Click on the checkbox to view comprehensive results of the selected model."
        )
        st.stop()

    # A row is selected: resolve its model name and the matching feature list.
    target_column = table.selected_rows[0]["Model"]
    feature_set = feature_set_dct[target_column]

    with eda_columns[1]:
        if eda:

            def generate_report_with_target(channel_data, target_feature):
                """Build the Sweetviz HTML report and return its bytes.

                The report is rendered into a temporary directory that is
                removed again once the file content has been read.

                Returns:
                    The HTML file content as ``bytes``, or ``None`` when no
                    report file was produced.
                """
                report = sv.analyze(
                    [channel_data, "Dataset"], target_feat=target_feature, verbose=False
                )
                with tempfile.TemporaryDirectory() as temp_dir:
                    report_path = os.path.join(temp_dir, "report.html")
                    report.show_html(filepath=report_path, open_browser=False)
                    if not os.path.exists(report_path):
                        return None
                    with open(report_path, "rb") as report_file:
                        return report_file.read()

            # Copy before adding the target column so the assignment acts on
            # an independent frame rather than on a selection of the dataset.
            report_data = transformed_data[feature_set].copy()
            report_data[target_column] = transformed_data[target_column]
            report_bytes = generate_report_with_target(report_data, target_column)

            if report_bytes is not None:
                st.download_button(
                    label="Download EDA Report",
                    data=report_bytes,
                    file_name="report.html",
                    mime="text/html",
                )
            else:
                st.warning("Report generation failed. Unable to find the report file.")

    # Fitted model object for the row selected in the grid.
    model = metrics_table[metrics_table["Model"] == target_column]["Model_object"].iloc[
        0
    ]
    st.header("Model Summary")
    st.write(model.summary())

    # Rebuild the design matrix the same way model_fit does: min-max scaling
    # over all rows, constant column added, first 150 rows used for training.
    train_size = 150
    X = transformed_data[feature_set]
    ss = MinMaxScaler()
    X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
    X = sm.add_constant(X)
    y = transformed_data[target_column]

    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

    # Slice the dates with the same positions as the data. Assigning the date
    # index to X / y after the split leaves the already-created slices on their
    # integer index, so the "Date" axis would show row numbers instead.
    dates = transformed_data["date"]
    dates_train, dates_test = dates.iloc[:train_size], dates.iloc[train_size:]

    # Predict once per split and reuse the result for every chart below.
    predicted_train = model.predict(X_train)
    predicted_test = model.predict(X_test)

    metrics_table_train, fig_train = plot_actual_vs_predicted(
        dates_train, y_train, predicted_train, model
    )
    metrics_table_test, fig_test = plot_actual_vs_predicted(
        dates_test, y_test, predicted_test, model
    )

    # One row per split, one column per metric.
    metrics_table_train = metrics_table_train.set_index("Metric").transpose()
    metrics_table_train.index = ["Train"]
    metrics_table_test = metrics_table_test.set_index("Metric").transpose()
    metrics_table_test.index = ["test"]
    metrics_table = np.round(pd.concat([metrics_table_train, metrics_table_test]), 2)

    st.markdown("Result Overview")
    st.dataframe(metrics_table, use_container_width=True)

    st.subheader("Actual vs Predicted Plot Train")

    st.plotly_chart(fig_train, use_container_width=True)
    st.subheader("Actual vs Predicted Plot Test")
    st.plotly_chart(fig_test, use_container_width=True)

    st.markdown("## Residual Analysis")
    columns = st.columns(2)

    # plot_residual_predicted adds columns to the frame it receives, so it is
    # given a copy of the training matrix.
    Xtrain1 = X_train.copy()
    with columns[0]:
        fig = plot_residual_predicted(y_train, predicted_train, Xtrain1)
        st.plotly_chart(fig)

    with columns[1]:
        st.empty()
        fig = qqplot(y_train, predicted_train)
        st.plotly_chart(fig)

    with columns[0]:
        fig = residual_distribution(y_train, predicted_train)
        st.pyplot(fig)

# Failed-login branch: report the error and offer the forgot-password form.
elif auth_status == False:
    st.error("Username/Password is incorrect")
    try:
        username_forgot_pw, email_forgot_password, random_password = (
            authenticator.forgot_password("Forgot password")
        )
        if username_forgot_pw:
            # NOTE(review): the e-mail address and generated password are
            # unpacked above but not used anywhere in this branch, so no
            # delivery happens here despite the success message — confirm
            # where the password is actually sent.
            st.success("New password sent securely")
            # Random password to be transferred to the user securely
        elif username_forgot_pw == False:
            st.error("Username not found")
    except Exception as e:
        # Any failure of the forgot-password widget is shown in the page.
        st.error(e)