Spaces:

BlendMMM
/

Mastercard

Sleeping

File size: 17,827 Bytes

a660599

import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import streamlit as st
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_percentage_error
import sys
import os
from utilities import (set_header, 
                       load_local_css,
                       load_authenticator)
import seaborn as sns
import matplotlib.pyplot as plt
import sweetviz as sv
import tempfile
from sklearn.preprocessing import MinMaxScaler
from st_aggrid import AgGrid
from st_aggrid import GridOptionsBuilder,GridUpdateMode
from st_aggrid import GridOptionsBuilder
import sys
import re

# Raise the interpreter recursion limit to 10**6 for this script.
sys.setrecursionlimit(10**6)

# Temporarily point sys.stdout at 'temp_stdout.txt', close it straight away and
# restore the original stream. Nothing is printed in between, so the only
# visible effect is that the file is created/truncated.
# NOTE(review): looks like a leftover from silencing a noisy call - confirm
# whether it is still needed.
original_stdout = sys.stdout
sys.stdout = open('temp_stdout.txt', 'w')
sys.stdout.close()
sys.stdout = original_stdout

# Page chrome: wide layout, local stylesheet, shared header.
st.set_page_config(layout='wide')
load_local_css('styles.css')
set_header()

# Re-assign every session-state entry to itself, skipping the 'logout',
# 'login' and 'config' keys and anything starting with 'FormSubmitter'.
# NOTE(review): presumably this keeps widget-backed state alive across page
# switches - confirm against the other pages of the app.
for k, v in st.session_state.items():
    if k not in ['logout', 'login','config'] and not k.startswith('FormSubmitter'):
        st.session_state[k] = v

# Reuse the authenticator stored in session state, or build one on first use.
authenticator = st.session_state.get('authenticator')
if authenticator is None:
    authenticator = load_authenticator()

# The tuple returned by login() is not used below; the page branches on the
# 'authentication_status' value read back from session state instead.
name, authentication_status, username = authenticator.login('Login', 'main')
auth_status = st.session_state.get('authentication_status')

# Everything below (helpers, model fitting and page rendering) only runs when
# the stored authentication status is True.
if auth_status == True:
    # NOTE(review): 'initialized' is read from session state, but the branch
    # only assigns a throwaway variable `a` that is not referenced anywhere
    # else in the visible file - looks like a leftover placeholder.
    is_state_initiaized = st.session_state.get('initialized',False)
    if not is_state_initiaized:
        a=1
      

    def plot_residual_predicted(actual, predicted, df_):
        """Build a scatter plot of standardized residuals against predictions.

        Args:
            actual: Observed target values.
            predicted: Model predictions. They are wrapped in ``pd.Series``
                before the subtraction, so ``actual`` and ``predicted`` are
                aligned by index.
            df_: DataFrame used as the plotting frame. It is copied first, so
                the caller's frame no longer gains the ``Residuals`` and
                ``StdResidual`` helper columns as a side effect.

        Returns:
            A Plotly figure with the standardized residuals on the y-axis, a
            dashed line at 0 and solid red lines at +2 and -2.
        """
        # Work on a private copy instead of mutating the caller's DataFrame.
        frame = df_.copy()
        frame['Residuals'] = actual - pd.Series(predicted)
        frame['StdResidual'] = (frame['Residuals'] - frame['Residuals'].mean()) / frame['Residuals'].std()

        # Create a Plotly scatter plot
        fig = px.scatter(frame, x=predicted, y='StdResidual', opacity=0.5, color_discrete_sequence=["#11B6BD"])

        # Zero line plus the +/-2 standard-deviation guides
        fig.add_hline(y=0, line_dash="dash", line_color="darkorange")
        fig.add_hline(y=2, line_color="red")
        fig.add_hline(y=-2, line_color="red")

        fig.update_xaxes(title='Predicted')
        fig.update_yaxes(title='Standardized Residuals (Actual - Predicted)')

        # Fixed size so this figure matches the QQ plot shown next to it
        fig.update_layout(title='Residuals over Predicted Values', autosize=False, width=600, height=400)

        return fig

    def residual_distribution(actual, predicted):
        """Plot the distribution of the residuals (actual - predicted).

        Args:
            actual: Observed target values.
            predicted: Model predictions; wrapped in ``pd.Series`` before the
                subtraction, so the two inputs are aligned by index.

        Returns:
            The matplotlib Figure holding the histogram with a KDE overlay.
            An explicit Figure is returned (rather than the ``pyplot`` module)
            so the caller renders exactly this figure and not whatever happens
            to be pyplot's current global figure.
        """
        residuals = actual - pd.Series(predicted)

        sns.set(style="whitegrid")
        fig, ax = plt.subplots(figsize=(6, 4))
        # stat="density" makes the bars match the 'Probability Density' axis
        # label; the default would plot raw counts under that label.
        sns.histplot(residuals, kde=True, color="#11B6BD", stat="density", ax=ax)

        ax.set_title(' Distribution of Residuals')
        ax.set_xlabel('Residuals')
        ax.set_ylabel('Probability Density')

        return fig

    
    def qqplot(actual, predicted):
        """Return a Plotly QQ plot of the standardized residuals.

        The residuals (``actual - predicted``) are centred on their mean and
        divided by their standard deviation, then their sample quantiles are
        plotted against the theoretical quantiles from ``sm.ProbPlot``. A red
        reference line is drawn from (-2, -2) to (2, 2).
        """
        raw_residuals = pd.Series(actual - pd.Series(predicted))
        standardized = (raw_residuals - raw_residuals.mean()) / raw_residuals.std()

        # One ProbPlot object supplies both axes of the scatter.
        prob_plot = sm.ProbPlot(standardized)
        quantile_points = go.Scatter(
            x=prob_plot.theoretical_quantiles,
            y=prob_plot.sample_quantiles,
            mode='markers',
            marker=dict(size=5, color="#11B6BD"),
            name='QQ Plot',
        )

        # 45-degree reference line; the fixed [-2, 2] span may need adjusting
        # to the range of the data.
        reference_line = go.Scatter(
            x=[-2, 2],
            y=[-2, 2],
            mode='lines',
            line=dict(color='red'),
            name=' ',
        )

        fig = go.Figure()
        for trace in (quantile_points, reference_line):
            fig.add_trace(trace)

        fig.update_layout(
            title='QQ Plot of Residuals',
            title_x=0.5,
            autosize=False,
            width=600,
            height=400,
            xaxis_title='Theoretical Quantiles',
            yaxis_title='Sample Quantiles',
        )

        return fig


    def plot_actual_vs_predicted(date, y, predicted_values, model):
        """Plot actual vs. predicted values and compute fit metrics.

        Args:
            date: Values for the x-axis.
            y: Actual target values.
            predicted_values: Model predictions for the same rows.
            model: Fitted model; its ``df_model`` attribute is used as the
                number of predictors in the adjusted R-squared.

        Returns:
            Tuple ``(metrics_table, fig)``: a DataFrame with the MAPE (in
            percent), R-squared and adjusted R-squared, and the Plotly line
            chart whose title shows the MAPE and adjusted R-squared.
        """
        # --- metrics -------------------------------------------------------
        mape = mean_absolute_percentage_error(y, predicted_values)*100

        residual_ss = np.sum((y - predicted_values) ** 2)
        total_ss = np.sum((y - np.mean(y)) ** 2)
        r_squared = 1 - (residual_ss / total_ss)

        n_predictors = model.df_model
        n_samples = len(y)
        penalty = (n_samples - 1) / (n_samples - n_predictors - 1)
        adj_r_squared = 1 - ((1 - r_squared) * penalty)

        metrics_table = pd.DataFrame({
            'Metric': ['MAPE', 'R-squared', 'AdjR-squared'],
            'Value': [mape, r_squared, adj_r_squared],
        })

        # --- figure --------------------------------------------------------
        actual_trace = go.Scatter(x=date, y=y, mode='lines', name='Actual', line=dict(color='blue'))
        predicted_trace = go.Scatter(x=date, y=predicted_values, mode='lines', name='Predicted', line=dict(color='orange'))

        fig = go.Figure()
        fig.add_trace(actual_trace)
        fig.add_trace(predicted_trace)
        fig.update_layout(
            xaxis=dict(title='Date'),
            yaxis=dict(title='Value'),
            title=f'MAPE : {mape:.2f}%, AdjR2: {adj_r_squared:.2f}',
            xaxis_tickangle=-30
        )

        return metrics_table,fig
    def contributions(X, model):
        """Return each column's percentage share of the fitted total.

        Every column of ``X`` is multiplied by the coefficient found at the
        same position in ``model.params``; the column sums are then expressed
        as a percentage of their grand total, sorted in descending order and
        rounded to two decimals. ``X`` itself is not modified.

        Note: a three-argument function of the same name is defined further
        down in this file and replaces this one once it has been executed.
        """
        coefficients = model.params.values
        weighted = X.copy()
        for position, column in enumerate(weighted.columns):
            weighted[column] = weighted[column] * coefficients[position]

        column_totals = weighted.sum()
        shares = column_totals / sum(column_totals) * 100
        return np.round(shares.sort_values(ascending=False), 2)

    # Load the modelling dataset from the working directory. The columns named
    # in feature_set_dct below, the four response metrics and 'date' are all
    # read from this frame later in the script.
    transformed_data=pd.read_csv('transformed_data.csv')

    # Maps each response metric (the dict key, also used as the model name) to
    # the list of feature columns its model is fitted on.
    # Hard-coded for now; the feature sets should eventually be read from the
    # saved model instead.

    feature_set_dct={'app_installs_-_appsflyer':['paid_search_clicks',
                                            'fb:_level_achieved_-_tier_1_impressions_lag2',
                                            'fb:_level_achieved_-_tier_2_clicks_lag2',
                                            'paid_social_others_impressions_adst.1',
                                            'ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag2',
                                            'digital_tactic_others_clicks',
                                            'kwai_clicks_adst.3',
                                            'programmaticclicks',
                                            'indicacao_clicks_adst.1',
                                            'infleux_clicks_adst.4',
                                            'influencer_clicks'],
                                            
                'account_requests_-_appsflyer':['paid_search_impressions',
                                                'fb:_level_achieved_-_tier_1_clicks_adst.1',
                                                'fb:_level_achieved_-_tier_2_clicks_adst.1',
                                                'paid_social_others_clicks_lag2',
                                                'ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag5_adst.1',
                                                'digital_tactic_others_clicks_adst.1',
                                                'kwai_clicks_adst.2',
                                                'programmaticimpressions_lag4_adst.1',
                                                'indicacao_clicks',
                                                'infleux_clicks_adst.2',
                                                'influencer_clicks'],

                'total_approved_accounts_-_appsflyer':['paid_search_clicks',
                                                        'fb:_level_achieved_-_tier_1_impressions_lag2_adst.1',
                                                        'fb:_level_achieved_-_tier_2_impressions_lag2',
                                                        'paid_social_others_clicks_lag2_adst.2',
                                                        'ga_app:_will_and_cid_pequena_baixo_risco_impressions_lag4',
                                                        'digital_tactic_others_clicks',
                                                        'kwai_impressions_adst.2',
                                                        'programmaticclicks_adst.5',
                                                        'indicacao_clicks_adst.1',
                                                        'infleux_clicks_adst.3',
                                                        'influencer_clicks'],

                'total_approved_accounts_-_revenue':['paid_search_impressions_adst.5',
                                                    'kwai_impressions_lag2_adst.3',
                                                    'indicacao_clicks_adst.3',
                                                    'infleux_clicks_adst.3',
                                                    'programmaticclicks_adst.4',
                                                    'influencer_clicks_adst.3',
                                                    'fb:_level_achieved_-_tier_1_impressions_adst.2',
                                                    'fb:_level_achieved_-_tier_2_impressions_lag3_adst.5',
                                                    'paid_social_others_impressions_adst.3',
                                                    'ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag3_adst.5',
                                                    'digital_tactic_others_clicks_adst.2']                       
                                            
                }

    # TODO: replace the hard-coded mapping above by fetching the feature sets
    # from the saved model.



    def contributions(X, model,target):
        """Return per-channel contribution percentages as a DataFrame.

        Every column of ``X`` is multiplied by the coefficient found at the
        same position in ``model.params``. The column sums are expressed as a
        percentage of their grand total, sorted in descending order and
        rounded to two decimals.

        Args:
            X: Design matrix the model was fitted on (not modified).
            model: Fitted model exposing ``params``.
            target: One-element list with the name to give the value column.

        Returns:
            DataFrame with a ``Channel`` column and one value column named
            after ``target``.
        """
        weighted = X.copy()
        for j, col in enumerate(weighted.columns):
            weighted[col] = weighted[col] * model.params.values[j]

        totals = weighted.sum()
        shares = np.round((totals / sum(totals) * 100).sort_values(ascending=False), 2)
        contribution_df = pd.DataFrame(shares, columns=target).reset_index().rename(columns={'index': 'Channel'})

        # Reduce a feature name to its channel by cutting at the metric part.
        # The original '_imp' / '_cli' markers are kept; 'impressions' and
        # 'clicks' are added for features that have no underscore before the
        # metric (e.g. 'programmaticclicks_adst.5'). Without them such features
        # keep their full name, get a different label in every model and are
        # lost when the per-model tables are merged on 'Channel'.
        contribution_df['Channel'] = [
            re.split(r'_imp|_cli|impressions|clicks', name)[0]
            for name in contribution_df['Channel']
        ]

        return contribution_df
    

    def model_fit(features_set, target, train_size=150):
        """Fit an OLS model for ``target`` on min-max scaled features.

        Args:
            features_set: List of feature column names in ``transformed_data``.
            target: Name of the response column in ``transformed_data``.
            train_size: Number of leading rows used for training; the
                remaining rows form the hold-out set. Defaults to 150, the
                split that was previously hard-coded here. Code that slices
                the data itself before calling ``model.predict`` must use the
                same value.

        Returns:
            Tuple ``(metrics, train_contributions)``: a one-row DataFrame with
            the model name, R2, adjusted R2, train/test MAPE, the summary and
            the fitted model object, and the contribution table computed on
            the training rows.
        """
        raw_features = transformed_data[features_set]
        y = transformed_data[target]

        # NOTE(review): the scaler is fitted on all rows, hold-out included,
        # before the split. The page code that re-scales the data before
        # calling model.predict does the same, so the two must be changed
        # together if this is ever tightened to train-only scaling.
        scaler = MinMaxScaler()
        X = pd.DataFrame(scaler.fit_transform(raw_features), columns=raw_features.columns)
        X = sm.add_constant(X)

        X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
        y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

        model = sm.OLS(y_train, X_train).fit()

        train_mape = mean_absolute_percentage_error(y_train, model.predict(X_train))
        test_mape = mean_absolute_percentage_error(y_test, model.predict(X_test))
        train_contributions = contributions(X_train, model, [target])

        metrics = pd.DataFrame(
            {
                'Model': target,
                'R2': np.round(model.rsquared, 2),
                'ADJr2': np.round(model.rsquared_adj, 2),
                'Train Mape': np.round(train_mape, 2),
                'Test Mape': np.round(test_mape, 2),
                'Summary': model.summary(),
                'Model_object': model,
            },
            index=[0],
        )
        return metrics, train_contributions

    # Fit each model exactly once per script run. The previous loop called
    # model_fit() up to three times per target (once for the metrics row and
    # again for the contribution table), refitting the same regression.
    metric_frames = []
    contribution_df = pd.DataFrame()

    for target, feature_set in feature_set_dct.items():
        model_metrics, model_contributions = model_fit(features_set=feature_set, target=target)
        metric_frames.append(model_metrics)

        if contribution_df.empty:
            contribution_df = model_contributions
        else:
            # Default merge: inner join on the shared column(s), i.e. 'Channel'.
            contribution_df = pd.merge(contribution_df, model_contributions)

    # Rebuild the stored frame on every run instead of merging new results
    # into whatever a previous run left in session state.
    st.session_state["contribution_df"] = contribution_df

    # One row per model; the last two columns hold the summary and the fitted
    # model object.
    metrics_table = pd.concat(metric_frames) if metric_frames else pd.DataFrame()
    metrics_table.reset_index(drop=True, inplace=True)



    
    



    # Right-hand column hosts the EDA button; the report itself is generated
    # further down, once a model has been selected in the grid.
    eda_columns = st.columns(2)
    with eda_columns[1]:
        eda = st.button('Generate EDA Report', help="Click to generate a bivariate report for the selected response metric from the table below.")

    st.title('Contribution Overview')

    contribution_df = st.session_state['contribution_df']

    # Every column except 'Channel' is one model's contribution series.
    model_options = [col for col in contribution_df.columns if col.lower() != 'channel']

    # Pre-select the last model. Slicing (instead of indexing with [-1]) keeps
    # the page alive when there are no model columns at all.
    contribution_selections = st.multiselect(
        'Select the models to compare contributions',
        model_options,
        default=model_options[-1:],
    )

    # One grouped bar trace per selected model, labelled with whole percents.
    trace_data = [
        go.Bar(
            x=contribution_df['Channel'],
            y=contribution_df[selection],
            name=selection,
            text=np.round(contribution_df[selection], 0).astype(int).astype(str) + '%',
            textposition='outside',
        )
        for selection in contribution_selections
    ]

    layout = go.Layout(
        title='Metrics Contribution by Channel',
        xaxis=dict(title='Channel Name'),
        yaxis=dict(title='Metrics Contribution'),
        barmode='group',
    )
    fig = go.Figure(data=trace_data, layout=layout)
    st.plotly_chart(fig, use_container_width=True)

    st.title('Analysis of Models Result')

    # Show the metrics without the last two columns (the summary and the
    # fitted model object) in a grid with checkbox row selection.
    gd_table = metrics_table.iloc[:, :-2]
    grid_builder = GridOptionsBuilder.from_dataframe(gd_table)
    grid_builder.configure_selection(use_checkbox=True)

    table = AgGrid(
        gd_table,
        gridOptions=grid_builder.build(),
        fit_columns_on_grid_load=True,
        height=200,
    )

    # Without a selected row there is nothing to detail: prompt and halt.
    if len(table.selected_rows) == 0:
        st.warning("Click on the checkbox to view comprehensive results of the selected model.")
        st.stop()

    # The first selected row decides which model the rest of the page shows.
    target_column = table.selected_rows[0]['Model']
    feature_set = feature_set_dct[target_column]

    with eda_columns[1]:
        if eda:
            def generate_report_with_target(channel_data, target_feature):
                """Build the report for ``channel_data`` and return it as bytes.

                The HTML file is written into a temporary directory that is
                removed again as soon as its content has been read, so no
                directories pile up on disk. Returns ``None`` when the report
                file was not produced.
                """
                report = sv.analyze([channel_data, "Dataset"], target_feat=target_feature,verbose=False)
                with tempfile.TemporaryDirectory() as temp_dir:
                    report_path = os.path.join(temp_dir, "report.html")
                    report.show_html(filepath=report_path, open_browser=False)  # Generate the report as an HTML file
                    if not os.path.exists(report_path):
                        return None
                    with open(report_path, 'rb') as f:
                        return f.read()

            # Features of the selected model plus its response metric. Taking
            # an explicit copy avoids assigning a column onto a slice of
            # transformed_data.
            report_data = transformed_data[feature_set + [target_column]].copy()
            report_bytes = generate_report_with_target(report_data, target_column)

            if report_bytes is not None:
                st.download_button(
                    label="Download EDA Report",
                    data=report_bytes,
                    file_name="report.html",
                    mime="text/html"
                )
            else:
                st.warning("Report generation failed. Unable to find the report file.")



    # Fitted model object of the selected response metric.
    model = metrics_table[metrics_table['Model'] == target_column]['Model_object'].iloc[0]
    st.header('Model Summary')
    st.write(model.summary())

    # Rebuild the design matrix the same way model_fit() does: min-max scale
    # the selected features over all rows, add the constant, then split at
    # row 150.
    X = transformed_data[feature_set]
    ss = MinMaxScaler()
    X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
    X = sm.add_constant(X)
    y = transformed_data[target_column]
    dates = transformed_data['date']

    X_train, X_test = X.iloc[:150], X.iloc[150:]
    y_train, y_test = y.iloc[:150], y.iloc[150:]
    # The date index used to be assigned to X and y only after the slices
    # above had been taken, so the slices kept their positional index and the
    # 'Date' axis showed row numbers. Pass the matching date slices instead.
    train_dates, test_dates = dates.iloc[:150], dates.iloc[150:]

    # Predict once per split and reuse the result for every chart below.
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    metrics_table_train, fig_train = plot_actual_vs_predicted(train_dates, y_train, train_predictions, model)
    metrics_table_test, fig_test = plot_actual_vs_predicted(test_dates, y_test, test_predictions, model)

    # One row per split, one column per metric.
    metrics_table_train = metrics_table_train.set_index('Metric').transpose()
    metrics_table_train.index = ['Train']
    metrics_table_test = metrics_table_test.set_index('Metric').transpose()
    metrics_table_test.index = ['test']
    metrics_table = np.round(pd.concat([metrics_table_train, metrics_table_test]), 2)

    st.markdown('Result Overview')
    st.dataframe(metrics_table, use_container_width=True)

    st.subheader('Actual vs Predicted Plot Train')
    st.plotly_chart(fig_train, use_container_width=True)
    st.subheader('Actual vs Predicted Plot Test')
    st.plotly_chart(fig_test, use_container_width=True)

    st.markdown('## Residual Analysis')
    columns = st.columns(2)

    # The residual plot receives its own copy of the training frame.
    Xtrain1 = X_train.copy()
    with columns[0]:
        fig = plot_residual_predicted(y_train, train_predictions, Xtrain1)
        st.plotly_chart(fig)

    with columns[1]:
        st.empty()
        fig = qqplot(y_train, train_predictions)
        st.plotly_chart(fig)

    with columns[0]:
        fig = residual_distribution(y_train, train_predictions)
        st.pyplot(fig)



# Login was attempted and rejected: show the error and offer a password reset.
elif auth_status == False:
    st.error('Username/Password is incorrect')
    try:
        # forgot_password() returns the username, e-mail and a random password.
        username_forgot_pw, email_forgot_password, random_password = authenticator.forgot_password('Forgot password')
        if username_forgot_pw:
            st.success('New password sent securely')
            # Random password to be transferred to the user securely
            # NOTE(review): nothing in this branch actually delivers
            # random_password to the user; only the success message is shown.
        elif username_forgot_pw == False:
            st.error('Username not found')
    # Any failure raised by the reset widget is shown on the page instead of
    # crashing the script.
    except Exception as e:
        st.error(e)