Spaces:

BlendMMM
/

Mastercard

Sleeping

File size: 11,662 Bytes

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from Eda_functions import *
import numpy as np
import pickle
from streamlit_pandas_profiling import st_profile_report
import streamlit as st
import streamlit.components.v1 as components
import sweetviz as sv
from utilities import set_header,load_local_css
from st_aggrid import GridOptionsBuilder,GridUpdateMode
from st_aggrid import GridOptionsBuilder
from st_aggrid import AgGrid
import base64
import os
import tempfile
#from ydata_profiling import ProfileReport
import re

# Browser-tab and layout settings for this page, followed by the project's
# stylesheet and shared header (both helpers are imported from `utilities`).
_PAGE_SETTINGS = {
    "page_title": "Data Validation",
    "page_icon": ":shark:",
    "layout": "wide",
    "initial_sidebar_state": 'collapsed',
}
st.set_page_config(**_PAGE_SETTINGS)

load_local_css('styles.css')
set_header()





# Load the artefact written by the data-import step.
# SECURITY NOTE(review): pickle.load can execute arbitrary code embedded in the
# file, so 'data_import.pkl' must only ever come from a trusted source.
with open('data_import.pkl', 'rb') as f:
    data = pickle.load(f)

# Refreshed on every rerun so all sections below read the same inputs.
st.session_state['cleaned_data'] = data['final_df']
st.session_state['category_dict'] = data['bin_dict']

st.title('Data Validation and Insights')

_category_groups = st.session_state['category_dict']

# List of column lists (one entry when the 'Response Metrics' group exists,
# none otherwise); unpacked below to feed the selectbox.
target_variables = [
    columns for group, columns in _category_groups.items() if group == 'Response Metrics'
]

target_column = st.selectbox(
    'Select the Target Feature/Dependent Variable (will be used in all charts as reference)',
    list(*target_variables),
)
if target_column is None:
    # Every chart below is drawn against the target, so there is nothing
    # meaningful to render without one.
    st.error('No response metric is available in the imported data.')
    st.stop()
st.session_state['target_column'] = target_column

# Name of the column holding the first panel level.
panels = _category_groups['Panel Level 1'][0]
selected_panels = st.multiselect(
    'Please choose the panels you wish to analyze.If no panels are selected, insights will be derived from the overall data.',
    st.session_state['cleaned_data'][panels].unique(),
)

# Columns that are grouping keys rather than measures. 'Panel_1' is kept for
# backward compatibility; `panels` covers data whose panel column is named
# differently.
_non_aggregated_columns = {'date', 'Panel_1', panels}

# Media columns are summed per date; every other categorised column is averaged.
aggregation_dict = {
    item: 'sum' if group == 'Media' else 'mean'
    for group, columns in _category_groups.items()
    for item in columns
    if item not in _non_aggregated_columns
}

#st.write(st.session_state['cleaned_data'])

with st.expander('**Reponse Metric Analysis**'):

    # Restrict to the chosen panels (if any), then collapse to one row per date
    # using the per-column aggregation rules built above.
    _panel_source = st.session_state['cleaned_data']
    if len(selected_panels) > 0:
        # Filter on the same column the multiselect options were read from.
        _panel_source = _panel_source[_panel_source[panels].isin(selected_panels)]

    st.session_state['Cleaned_data_panel'] = (
        _panel_source.groupby(by='date').agg(aggregation_dict).reset_index()
    )

    fig = line_plot_target(st.session_state['Cleaned_data_panel'], target=target_column, title=f'{target_column} Over Time')
    st.plotly_chart(fig, use_container_width=True)

    media_channel = [
        column
        for group, columns in st.session_state['category_dict'].items()
        if group == 'Media'
        for column in columns
    ]
    # st.write(media_channel)

    # Flatten both groups into one list. Unpacking the groups into list(...)
    # fails with a TypeError as soon as 'Exogenous' and 'Internal' both exist.
    Non_media_variables = [
        column
        for group, columns in st.session_state['category_dict'].items()
        if group in ('Exogenous', 'Internal')
        for column in columns
    ]

    st.markdown('### Annual Data Summary')
    st.dataframe(
        summary(st.session_state['Cleaned_data_panel'], media_channel + [target_column], spends=None, Target=True),
        use_container_width=True,
    )

    if st.checkbox('Show raw data'):
        _panel_data = st.session_state['Cleaned_data_panel']
        # Dates rendered as MM/DD/YYYY next to the formatted numeric columns.
        _formatted_dates = pd.to_datetime(_panel_data['date']).dt.strftime('%m/%d/%Y')
        _formatted_numbers = _panel_data.select_dtypes(np.number).applymap(format_numbers)
        st.write(pd.concat([_formatted_dates, _formatted_numbers], axis=1))
col1 = st.columns(1)

# Seed the slot used by the media selectbox on the first run only; an existing
# value from an earlier rerun is left untouched.
st.session_state.setdefault('selected_feature', None)


def generate_report_with_target(channel_data, target_feature):
    """Build a Sweetviz report for ``channel_data`` and return its HTML path.

    Args:
        channel_data: Dataset handed to ``sv.analyze`` under the label "Dataset".
        target_feature: Column name passed to Sweetviz as ``target_feat``.

    Returns:
        Path of ``report.html`` inside a newly created temporary directory.
        The directory is not removed by this function.
    """
    analysis = sv.analyze([channel_data, "Dataset"], target_feat=target_feature)
    html_path = os.path.join(tempfile.mkdtemp(), "report.html")
    # Write the report to disk only; do not open a browser window.
    analysis.show_html(filepath=html_path, open_browser=False)
    return html_path

def generate_profile_report(df):
    """Write the profiling report of ``df`` to a temporary HTML file.

    Args:
        df: Object exposing ``profile_report()`` (presumably a DataFrame with
            the profiling accessor registered; verify against the imports).

    Returns:
        Path of ``report.html`` inside a newly created temporary directory.
        The directory is not removed by this function.
    """
    profile = df.profile_report()
    html_path = os.path.join(tempfile.mkdtemp(), "report.html")
    profile.to_file(html_path)
    return html_path


#st.header()
def _offer_report_download(report_file, download_name):
    """Offer a generated HTML report for download, then delete it from disk.

    Args:
        report_file: Path of the HTML report produced by one of the
            ``generate_*`` helpers.
        download_name: File name suggested to the browser for the download.
    """
    if not os.path.exists(report_file):
        st.warning("Report generation failed. Unable to find the report file.")
        return

    with open(report_file, 'rb') as f:
        report_bytes = f.read()

    st.success('Report Generated')
    st.download_button(
        label="Download EDA Report",
        data=report_bytes,
        file_name=download_name,
        mime="text/html"
    )

    # The report content has been handed to the download button as bytes, so
    # the on-disk copy is no longer needed. Best-effort cleanup: os.rmdir only
    # succeeds on an empty directory, and any failure merely leaves the
    # temporary files behind, as before.
    try:
        os.remove(report_file)
        os.rmdir(os.path.dirname(report_file))
    except OSError:
        pass


with st.expander('Univariate and Bivariate Report'):
    eda_columns = st.columns(2)

    # Left column: univariate profiling report of the aggregated panel data.
    with eda_columns[0]:
        if st.button('Generate Profile Report', help='Univariate report which inlcudes all statistical analysis'):
            with st.spinner('Generating Report'):
                report_file = generate_profile_report(st.session_state['Cleaned_data_panel'])
                _offer_report_download(report_file, "pandas_profiling_report.html")

    # Right column: Sweetviz report against the selected response metric.
    with eda_columns[1]:
        if st.button('Generate Sweetviz Report', help='Bivariate report for selected response metric'):
            with st.spinner('Generating Report'):
                report_file = generate_report_with_target(st.session_state['Cleaned_data_panel'], target_column)
                _offer_report_download(report_file, "report.html")
            


#st.warning('Work in Progress')
def _mark_validated(variables):
    """Record ``variables`` as validated, skipping names already recorded.

    Streamlit reruns the script on every interaction, so a plain ``extend``
    would keep appending the same names to the session list.
    """
    validated = st.session_state['validation']
    for variable in variables:
        if variable not in validated:
            validated.append(variable)


with st.expander('Media Variables Analysis'):
    # Spend/cost columns are not offered here; they are matched to the chosen
    # media variable below.
    st.session_state["selected_feature"] = st.selectbox(
        'Select media',
        [col for col in media_channel if 'cost' not in col.lower() and 'spend' not in col.lower()],
    )
    selected_feature = st.session_state["selected_feature"]

    # Spend columns whose prefix (the text before '_cost' / '_spend') occurs in
    # the selected feature name. Both sides are lowercased so mixed-case column
    # names still match; no selection (empty media list) yields no match.
    spends_features = [
        col
        for col in st.session_state['Cleaned_data_panel'].columns
        if any(keyword in col.lower() for keyword in ['cost', 'spend'])
    ]
    spends_feature = []
    if selected_feature is not None:
        selected_key = selected_feature.lower()
        spends_feature = [
            col
            for col in spends_features
            if re.split(r'_cost|_spend', col.lower())[0] in selected_key
        ]

    if 'validation' not in st.session_state:
        st.session_state['validation'] = []

    val_variables = [col for col in media_channel if col != 'date']
    if len(spends_feature) == 0:
        st.warning('No spends varaible available for the selected metric in data')

    else:
        fig_row1 = line_plot(
            st.session_state['Cleaned_data_panel'],
            x_col='date',
            y1_cols=[selected_feature],
            y2_cols=[target_column],
            title=f'Analysis of {selected_feature} and {target_column} Over Time',
        )
        st.plotly_chart(fig_row1, use_container_width=True)
        st.markdown('### Summary')
        # NOTE(review): this summary reads the unfiltered 'cleaned_data', not the
        # panel-filtered 'Cleaned_data_panel' used by the chart above - confirm
        # that this is intended.
        st.dataframe(
            summary(st.session_state['cleaned_data'], [selected_feature], spends=spends_feature[0]),
            use_container_width=True,
        )

        cols2 = st.columns(2)
        with cols2[0]:
            if st.button('Validate'):
                _mark_validated([selected_feature])
        with cols2[1]:
            if st.checkbox('Validate all'):
                _mark_validated(val_variables)
                st.success('All media variables are validated ✅')

        # Show the checklist only while at least one variable is still pending.
        if not set(val_variables).issubset(st.session_state['validation']):
            validation_data = pd.DataFrame({
                'Validate': [col in st.session_state['validation'] for col in val_variables],
                'Variables': val_variables,
            })
            cols3 = st.columns([1, 30])
            with cols3[1]:
                validation_df = st.data_editor(
                    validation_data,
                    column_config={
                        "Validate": st.column_config.CheckboxColumn(
                            default=False,
                            width=100,
                        ),
                        'Variables': st.column_config.TextColumn(
                            width=1000
                        ),
                    },
                    hide_index=True,
                )

                # Explicit comparison kept so missing values in the editor
                # output count as "not ticked".
                selected_rows = validation_df[validation_df['Validate'] == True]['Variables']

                _mark_validated(selected_rows)

                not_validated_variables = [
                    col for col in val_variables if col not in st.session_state["validation"]
                ]
                if not_validated_variables:
                    not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}'
                    st.warning(not_validated_message)



with st.expander('Non Media Variables Analysis'):
    if not Non_media_variables:
        st.warning('No non-media variables are available in the data')
    else:
        # Preselect the second variable as before, falling back to the first
        # when only one exists (an out-of-range index is rejected by Streamlit).
        default_index = min(1, len(Non_media_variables) - 1)
        selected_columns_row4 = st.selectbox('Select Channel', Non_media_variables, index=default_index)

        # Dual-axis line plot of the chosen variable against the target.
        fig_row4 = line_plot(
            st.session_state['Cleaned_data_panel'],
            x_col='date',
            y1_cols=[selected_columns_row4],
            y2_cols=[target_column],
            title=f'Analysis of {selected_columns_row4} and {target_column} Over Time',
        )
        st.plotly_chart(fig_row4, use_container_width=True)
        selected_non_media = selected_columns_row4

        # Yearly totals of the two measures. 'date' is used only to derive the
        # year and is left out of the frame so it is not fed into the sum; the
        # explicit copy avoids assigning into a slice of the session DataFrame.
        panel_data = st.session_state['Cleaned_data_panel']
        sum_df = panel_data[[selected_non_media, target_column]].copy()
        sum_df['Year'] = pd.to_datetime(panel_data['date']).dt.year
        sum_df = sum_df.groupby('Year').agg('sum')
        sum_df.loc['Grand Total'] = sum_df.sum()
        sum_df = sum_df.applymap(format_numbers)
        sum_df.fillna('-', inplace=True)
        sum_df = sum_df.replace({"0.0": '-', 'nan': '-'})
        st.markdown('### Summary')
        st.dataframe(sum_df, use_container_width=True)


with st.expander('Correlation Analysis'):
    options = list(st.session_state['Cleaned_data_panel'].select_dtypes(np.number).columns)

    # The target is always part of the plot, so it is not offered as a choice.
    candidate_variables = [var for var in options if var != target_column]

    # Keep the previous preselection (the fourth numeric column) whenever it is
    # a valid choice. Otherwise fall back to the first candidate, or to nothing.
    # This avoids an IndexError with fewer than four numeric columns and a
    # Streamlit error when the fourth column is the target itself.
    if len(options) > 3 and options[3] != target_column:
        default_selection = [options[3]]
    else:
        default_selection = candidate_variables[:1]

    selected_options = st.multiselect(
        'Select Variables For correlation plot',
        candidate_variables,
        default=default_selection,
    )

    st.pyplot(correlation_plot(st.session_state['Cleaned_data_panel'], selected_options, target_column))