Update pages/1_Data_Validation.py

pages/1_Data_Validation.py   CHANGED   +187 -177
Old version:

@@ -4,18 +4,20 @@ import plotly.express as px
 import plotly.graph_objects as go
 from Eda_functions import *
 import numpy as np
-import re
 import pickle
-#from ydata_profiling import ProfileReport
 from streamlit_pandas_profiling import st_profile_report
 import streamlit as st
 import streamlit.components.v1 as components
 import sweetviz as sv
-from utilities import set_header,
 from st_aggrid import GridOptionsBuilder,GridUpdateMode
 from st_aggrid import GridOptionsBuilder
 from st_aggrid import AgGrid
 import base64

 st.set_page_config(
     page_title="Data Validation",
@@ -28,155 +30,187 @@ set_header()

-#preprocessing
-# with open('Categorised_data.pkl', 'rb') as file:
-# Categorised_data = pickle.load(file)
-# with open("edited_dataframe.pkl", 'rb') as file:
-# df = pickle.load(file)
-# date=df.index
-# df.reset_index(inplace=True)
-# df['Date'] = pd.to_datetime(date)

-#prospects=pd.read_excel('EDA_Data.xlsx',sheet_name='Prospects')
-#spends=pd.read_excel('EDA_Data.xlsx',sheet_name='SPEND INPUT')
-#spends.columns=['Week','Streaming (Spends)','TV (Spends)','Search (Spends)','Digital (Spends)']
-#df=pd.concat([df,spends],axis=1)
-#df['Date'] =pd.to_datetime(df['Date']).dt.strftime('%m/%d/%Y')
-#df['Prospects']=prospects['Prospects']
-#df.drop(['Week'],axis=1,inplace=True)

 st.title('Data Validation and Insights')

-with open("Pickle_files/main_df",'rb') as f:
-    st.session_state['cleaned_data']= pickle.load(f)
-with open("Pickle_files/category_dict",'rb') as c:
-    st.session_state['category_dict']=pickle.load(c)

-# st.write(st.session_state['cleaned_data'])

-target_variables=[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Response_Metric']

 target_column = st.selectbox('Select the Target Feature/Dependent Variable (will be used in all charts as reference)',list(*target_variables))
 st.session_state['target_column']=target_column

-fig=line_plot_target(st.session_state['cleaned_data'], target=target_column, title=f'{target_column} Over Time')
-st.plotly_chart(fig, use_container_width=True)

-media_channel=list(*[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Media'])
-# st.write(media_channel)

-st.markdown('### Annual Data Summary')
-st.dataframe(summary(st.session_state['cleaned_data'], media_channel+[target_column], spends=None,Target=True), use_container_width=True)

-st.
 col1 = st.columns(1)

 if "selected_feature" not in st.session_state:
     st.session_state['selected_feature']=None

-st.header('1. Media Channels')

-if 'Validation' not in st.session_state:
-    st.session_state['Validation']=[]

-eda_columns=st.columns(2)
-with eda_columns[0]:
-    if st.button('Generate Profile Report'):
-        pr = st.session_state['cleaned_data'].profile_report()

 with eda_columns[1]:
-    if st.button('Generate Sweetviz Report'):
-    … (the remaining removed lines of this block are truncated in the source)

 st.plotly_chart(fig_row4, use_container_width=True)
 selected_non_media=selected_columns_row4
-sum_df =
-sum_df['Year']=pd.to_datetime(
 #st.dataframe(df)
 #st.dataframe(sum_df.head(2))
 sum_df=sum_df.groupby('Year').agg('sum')
@@ -184,58 +218,34 @@ else:
 sum_df=sum_df.applymap(format_numbers)
 sum_df.fillna('-',inplace=True)
 sum_df=sum_df.replace({"0.0":'-','nan':'-'})
-st.markdown('###
 st.dataframe(sum_df,use_container_width=True)

-# … (a block of commented-out lines, truncated in the source)

-options =
-st.
-st.markdown('# Exploratory Data Analysis')
-st.markdown(' ')

-selected_options = []
-num_columns = 4
-num_rows = -(-len(options) // num_columns) # Ceiling division to calculate rows

-# Create a grid of checkboxes
-st.header('Select Features for Correlation Plot')
-tick=False
-if st.checkbox('Select all'):
-    tick=True
-selected_options = []
-for row in range(num_rows):
-    cols = st.columns(num_columns)
-    for col in cols:
-        if options:
-            option = options.pop(0)
-            selected = col.checkbox(option,value=tick)
-            if selected:
-                selected_options.append(option)
-# Display selected options
-#st.write('You selected:', selected_options)
-st.pyplot(correlation_plot(df,selected_options,target_column))
New version:

 import plotly.graph_objects as go
 from Eda_functions import *
 import numpy as np
 import pickle
 from streamlit_pandas_profiling import st_profile_report
 import streamlit as st
 import streamlit.components.v1 as components
 import sweetviz as sv
+from utilities import set_header,load_local_css
 from st_aggrid import GridOptionsBuilder,GridUpdateMode
 from st_aggrid import GridOptionsBuilder
 from st_aggrid import AgGrid
 import base64
+import os
+import tempfile
+from ydata_profiling import ProfileReport
+import re

 st.set_page_config(
     page_title="Data Validation",
+with open('data_import.pkl', 'rb') as f:
+    data = pickle.load(f)
+
+st.session_state['cleaned_data']= data['final_df']
+st.session_state['category_dict'] = data['bin_dict']

 st.title('Data Validation and Insights')

+target_variables=[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Response Metrics']

 target_column = st.selectbox('Select the Target Feature/Dependent Variable (will be used in all charts as reference)',list(*target_variables))
 st.session_state['target_column']=target_column
+panels=st.session_state['category_dict']['Panel Level 1'][0]
+selected_panels=st.multiselect('Please choose the panels you wish to analyze. If no panels are selected, insights will be derived from the overall data.',st.session_state['cleaned_data'][panels].unique())
+aggregation_dict = {item: 'sum' if key == 'Media' else 'mean' for key, value in st.session_state['category_dict'].items() for item in value if item not in ['date','Panel_1']}

+with st.expander('**Response Metric Analysis**'):
+
+    if len(selected_panels)>0:
+        st.session_state['Cleaned_data_panel']=st.session_state['cleaned_data'][st.session_state['cleaned_data']['Panel_1'].isin(selected_panels)]
+        st.session_state['Cleaned_data_panel']=st.session_state['Cleaned_data_panel'].groupby(by='date').agg(aggregation_dict)
+        st.session_state['Cleaned_data_panel']=st.session_state['Cleaned_data_panel'].reset_index()
+    else:
+        st.session_state['Cleaned_data_panel']=st.session_state['cleaned_data'].groupby(by='date').agg(aggregation_dict)
+        st.session_state['Cleaned_data_panel']=st.session_state['Cleaned_data_panel'].reset_index()
+
+    fig=line_plot_target(st.session_state['Cleaned_data_panel'], target=target_column, title=f'{target_column} Over Time')
+    st.plotly_chart(fig, use_container_width=True)
+
+    media_channel=list(*[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Media'])
+    # st.write(media_channel)
+
+    Non_media_variables=list(*[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Exogenous' or key=='Internal'])
+
+    st.markdown('### Annual Data Summary')
+    st.dataframe(summary(st.session_state['Cleaned_data_panel'], media_channel+[target_column], spends=None,Target=True), use_container_width=True)
+
+    if st.checkbox('Show raw data'):
+        st.write(pd.concat([pd.to_datetime(st.session_state['Cleaned_data_panel']['date']).dt.strftime('%m/%d/%Y'),st.session_state['Cleaned_data_panel'].select_dtypes(np.number).applymap(format_numbers)],axis=1))
 col1 = st.columns(1)

 if "selected_feature" not in st.session_state:
     st.session_state['selected_feature']=None

+def generate_report_with_target(channel_data, target_feature):
+    report = sv.analyze([channel_data, "Dataset"], target_feat=target_feature)
+    temp_dir = tempfile.mkdtemp()
+    report_path = os.path.join(temp_dir, "report.html")
+    report.show_html(filepath=report_path, open_browser=False)  # Generate the report as an HTML file
+    return report_path
+
+def generate_profile_report(df):
+    pr = df.profile_report()
+    temp_dir = tempfile.mkdtemp()
+    report_path = os.path.join(temp_dir, "report.html")
+    pr.to_file(report_path)
+    return report_path
+
+
+#st.header()
+with st.expander('Univariate and Bivariate Report'):
+    eda_columns=st.columns(2)
+    with eda_columns[0]:
+        if st.button('Generate Profile Report',help='Univariate report which includes all statistical analysis'):
+            with st.spinner('Generating Report'):
+                report_file = generate_profile_report(st.session_state['Cleaned_data_panel'])
+
+            if os.path.exists(report_file):
+                with open(report_file, 'rb') as f:
+                    st.success('Report Generated')
+                    st.download_button(
+                        label="Download EDA Report",
+                        data=f.read(),
+                        file_name="pandas_profiling_report.html",
+                        mime="text/html"
+                    )
+            else:
+                st.warning("Report generation failed. Unable to find the report file.")

     with eda_columns[1]:
+        if st.button('Generate Sweetviz Report',help='Bivariate report for selected response metric'):
+            with st.spinner('Generating Report'):
+                report_file = generate_report_with_target(st.session_state['Cleaned_data_panel'], target_column)
+
+            if os.path.exists(report_file):
+                with open(report_file, 'rb') as f:
+                    st.success('Report Generated')
+                    st.download_button(
+                        label="Download EDA Report",
+                        data=f.read(),
+                        file_name="report.html",
+                        mime="text/html"
+                    )
+            else:
+                st.warning("Report generation failed. Unable to find the report file.")
+
+
+#st.warning('Work in Progress')
+with st.expander('Media Variables Analysis'):
+    # Get the selected feature
+    st.session_state["selected_feature"]= st.selectbox('Select media', [col for col in media_channel if 'cost' not in col.lower() and 'spend' not in col.lower()])
+
+    # Filter spends features based on the selected feature
+    spends_features = [col for col in st.session_state['Cleaned_data_panel'].columns if any(keyword in col.lower() for keyword in ['cost', 'spend'])]
+    spends_feature = [col for col in spends_features if re.split(r'_cost|_spend', col.lower())[0] in st.session_state["selected_feature"]]
+
+    if 'validation' not in st.session_state:
+        st.session_state['validation']=[]
+
+    val_variables=[col for col in media_channel if col!='date']
+    if len(spends_feature)==0:
+        st.warning('No spends variable available for the selected metric in data')
+
+    else:
+        fig_row1 = line_plot(st.session_state['Cleaned_data_panel'], x_col='date', y1_cols=[st.session_state["selected_feature"]], y2_cols=[target_column], title=f'Analysis of {st.session_state["selected_feature"]} and {[target_column][0]} Over Time')
+        st.plotly_chart(fig_row1, use_container_width=True)
+        st.markdown('### Summary')
+        st.dataframe(summary(st.session_state['cleaned_data'],[st.session_state["selected_feature"]],spends=spends_feature[0]),use_container_width=True)
+
+        cols2=st.columns(2)
+        with cols2[0]:
+            if st.button('Validate'):
+                st.session_state['validation'].append(st.session_state["selected_feature"])
+        with cols2[1]:
+            if st.checkbox('Validate all'):
+                st.session_state['validation'].extend(val_variables)
+                st.success('All media variables are validated ✅')
+
+    if len(set(st.session_state['validation']).intersection(val_variables))!=len(val_variables):
+        validation_data=pd.DataFrame({'Validate':[True if col in st.session_state['validation'] else False for col in val_variables],
+                                      'Variables':val_variables
+                                      })
+        cols3=st.columns([1,30])
+        with cols3[1]:
+            validation_df=st.data_editor(validation_data,
+                # column_config={
+                #     'Validate':st.column_config.CheckboxColumn(wi)
+                # },
+                column_config={
+                    "Validate": st.column_config.CheckboxColumn(
+                        default=False,
+                        width=100,
+                    ),
+                    'Variables':st.column_config.TextColumn(
+                        width=1000
+                    )
+                },hide_index=True)
+
+            selected_rows = validation_df[validation_df['Validate']==True]['Variables']
+
+            #st.write(selected_rows)
+
+            st.session_state['validation'].extend(selected_rows)
+
+    not_validated_variables = [col for col in val_variables if col not in st.session_state["validation"]]
+    if not_validated_variables:
+        not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}'
+        st.warning(not_validated_message)
+
+with st.expander('Non Media Variables Analysis'):
+    selected_columns_row4 = st.selectbox('Select Channel',Non_media_variables,index=1)
+    # Create the dual-axis line plot
+    fig_row4 = line_plot(st.session_state['Cleaned_data_panel'], x_col='date', y1_cols=[selected_columns_row4], y2_cols=[target_column], title=f'Analysis of {selected_columns_row4} and {target_column} Over Time')
     st.plotly_chart(fig_row4, use_container_width=True)
     selected_non_media=selected_columns_row4
+    sum_df = st.session_state['Cleaned_data_panel'][['date', selected_non_media,target_column]]
+    sum_df['Year']=pd.to_datetime(st.session_state['Cleaned_data_panel']['date']).dt.year
     #st.dataframe(df)
     #st.dataframe(sum_df.head(2))
     sum_df=sum_df.groupby('Year').agg('sum')

     sum_df=sum_df.applymap(format_numbers)
     sum_df.fillna('-',inplace=True)
     sum_df=sum_df.replace({"0.0":'-','nan':'-'})
+    st.markdown('### Summary')
     st.dataframe(sum_df,use_container_width=True)

+with st.expander('Correlation Analysis'):
+    options = list(st.session_state['Cleaned_data_panel'].select_dtypes(np.number).columns)
+
+    # selected_options = []
+    # num_columns = 4
+    # num_rows = -(-len(options) // num_columns) # Ceiling division to calculate rows
+
+    # # Create a grid of checkboxes
+    # st.header('Select Features for Correlation Plot')
+    # tick=False
+    # if st.checkbox('Select all'):
+    #     tick=True
+    # selected_options = []
+    # for row in range(num_rows):
+    #     cols = st.columns(num_columns)
+    #     for col in cols:
+    #         if options:
+    #             option = options.pop(0)
+    #             selected = col.checkbox(option,value=tick)
+    #             if selected:
+    #                 selected_options.append(option)
+    # # Display selected options
+
+    selected_options=st.multiselect('Select Variables For correlation plot',[var for var in options if var!= target_column],default=options[3])
+
+    st.pyplot(correlation_plot(st.session_state['Cleaned_data_panel'],selected_options,target_column))
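The updated page assumes an upstream step has already written data_import.pkl as a dict with a 'final_df' DataFrame and a 'bin_dict' category mapping; the old page instead read Pickle_files/main_df and Pickle_files/category_dict. Below is a minimal sketch of a pickle that would satisfy the reads in this diff. The column names (tv_impressions, tv_spend, holiday_flag, prospects, Panel_1) are invented for illustration; only the keys 'final_df' and 'bin_dict' and the category names ('Response Metrics', 'Media', 'Exogenous', 'Panel Level 1') come from the code above.

# Hypothetical example of producing data_import.pkl for this page; not part of the commit.
import pickle

import pandas as pd

final_df = pd.DataFrame({
    'date': pd.date_range('2023-01-01', periods=8, freq='W'),
    'Panel_1': ['Panel A'] * 4 + ['Panel B'] * 4,            # panel column, referenced as 'Panel_1' above
    'tv_impressions': [10, 12, 9, 11, 8, 9, 10, 11],         # media activity variable (illustrative)
    'tv_spend': [100, 120, 90, 110, 80, 95, 105, 115],       # matching spend variable ('_spend' suffix)
    'holiday_flag': [0, 0, 1, 0, 0, 1, 0, 0],                # exogenous variable (illustrative)
    'prospects': [200, 210, 190, 205, 180, 195, 200, 210],   # response metric (illustrative)
})

bin_dict = {
    'Response Metrics': ['prospects'],
    'Media': ['tv_impressions', 'tv_spend'],
    'Exogenous': ['holiday_flag'],
    'Panel Level 1': ['Panel_1'],
}

# The page loads this file and splits columns by category, so the two objects
# are stored together under the 'final_df' and 'bin_dict' keys.
with open('data_import.pkl', 'wb') as f:
    pickle.dump({'final_df': final_df, 'bin_dict': bin_dict}, f)

With a file shaped like this, the page's groupby on 'date', the panel filter on 'Panel_1', and the media/spend pairing (matching 'tv_impressions' to 'tv_spend' via the '_spend' suffix) should all resolve against the selectors shown in the diff.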