import streamlit as st import pandas as pd import plotly.express as px import plotly.graph_objects as go from Eda_functions import * import numpy as np import re import pickle from ydata_profiling import ProfileReport from streamlit_pandas_profiling import st_profile_report import streamlit as st import streamlit.components.v1 as components import sweetviz as sv from utilities import set_header,initialize_data,load_local_css from st_aggrid import GridOptionsBuilder,GridUpdateMode from st_aggrid import GridOptionsBuilder from st_aggrid import AgGrid import base64 st.set_page_config( page_title="Data Validation", page_icon=":shark:", layout="wide", initial_sidebar_state='collapsed' ) load_local_css('styles.css') set_header() #preprocessing # with open('Categorised_data.pkl', 'rb') as file: # Categorised_data = pickle.load(file) # with open("edited_dataframe.pkl", 'rb') as file: # df = pickle.load(file) # date=df.index # df.reset_index(inplace=True) # df['date'] = pd.to_datetime(date) #prospects=pd.read_excel('EDA_Data.xlsx',sheet_name='Prospects') #spends=pd.read_excel('EDA_Data.xlsx',sheet_name='SPEND INPUT') #spends.columns=['Week','Streaming (Spends)','TV (Spends)','Search (Spends)','Digital (Spends)'] #df=pd.concat([df,spends],axis=1) #df['date'] =pd.to_datetime(df['date']).dt.strftime('%m/%d/%Y') #df['Prospects']=prospects['Prospects'] #df.drop(['Week'],axis=1,inplace=True) # Deserialize and load the objects from the pickle file # Deserialize and load the objects from the pickle file with open('data_import.pkl', 'rb') as f: data = pickle.load(f) # Accessing the loaded objects st.session_state['cleaned_data']= data['final_df'] st.session_state['category_dict'] = data['bin_dict'] st.title('Data Validation and Insights') # with open("Pickle_files/main_df",'rb') as f: # st.session_state['cleaned_data']= pickle.load(f) # with open("Pickle_files/category_dict",'rb') as c: # st.session_state['category_dict']=pickle.load(c) # st.write(st.session_state['cleaned_data']) target_variables=[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Response Metrics'] target_column = st.selectbox('Select the Target Feature/Dependent Variable (will be used in all charts as reference)',list(*target_variables)) st.session_state['target_column']=target_column fig=line_plot_target(st.session_state['cleaned_data'], target=target_column, title=f'{target_column} Over Time') st.plotly_chart(fig, use_container_width=True) media_channel=list(*[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Media']) # st.write(media_channel) Non_media_channel=[col for col in st.session_state['cleaned_data'].columns if col not in media_channel] st.markdown('### Annual Data Summary') st.dataframe(summary(st.session_state['cleaned_data'], media_channel+[target_column], spends=None,Target=True), use_container_width=True) if st.checkbox('Show raw data'): st.write(pd.concat([pd.to_datetime(st.session_state['cleaned_data']['date']).dt.strftime('%m/%d/%Y'),st.session_state['cleaned_data'].select_dtypes(np.number).applymap(format_numbers)],axis=1)) col1 = st.columns(1) if "selected_feature" not in st.session_state: st.session_state['selected_feature']=None st.header('Univariate and Bivariate Analysis') eda_columns=st.columns(2) with eda_columns[0]: if st.button('Generate Profile Report'): pr = st.session_state['cleaned_data'].profile_report() pr.to_file("Profile_Report.html") with open("Profile_Report.html", "rb") as f: profile_report_html = f.read() b64 = base64.b64encode(profile_report_html).decode() href = f'Download Profile Report' st.markdown(href, unsafe_allow_html=True) with eda_columns[1]: if st.button('Generate Sweetviz Report'): def generate_report_with_target(df, target_feature): report = sv.analyze([df, "Dataset"], target_feat=target_feature) return report report = generate_report_with_target(st.session_state['cleaned_data'], target_feature=target_column) report.show_html() st.warning('Work in Progress') # selected_media = st.selectbox('Select media', np.unique([Categorised_data[col]['VB'] for col in media_channel])) # # selected_feature=st.multiselect('Select Metric', df.columns[df.columns.str.contains(selected_media,case=False)]) # st.session_state["selected_feature"]=st.selectbox('Select Metric',[col for col in media_channel if Categorised_data[col]['VB'] in selected_media ] ) # spends_features=[col for col in df.columns if 'spends' in col.lower() or 'cost' in col.lower()] # spends_feature=[col for col in spends_features if col.split('_')[0] in st.session_state["selected_feature"].split('_')[0]] # #st.write(spends_features) # #st.write(spends_feature) # #st.write(selected_feature) # val_variables=[col for col in media_channel if col!='date'] # if len(spends_feature)==0: # st.warning('No spends varaible available for the selected metric in data') # else: # st.write(f'Selected spends variable {spends_feature[0]} if wrong please name the varaibles properly') # # Create the dual-axis line plot # fig_row1 = line_plot(df, x_col='date', y1_cols=[st.session_state["selected_feature"]], y2_cols=[target_column], title=f'Analysis of {st.session_state["selected_feature"]} and {[target_column][0]} Over Time') # st.plotly_chart(fig_row1, use_container_width=True) # st.markdown('### Annual Data Summary') # st.dataframe(summary(df,[st.session_state["selected_feature"]],spends=spends_feature[0]),use_container_width=True) # if st.button('Validate'): # st.session_state['Validation'].append(st.session_state["selected_feature"]) # if st.checkbox('Validate all'): # st.session_state['Validation'].extend(val_variables) # st.success('All media variables are validated ✅') # if len(set(st.session_state['Validation']).intersection(val_variables))!=len(val_variables): # #st.write(st.session_state['Validation']) # validation_data=pd.DataFrame({'Variables':val_variables, # 'Validated':[1 if col in st.session_state['Validation'] else 0 for col in val_variables], # 'Bucket':[Categorised_data[col]['VB'] for col in val_variables]}) # gd=GridOptionsBuilder.from_dataframe(validation_data) # gd.configure_pagination(enabled=True) # gd.configure_selection(use_checkbox=True,selection_mode='multiple') # #gd.configure_selection_toggle_all(None, show_toggle_all=True) # #gd.configure_columns_auto_size_mode(GridOptionsBuilder.configure_columns) # gridoptions=gd.build() # #st.text(st.session_state['Validation']) # table = AgGrid(validation_data,gridOptions=gridoptions,update_mode=GridUpdateMode.SELECTION_CHANGED,fit_columns_on_grid_load=True) # #st.table(table) # selected_rows = table["selected_rows"] # st.session_state['Validation'].extend([col['Variables'] for col in selected_rows]) # not_validated_variables = [col for col in val_variables if col not in st.session_state["Validation"]] # if not_validated_variables: # not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}' # st.warning(not_validated_message) # st.header('2. Non Media Variables') # selected_columns_row = [col for col in df.columns if ("imp" not in col.lower()) and ('cli' not in col.lower() ) and ('spend' not in col.lower()) and col!='date'] # selected_columns_row4 = st.selectbox('Select Channel',selected_columns_row ) # if not selected_columns_row4: # st.warning('Please select at least one.') # else: # # Create the dual-axis line plot # fig_row4 = line_plot(df, x_col='date', y1_cols=[selected_columns_row4], y2_cols=[target_column], title=f'Analysis of {selected_columns_row4} and {target_column} Over Time') # st.plotly_chart(fig_row4, use_container_width=True) # selected_non_media=selected_columns_row4 # sum_df = df[['date', selected_non_media,target_column]] # sum_df['Year']=pd.to_datetime(df['date']).dt.year # #st.dataframe(df) # #st.dataframe(sum_df.head(2)) # sum_df=sum_df.groupby('Year').agg('sum') # sum_df.loc['Grand Total']=sum_df.sum() # sum_df=sum_df.applymap(format_numbers) # sum_df.fillna('-',inplace=True) # sum_df=sum_df.replace({"0.0":'-','nan':'-'}) # st.markdown('### Annual Data Summary') # st.dataframe(sum_df,use_container_width=True) # # if st.checkbox('Validate',key='2'): # # st.session_state['Validation'].append(selected_columns_row4) # # val_variables=[col for col in media_channel if col!='date'] # # if st.checkbox('Validate all'): # # st.session_state['Validation'].extend(val_variables) # # validation_data=pd.DataFrame({'Variables':val_variables, # # 'Validated':[1 if col in st.session_state['Validation'] else 0 for col in val_variables], # # 'Bucket':[Categorised_data[col]['VB'] for col in val_variables]}) # # gd=GridOptionsBuilder.from_dataframe(validation_data) # # gd.configure_pagination(enabled=True) # # gd.configure_selection(use_checkbox=True,selection_mode='multiple') # # #gd.configure_selection_toggle_all(None, show_toggle_all=True) # # #gd.configure_columns_auto_size_mode(GridOptionsBuilder.configure_columns) # # gridoptions=gd.build() # # #st.text(st.session_state['Validation']) # # table = AgGrid(validation_data,gridOptions=gridoptions,update_mode=GridUpdateMode.SELECTION_CHANGED,fit_columns_on_grid_load=True) # # #st.table(table) # # selected_rows = table["selected_rows"] # # st.session_state['Validation'].extend([col['Variables'] for col in selected_rows]) # # not_validated_variables = [col for col in val_variables if col not in st.session_state["Validation"]] # # if not_validated_variables: # # not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}' # # st.warning(not_validated_message) # options = list(df.select_dtypes(np.number).columns) # st.markdown(' ') # st.markdown(' ') # st.markdown('# Exploratory Data Analysis') # st.markdown(' ') # selected_options = [] # num_columns = 4 # num_rows = -(-len(options) // num_columns) # Ceiling division to calculate rows # # Create a grid of checkboxes # st.header('Select Features for Correlation Plot') # tick=False # if st.checkbox('Select all'): # tick=True # selected_options = [] # for row in range(num_rows): # cols = st.columns(num_columns) # for col in cols: # if options: # option = options.pop(0) # selected = col.checkbox(option,value=tick) # if selected: # selected_options.append(option) # # Display selected options # #st.write('You selected:', selected_options) # st.pyplot(correlation_plot(df,selected_options,target_column))