BlendMMM committed on
Commit
e4713be
·
verified ·
1 Parent(s): 6132ee9

Upload 1_Data_Validation.py

Browse files
Files changed (1) hide show
  1. pages/1_Data_Validation.py +141 -132
pages/1_Data_Validation.py CHANGED
@@ -6,6 +6,7 @@ from Eda_functions import *
6
  import numpy as np
7
  import re
8
  import pickle
 
9
  from streamlit_pandas_profiling import st_profile_report
10
  import streamlit as st
11
  import streamlit.components.v1 as components
@@ -36,7 +37,7 @@ set_header()
36
  # df = pickle.load(file)
37
  # date=df.index
38
  # df.reset_index(inplace=True)
39
- # df['Date'] = pd.to_datetime(date)
40
 
41
 
42
  #prospects=pd.read_excel('EDA_Data.xlsx',sheet_name='Prospects')
@@ -44,21 +45,30 @@ set_header()
44
  #spends.columns=['Week','Streaming (Spends)','TV (Spends)','Search (Spends)','Digital (Spends)']
45
  #df=pd.concat([df,spends],axis=1)
46
 
47
- #df['Date'] =pd.to_datetime(df['Date']).dt.strftime('%m/%d/%Y')
48
  #df['Prospects']=prospects['Prospects']
49
  #df.drop(['Week'],axis=1,inplace=True)
50
 
 
 
 
 
 
 
 
 
51
 
52
  st.title('Data Validation and Insights')
53
 
54
- with open("Pickle_files/main_df",'rb') as f:
55
- st.session_state['cleaned_data']= pickle.load(f)
56
- with open("Pickle_files/category_dict",'rb') as c:
57
- st.session_state['category_dict']=pickle.load(c)
 
58
 
59
  # st.write(st.session_state['cleaned_data'])
60
 
61
- target_variables=[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Response_Metric']
62
 
63
 
64
  target_column = st.selectbox('Select the Target Feature/Dependent Variable (will be used in all charts as reference)',list(*target_variables))
@@ -79,17 +89,14 @@ st.markdown('### Annual Data Summary')
79
  st.dataframe(summary(st.session_state['cleaned_data'], media_channel+[target_column], spends=None,Target=True), use_container_width=True)
80
 
81
  if st.checkbox('Show raw data'):
82
- st.write(pd.concat([pd.to_datetime(st.session_state['cleaned_data']['Date']).dt.strftime('%m/%d/%Y'),st.session_state['cleaned_data'].select_dtypes(np.number).applymap(format_numbers)],axis=1))
83
  col1 = st.columns(1)
84
 
85
  if "selected_feature" not in st.session_state:
86
  st.session_state['selected_feature']=None
87
 
88
- st.header('1. Media Channels')
89
-
90
- if 'Validation' not in st.session_state:
91
- st.session_state['Validation']=[]
92
 
 
93
  eda_columns=st.columns(2)
94
  with eda_columns[0]:
95
  if st.button('Generate Profile Report'):
@@ -114,127 +121,129 @@ with eda_columns[1]:
114
  report.show_html()
115
 
116
 
117
- selected_media = st.selectbox('Select media', np.unique([Categorised_data[col]['VB'] for col in media_channel]))
118
- # selected_feature=st.multiselect('Select Metric', df.columns[df.columns.str.contains(selected_media,case=False)])
119
- st.session_state["selected_feature"]=st.selectbox('Select Metric',[col for col in media_channel if Categorised_data[col]['VB'] in selected_media ] )
120
- spends_features=[col for col in df.columns if 'spends' in col.lower() or 'cost' in col.lower()]
121
- spends_feature=[col for col in spends_features if col.split('_')[0] in st.session_state["selected_feature"].split('_')[0]]
122
- #st.write(spends_features)
123
- #st.write(spends_feature)
124
- #st.write(selected_feature)
 
 
125
 
126
 
127
- val_variables=[col for col in media_channel if col!='Date']
128
- if len(spends_feature)==0:
129
- st.warning('No spends varaible available for the selected metric in data')
130
 
131
- else:
132
- st.write(f'Selected spends variable {spends_feature[0]} if wrong please name the varaibles properly')
133
- # Create the dual-axis line plot
134
- fig_row1 = line_plot(df, x_col='Date', y1_cols=[st.session_state["selected_feature"]], y2_cols=[target_column], title=f'Analysis of {st.session_state["selected_feature"]} and {[target_column][0]} Over Time')
135
- st.plotly_chart(fig_row1, use_container_width=True)
136
- st.markdown('### Annual Data Summary')
137
- st.dataframe(summary(df,[st.session_state["selected_feature"]],spends=spends_feature[0]),use_container_width=True)
138
- if st.button('Validate'):
139
- st.session_state['Validation'].append(st.session_state["selected_feature"])
140
-
141
- if st.checkbox('Validate all'):
142
- st.session_state['Validation'].extend(val_variables)
143
- st.success('All media variables are validated ✅')
144
- if len(set(st.session_state['Validation']).intersection(val_variables))!=len(val_variables):
145
- #st.write(st.session_state['Validation'])
146
- validation_data=pd.DataFrame({'Variables':val_variables,
147
- 'Validated':[1 if col in st.session_state['Validation'] else 0 for col in val_variables],
148
- 'Bucket':[Categorised_data[col]['VB'] for col in val_variables]})
149
- gd=GridOptionsBuilder.from_dataframe(validation_data)
150
- gd.configure_pagination(enabled=True)
151
- gd.configure_selection(use_checkbox=True,selection_mode='multiple')
152
- #gd.configure_selection_toggle_all(None, show_toggle_all=True)
153
- #gd.configure_columns_auto_size_mode(GridOptionsBuilder.configure_columns)
154
- gridoptions=gd.build()
155
- #st.text(st.session_state['Validation'])
156
- table = AgGrid(validation_data,gridOptions=gridoptions,update_mode=GridUpdateMode.SELECTION_CHANGED,fit_columns_on_grid_load=True)
157
- #st.table(table)
158
- selected_rows = table["selected_rows"]
159
- st.session_state['Validation'].extend([col['Variables'] for col in selected_rows])
160
- not_validated_variables = [col for col in val_variables if col not in st.session_state["Validation"]]
161
- if not_validated_variables:
162
- not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}'
163
- st.warning(not_validated_message)
164
-
165
-
166
-
167
- st.header('2. Non Media Variables')
168
- selected_columns_row = [col for col in df.columns if ("imp" not in col.lower()) and ('cli' not in col.lower() ) and ('spend' not in col.lower()) and col!='Date']
169
- selected_columns_row4 = st.selectbox('Select Channel',selected_columns_row )
170
- if not selected_columns_row4:
171
- st.warning('Please select at least one.')
172
- else:
173
- # Create the dual-axis line plot
174
- fig_row4 = line_plot(df, x_col='Date', y1_cols=[selected_columns_row4], y2_cols=[target_column], title=f'Analysis of {selected_columns_row4} and {target_column} Over Time')
175
- st.plotly_chart(fig_row4, use_container_width=True)
176
- selected_non_media=selected_columns_row4
177
- sum_df = df[['Date', selected_non_media,target_column]]
178
- sum_df['Year']=pd.to_datetime(df['Date']).dt.year
179
- #st.dataframe(df)
180
- #st.dataframe(sum_df.head(2))
181
- sum_df=sum_df.groupby('Year').agg('sum')
182
- sum_df.loc['Grand Total']=sum_df.sum()
183
- sum_df=sum_df.applymap(format_numbers)
184
- sum_df.fillna('-',inplace=True)
185
- sum_df=sum_df.replace({"0.0":'-','nan':'-'})
186
- st.markdown('### Annual Data Summary')
187
- st.dataframe(sum_df,use_container_width=True)
188
-
189
- # if st.checkbox('Validate',key='2'):
190
- # st.session_state['Validation'].append(selected_columns_row4)
191
- # val_variables=[col for col in media_channel if col!='Date']
192
- # if st.checkbox('Validate all'):
193
- # st.session_state['Validation'].extend(val_variables)
194
- # validation_data=pd.DataFrame({'Variables':val_variables,
195
- # 'Validated':[1 if col in st.session_state['Validation'] else 0 for col in val_variables],
196
- # 'Bucket':[Categorised_data[col]['VB'] for col in val_variables]})
197
- # gd=GridOptionsBuilder.from_dataframe(validation_data)
198
- # gd.configure_pagination(enabled=True)
199
- # gd.configure_selection(use_checkbox=True,selection_mode='multiple')
200
- # #gd.configure_selection_toggle_all(None, show_toggle_all=True)
201
- # #gd.configure_columns_auto_size_mode(GridOptionsBuilder.configure_columns)
202
- # gridoptions=gd.build()
203
- # #st.text(st.session_state['Validation'])
204
- # table = AgGrid(validation_data,gridOptions=gridoptions,update_mode=GridUpdateMode.SELECTION_CHANGED,fit_columns_on_grid_load=True)
205
- # #st.table(table)
206
- # selected_rows = table["selected_rows"]
207
- # st.session_state['Validation'].extend([col['Variables'] for col in selected_rows])
208
- # not_validated_variables = [col for col in val_variables if col not in st.session_state["Validation"]]
209
- # if not_validated_variables:
210
- # not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}'
211
- # st.warning(not_validated_message)
212
-
213
- options = list(df.select_dtypes(np.number).columns)
214
- st.markdown(' ')
215
- st.markdown(' ')
216
- st.markdown('# Exploratory Data Analysis')
217
- st.markdown(' ')
218
-
219
- selected_options = []
220
- num_columns = 4
221
- num_rows = -(-len(options) // num_columns) # Ceiling division to calculate rows
222
-
223
- # Create a grid of checkboxes
224
- st.header('Select Features for Correlation Plot')
225
- tick=False
226
- if st.checkbox('Select all'):
227
- tick=True
228
- selected_options = []
229
- for row in range(num_rows):
230
- cols = st.columns(num_columns)
231
- for col in cols:
232
- if options:
233
- option = options.pop(0)
234
- selected = col.checkbox(option,value=tick)
235
- if selected:
236
- selected_options.append(option)
237
- # Display selected options
238
- #st.write('You selected:', selected_options)
239
- st.pyplot(correlation_plot(df,selected_options,target_column))
240
 
 
6
  import numpy as np
7
  import re
8
  import pickle
9
+ from ydata_profiling import ProfileReport
10
  from streamlit_pandas_profiling import st_profile_report
11
  import streamlit as st
12
  import streamlit.components.v1 as components
 
37
  # df = pickle.load(file)
38
  # date=df.index
39
  # df.reset_index(inplace=True)
40
+ # df['date'] = pd.to_datetime(date)
41
 
42
 
43
  #prospects=pd.read_excel('EDA_Data.xlsx',sheet_name='Prospects')
 
45
  #spends.columns=['Week','Streaming (Spends)','TV (Spends)','Search (Spends)','Digital (Spends)']
46
  #df=pd.concat([df,spends],axis=1)
47
 
48
+ #df['date'] =pd.to_datetime(df['date']).dt.strftime('%m/%d/%Y')
49
  #df['Prospects']=prospects['Prospects']
50
  #df.drop(['Week'],axis=1,inplace=True)
51
 
52
+ # Deserialize and load the objects from the pickle file
53
+ # The loaded object is indexed by 'final_df' and 'bin_dict' below
54
+ with open('data_import.pkl', 'rb') as f:
55
+ data = pickle.load(f)
56
+
57
+ # Accessing the loaded objects
58
+ st.session_state['cleaned_data']= data['final_df']
59
+ st.session_state['category_dict'] = data['bin_dict']
60
 
61
  st.title('Data Validation and Insights')
62
 
63
+
64
+ # with open("Pickle_files/main_df",'rb') as f:
65
+ # st.session_state['cleaned_data']= pickle.load(f)
66
+ # with open("Pickle_files/category_dict",'rb') as c:
67
+ # st.session_state['category_dict']=pickle.load(c)
68
 
69
  # st.write(st.session_state['cleaned_data'])
70
 
71
+ target_variables=[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Response Metrics']
72
 
73
 
74
  target_column = st.selectbox('Select the Target Feature/Dependent Variable (will be used in all charts as reference)',list(*target_variables))
 
89
  st.dataframe(summary(st.session_state['cleaned_data'], media_channel+[target_column], spends=None,Target=True), use_container_width=True)
90
 
91
  if st.checkbox('Show raw data'):
92
+ st.write(pd.concat([pd.to_datetime(st.session_state['cleaned_data']['date']).dt.strftime('%m/%d/%Y'),st.session_state['cleaned_data'].select_dtypes(np.number).applymap(format_numbers)],axis=1))
93
  col1 = st.columns(1)
94
 
95
  if "selected_feature" not in st.session_state:
96
  st.session_state['selected_feature']=None
97
 
 
 
 
 
98
 
99
+ st.header('Univariate and Bivariate Analysis')
100
  eda_columns=st.columns(2)
101
  with eda_columns[0]:
102
  if st.button('Generate Profile Report'):
 
121
  report.show_html()
122
 
123
 
124
+ st.warning('Work in Progress')
125
+
126
+ # selected_media = st.selectbox('Select media', np.unique([Categorised_data[col]['VB'] for col in media_channel]))
127
+ # # selected_feature=st.multiselect('Select Metric', df.columns[df.columns.str.contains(selected_media,case=False)])
128
+ # st.session_state["selected_feature"]=st.selectbox('Select Metric',[col for col in media_channel if Categorised_data[col]['VB'] in selected_media ] )
129
+ # spends_features=[col for col in df.columns if 'spends' in col.lower() or 'cost' in col.lower()]
130
+ # spends_feature=[col for col in spends_features if col.split('_')[0] in st.session_state["selected_feature"].split('_')[0]]
131
+ # #st.write(spends_features)
132
+ # #st.write(spends_feature)
133
+ # #st.write(selected_feature)
134
 
135
 
136
+ # val_variables=[col for col in media_channel if col!='date']
137
+ # if len(spends_feature)==0:
138
+ # st.warning('No spends variable available for the selected metric in data')
139
 
140
+ # else:
141
+ # st.write(f'Selected spends variable {spends_feature[0]} if wrong please name the variables properly')
142
+ # # Create the dual-axis line plot
143
+ # fig_row1 = line_plot(df, x_col='date', y1_cols=[st.session_state["selected_feature"]], y2_cols=[target_column], title=f'Analysis of {st.session_state["selected_feature"]} and {[target_column][0]} Over Time')
144
+ # st.plotly_chart(fig_row1, use_container_width=True)
145
+ # st.markdown('### Annual Data Summary')
146
+ # st.dataframe(summary(df,[st.session_state["selected_feature"]],spends=spends_feature[0]),use_container_width=True)
147
+ # if st.button('Validate'):
148
+ # st.session_state['Validation'].append(st.session_state["selected_feature"])
149
+
150
+ # if st.checkbox('Validate all'):
151
+ # st.session_state['Validation'].extend(val_variables)
152
+ # st.success('All media variables are validated ✅')
153
+ # if len(set(st.session_state['Validation']).intersection(val_variables))!=len(val_variables):
154
+ # #st.write(st.session_state['Validation'])
155
+ # validation_data=pd.DataFrame({'Variables':val_variables,
156
+ # 'Validated':[1 if col in st.session_state['Validation'] else 0 for col in val_variables],
157
+ # 'Bucket':[Categorised_data[col]['VB'] for col in val_variables]})
158
+ # gd=GridOptionsBuilder.from_dataframe(validation_data)
159
+ # gd.configure_pagination(enabled=True)
160
+ # gd.configure_selection(use_checkbox=True,selection_mode='multiple')
161
+ # #gd.configure_selection_toggle_all(None, show_toggle_all=True)
162
+ # #gd.configure_columns_auto_size_mode(GridOptionsBuilder.configure_columns)
163
+ # gridoptions=gd.build()
164
+ # #st.text(st.session_state['Validation'])
165
+ # table = AgGrid(validation_data,gridOptions=gridoptions,update_mode=GridUpdateMode.SELECTION_CHANGED,fit_columns_on_grid_load=True)
166
+ # #st.table(table)
167
+ # selected_rows = table["selected_rows"]
168
+ # st.session_state['Validation'].extend([col['Variables'] for col in selected_rows])
169
+ # not_validated_variables = [col for col in val_variables if col not in st.session_state["Validation"]]
170
+ # if not_validated_variables:
171
+ # not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}'
172
+ # st.warning(not_validated_message)
173
+
174
+
175
+
176
+ # st.header('2. Non Media Variables')
177
+ # selected_columns_row = [col for col in df.columns if ("imp" not in col.lower()) and ('cli' not in col.lower() ) and ('spend' not in col.lower()) and col!='date']
178
+ # selected_columns_row4 = st.selectbox('Select Channel',selected_columns_row )
179
+ # if not selected_columns_row4:
180
+ # st.warning('Please select at least one.')
181
+ # else:
182
+ # # Create the dual-axis line plot
183
+ # fig_row4 = line_plot(df, x_col='date', y1_cols=[selected_columns_row4], y2_cols=[target_column], title=f'Analysis of {selected_columns_row4} and {target_column} Over Time')
184
+ # st.plotly_chart(fig_row4, use_container_width=True)
185
+ # selected_non_media=selected_columns_row4
186
+ # sum_df = df[['date', selected_non_media,target_column]]
187
+ # sum_df['Year']=pd.to_datetime(df['date']).dt.year
188
+ # #st.dataframe(df)
189
+ # #st.dataframe(sum_df.head(2))
190
+ # sum_df=sum_df.groupby('Year').agg('sum')
191
+ # sum_df.loc['Grand Total']=sum_df.sum()
192
+ # sum_df=sum_df.applymap(format_numbers)
193
+ # sum_df.fillna('-',inplace=True)
194
+ # sum_df=sum_df.replace({"0.0":'-','nan':'-'})
195
+ # st.markdown('### Annual Data Summary')
196
+ # st.dataframe(sum_df,use_container_width=True)
197
+
198
+ # # if st.checkbox('Validate',key='2'):
199
+ # # st.session_state['Validation'].append(selected_columns_row4)
200
+ # # val_variables=[col for col in media_channel if col!='date']
201
+ # # if st.checkbox('Validate all'):
202
+ # # st.session_state['Validation'].extend(val_variables)
203
+ # # validation_data=pd.DataFrame({'Variables':val_variables,
204
+ # # 'Validated':[1 if col in st.session_state['Validation'] else 0 for col in val_variables],
205
+ # # 'Bucket':[Categorised_data[col]['VB'] for col in val_variables]})
206
+ # # gd=GridOptionsBuilder.from_dataframe(validation_data)
207
+ # # gd.configure_pagination(enabled=True)
208
+ # # gd.configure_selection(use_checkbox=True,selection_mode='multiple')
209
+ # # #gd.configure_selection_toggle_all(None, show_toggle_all=True)
210
+ # # #gd.configure_columns_auto_size_mode(GridOptionsBuilder.configure_columns)
211
+ # # gridoptions=gd.build()
212
+ # # #st.text(st.session_state['Validation'])
213
+ # # table = AgGrid(validation_data,gridOptions=gridoptions,update_mode=GridUpdateMode.SELECTION_CHANGED,fit_columns_on_grid_load=True)
214
+ # # #st.table(table)
215
+ # # selected_rows = table["selected_rows"]
216
+ # # st.session_state['Validation'].extend([col['Variables'] for col in selected_rows])
217
+ # # not_validated_variables = [col for col in val_variables if col not in st.session_state["Validation"]]
218
+ # # if not_validated_variables:
219
+ # # not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}'
220
+ # # st.warning(not_validated_message)
221
+
222
+ # options = list(df.select_dtypes(np.number).columns)
223
+ # st.markdown(' ')
224
+ # st.markdown(' ')
225
+ # st.markdown('# Exploratory Data Analysis')
226
+ # st.markdown(' ')
227
+
228
+ # selected_options = []
229
+ # num_columns = 4
230
+ # num_rows = -(-len(options) // num_columns) # Ceiling division to calculate rows
231
+
232
+ # # Create a grid of checkboxes
233
+ # st.header('Select Features for Correlation Plot')
234
+ # tick=False
235
+ # if st.checkbox('Select all'):
236
+ # tick=True
237
+ # selected_options = []
238
+ # for row in range(num_rows):
239
+ # cols = st.columns(num_columns)
240
+ # for col in cols:
241
+ # if options:
242
+ # option = options.pop(0)
243
+ # selected = col.checkbox(option,value=tick)
244
+ # if selected:
245
+ # selected_options.append(option)
246
+ # # Display selected options
247
+ # #st.write('You selected:', selected_options)
248
+ # st.pyplot(correlation_plot(df,selected_options,target_column))
249