BlendMMM committed · verified
Commit 6641078 · 1 Parent(s): 56ff7b3

Upload Data_Import.py

Files changed (1): Data_Import.py +825 -318
Data_Import.py CHANGED
@@ -1,6 +1,5 @@
 # Importing necessary libraries
 import streamlit as st
-import pickle
 
 st.set_page_config(
     page_title="Model Build",
@@ -9,376 +8,884 @@ st.set_page_config(
     initial_sidebar_state="collapsed",
 )
 
-from utilities import load_authenticator
 import numpy as np
 import pandas as pd
-from utilities import set_header, load_local_css
 
 load_local_css("styles.css")
 set_header()
 
-
-for k, v in st.session_state.items():
-    if k not in ['logout', 'login', 'config'] and not k.startswith('FormSubmitter'):
-        st.session_state[k] = v
-
-authenticator = st.session_state.get('authenticator')
 if authenticator is None:
     authenticator = load_authenticator()
 
-name, authentication_status, username = authenticator.login('Login', 'main')
-auth_status = st.session_state.get('authentication_status')
-
-if auth_status == True:
-    is_state_initiaized = st.session_state.get('initialized', False)
-    if not is_state_initiaized:
-        a = 1
-
-
-    # Function to expand dataframe to daily
-    @st.cache_resource(show_spinner=False)
-    def expand_to_daily(df, granularity, start_date, end_date):
-        # Create a new DataFrame with a row for each day
-        all_dates = pd.date_range(start=start_date, end=end_date, freq="D")
-        daily_df = pd.DataFrame(all_dates, columns=["Date"])
-
-        if granularity == "daily":
-            # For daily data, simply merge to fill missing dates
-            daily_df = daily_df.merge(df, on="Date", how="left")
-        else:
-            # For weekly or monthly, distribute values to daily rows
-            for column in df.columns:
-                if column != "Date":  # Skip 'Date' column
-                    daily_df[column] = np.nan  # Initialize with NaNs
-
-            # Group by the required frequency and distribute values
-            freq = "W-MON" if granularity == "weekly" else "MS"
-            for _, group in df.groupby(pd.Grouper(key="Date", freq=freq)):
-                num_days = len(
-                    pd.date_range(group["Date"].min(), group["Date"].max(), freq="D")
                 )
-                for column in group.columns:
-                    if column == "Date":  # Skip 'Date' column
-                        continue
-                    value = group[column].sum() / num_days
-                    date_range = pd.date_range(
-                        group["Date"].min(), periods=num_days, freq="D"
-                    )
-                    daily_df.loc[daily_df["Date"].isin(date_range), column] = value
-
-        return daily_df
-
-
-    # Function to validate date column in dataframe
-    def validate_date_column(df):
-        try:
-            # Attempt to convert the 'Date' column to datetime
-            df["Date"] = pd.to_datetime(df["Date"], format="%d-%m-%Y")
-            return True
-        except:
-            return False
-
-
-    # Function to determine data interval
-    def determine_data_interval(common_freq):
-        if common_freq == 1:
-            return "daily"
-        elif common_freq == 7:
-            return "weekly"
-        elif 28 <= common_freq <= 31:
-            return "monthly"
-        else:
-            return "irregular"
-
-
-    # Function to convert and fill dates in dataframe
-    def convert_and_fill_dates(df, start_date, end_date, interval):
-        # Create a date range for the desired period
-        all_dates = pd.date_range(start=start_date, end=end_date, freq="D")
-        new_df = pd.DataFrame(all_dates, columns=["Date"])
-
-        # Preprocess and aggregate data based on the original interval
-        if interval != "daily":
-            # Resample to start of each week/month, then sum values for the same period
-            if interval == "weekly":
-                df = df.resample("W-MON", on="Date").sum().reset_index()
-            elif interval == "monthly":
-                df = df.resample("MS", on="Date").sum().reset_index()
-
-            # Distribute values equally across the days in each week/month
-            expanded_rows = []
-            for _, row in df.iterrows():
-                if interval == "weekly":
-                    period_dates = pd.date_range(row["Date"], periods=7)
-                elif interval == "monthly":
-                    period_end = row["Date"] + pd.offsets.MonthEnd(1)
-                    period_dates = pd.date_range(row["Date"], period_end)
-
-                for date in period_dates:
-                    new_row = row.copy()
-                    new_row["Date"] = date
-                    for col in df.columns:
-                        if col != "Date":  # Skip 'Date' column
-                            new_row[col] = row[col] / len(period_dates)
-                    expanded_rows.append(new_row)
-
-            # Create a DataFrame from expanded rows
-            expanded_df = pd.DataFrame(expanded_rows)
-            new_df = pd.merge(new_df, expanded_df, how="left", on="Date")
-        else:
-            # Daily data, aggregate if there are multiple entries for the same day
-            df = df.groupby("Date").sum().reset_index()
-            new_df = pd.merge(new_df, df, how="left", on="Date")
-
-        # Ensure all dates from start to end are present, filling missing values with NaN
-        new_df["Date"] = pd.to_datetime(new_df["Date"])  # Ensure 'Date' is datetime type
-        new_df = new_df.set_index("Date").reindex(all_dates).reset_index()
-        new_df.rename(columns={"index": "Date"}, inplace=True)
-
-        return new_df
-
-
-    # Function to convert a DataFrame from daily level granularity to either weekly or monthly level
-    def convert_to_higher_granularity(df, required_granularity):
-        if required_granularity == "daily":
-            return df
-
-        # Ensure 'Date' is the index and is in datetime format
-        if not pd.api.types.is_datetime64_any_dtype(df["Date"]):
-            df["Date"] = pd.to_datetime(df["Date"])
-        df.set_index("Date", inplace=True)
-
-        # Resample and aggregate
-        if required_granularity == "weekly":
-            # Resample to weekly, using 'W-MON' to indicate weeks starting on Monday
-            df = df.resample("W-MON").sum()
-        elif required_granularity == "monthly":
-            # Resample to monthly, using 'MS' to indicate month start
-            df = df.resample("MS").sum()
-
-        # Reset index to move 'Date' back to a column
-        df.reset_index(inplace=True)
-
-        return df
-
-
-    # # Read the CSV file, parsing 'Date' column as datetime
-    main_df = pd.read_csv("Media_data_for_model_dma_level.csv", dayfirst=True, parse_dates=["Date"])
-    # st.write(main_df)
-
-    # Get the start date (minimum) and end date (maximum) from the 'Date' column
-    api_start_date = main_df["Date"].min()
-    api_end_date = main_df["Date"].max()
-
-    # Infer the granularity from the most common difference between consecutive dates
-    date_diffs = main_df["Date"].diff().dt.days.dropna()
-    common_diff = date_diffs.mode()[0]
-    api_granularity = determine_data_interval(common_diff)
-
-    # Convert the DataFrame to daily level granularity
-    main_df = expand_to_daily(main_df, api_granularity, api_start_date, api_end_date)
-
-    # Page Title
-    st.title("Data Import")
-
-    # File uploader
-    uploaded_files = st.file_uploader(
-        "Upload additional data", type=["xlsx"], accept_multiple_files=True
-    )
-
-    # Custom HTML for upload instructions
-    recommendation_html = f"""
-    <div style="text-align: justify;">
-        <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets including media, internal, and exogenous data adhere to the following guidelines: Each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code>, be free of missing values, and aggregated to a {api_granularity} level.
-    </div>
-    """
-
-    st.markdown(recommendation_html, unsafe_allow_html=True)
-
-    # Initialize a list to collect all processed DataFrames
-    all_data_dfs = []
-
-    if uploaded_files:
-        for uploaded_file in uploaded_files:
-            # Extract the file name
-            file_name = uploaded_file.name
-
-            # Load the file into a DataFrame
-            data_df = pd.read_excel(
-                uploaded_file,
             )
-
-            # Identify numeric columns in the DataFrame
-            numeric_columns = data_df.select_dtypes(include="number").columns.tolist()
-
-            # Validate the 'Date' column and ensure there's at least one numeric column
-            if validate_date_column(data_df) and len(numeric_columns) > 0:
-                data_df = data_df[["Date"] + numeric_columns]
-
-                # Ensure the 'Date' column is in datetime format and sorted
-                data_df["Date"] = pd.to_datetime(data_df["Date"], dayfirst=True)
-                data_df.sort_values("Date", inplace=True)
-
-                # Calculate the most common day difference between dates to determine frequency
-                common_freq = data_df["Date"].diff().dt.days.dropna().mode()[0]
-
-                # Calculate the data interval (daily, weekly, monthly or irregular)
-                interval = determine_data_interval(common_freq)
-
-                if interval == "irregular":
-                    # Warn the user if the 'Date' column doesn't meet the format requirements
-                    st.warning(
-                        f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval."
-                    )
-                    continue
-
-                # Convert data to specified interval and redistribute to daily
-                data_df = convert_and_fill_dates(
-                    data_df, api_start_date, api_end_date, interval
-                )
-
-                # Add the processed DataFrame to the list
-                all_data_dfs.append(data_df)
-
             else:
-                # Warn the user if the 'Date' column doesn't meet the format requirements
-                st.warning(
-                    f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column."
-                )
-
-    # Sequentially merge each of the other DataFrames with the main DataFrame on 'Date'
-    for df in all_data_dfs:
-        main_df = pd.merge(main_df, df, on="Date", how="left")
-
-
-    # Function to calculate missing stats and prepare for editable DataFrame
-    def prepare_missing_stats_df(df):
-        missing_stats = []
-        for column in df.columns:
-            if (
-                column == "Date" or column == "Total Approved Accounts - Revenue"
-            ):  # Skip Date and Revenue column
-                continue
-
-            missing = df[column].isnull().sum()
-            pct_missing = round((missing / len(df)) * 100, 2)
-            missing_stats.append(
-                {
-                    "Column": column,
-                    "Missing Values": missing,
-                    "Missing Percentage": pct_missing,
-                    "Impute Method": "Fill with 0",  # Default value
-                    "Category": "Media",  # Default value
-                }
             )
-        stats_df = pd.DataFrame(missing_stats)
-        return stats_df
-
-
-    # Prepare missing stats DataFrame for editing
-    missing_stats_df = prepare_missing_stats_df(main_df)
-
-    # Create an editable DataFrame in Streamlit
-    st.markdown("#### Select Variables Category & Impute Missing Values")
-
-    edited_stats_df = st.data_editor(
-        missing_stats_df,
-        column_config={
-            "Impute Method": st.column_config.SelectboxColumn(
-                options=[
-                    "Drop Column",
-                    "Fill with Mean",
-                    "Fill with Median",
-                    "Fill with 0",
-                ],
-                required=True,
-                default="Fill with 0",
-            ),
-            "Category": st.column_config.SelectboxColumn(
-                options=[
-                    "Date",
-                    "Media",
-                    "Exogenous",
-                    "Internal",
-                    "DMA/Panel",
-                    "Response_Metric"
-                ],
-                required=True,
-                default="Media",
-            ),
-        },
-        disabled=["Column", "Missing Values", "Missing Percentage"],
-        hide_index=True,
-        use_container_width=True,
     )
 
-    # Apply changes based on edited DataFrame
-    for i, row in edited_stats_df.iterrows():
-        column = row["Column"]
-        if row["Impute Method"] == "Drop Column":
-            main_df.drop(columns=[column], inplace=True)
-
-        elif row["Impute Method"] == "Fill with Mean":
-            main_df[column].fillna(main_df[column].mean(), inplace=True)
-
-        elif row["Impute Method"] == "Fill with Median":
-            main_df[column].fillna(main_df[column].median(), inplace=True)
-
-        elif row["Impute Method"] == "Fill with 0":
-            main_df[column].fillna(0, inplace=True)
-
-    # Convert the Final DataFrame to required granularity
-    main_df = convert_to_higher_granularity(main_df, api_granularity)
-
-    # Display the Final DataFrame and exogenous variables
-    st.markdown("#### Final DataFrame:")
-    st.dataframe(main_df)
-
-    # Initialize an empty dictionary to hold categories and their variables
-    category_dict = {}
-
-    # Iterate over each row in the edited DataFrame to populate the dictionary
-    for i, row in edited_stats_df.iterrows():
-        column = row["Column"]
-        category = row["Category"]  # The category chosen by the user for this variable
-
-        # Check if the category already exists in the dictionary
-        if category not in category_dict:
-            # If not, initialize it with the current column as its first element
-            category_dict[category] = [column]
-        else:
-            # If it exists, append the current column to the list of variables under this category
-            category_dict[category].append(column)
-
-    # Display the dictionary
-    st.markdown("#### Variable Category:")
-    for category, variables in category_dict.items():
-        # Check if there are multiple variables to handle "and" insertion correctly
-        if len(variables) > 1:
-            # Join all but the last variable with ", ", then add " and " before the last variable
-            variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
-        else:
-            # If there's only one variable, no need for "and"
-            variables_str = variables[0]
-
-        # Display the category and its variables in the desired format
-        st.markdown(f"**{category}:** {variables_str}\n\n", unsafe_allow_html=True)
-
-    # storing maindf and categories in session_state
-    # st.write(main_df)
-
-    # st.session_state['Cleaned_data']=main_df
-    # st.session_state['category_dict']=category_dict
-    if st.button('Save Changes'):
-
-        with open("Pickle_files/main_df", 'wb') as f:
-            pickle.dump(main_df, f)
-        with open("Pickle_files/category_dict", 'wb') as c:
-            pickle.dump(category_dict, c)
-        st.success('Changes Saved!')
 
 # Importing necessary libraries
 import streamlit as st
 
 st.set_page_config(
     page_title="Model Build",
     initial_sidebar_state="collapsed",
 )
 
 import numpy as np
 import pandas as pd
+from utilities import set_header, load_local_css, load_authenticator
+import pickle
+
 
 load_local_css("styles.css")
 set_header()
 
+authenticator = st.session_state.get("authenticator")
 if authenticator is None:
     authenticator = load_authenticator()
 
+name, authentication_status, username = authenticator.login("Login", "main")
+auth_status = st.session_state.get("authentication_status")
+
+# Check for authentication status
+if auth_status != True:
+    st.stop()
+
+
+# Function to validate date column in dataframe
+def validate_date_column(df):
+    try:
+        # Attempt to convert the 'Date' column to datetime
+        df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
+        return True
+    except:
+        return False
+
+
+# Function to determine data interval
+def determine_data_interval(common_freq):
+    if common_freq == 1:
+        return "daily"
+    elif common_freq == 7:
+        return "weekly"
+    elif 28 <= common_freq <= 31:
+        return "monthly"
+    else:
+        return "irregular"
+
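
For intuition, a minimal sketch of how the modal day gap feeds determine_data_interval; the dates here are synthetic and purely illustrative:

import pandas as pd

# Weekly data: consecutive dates sit 7 days apart
dates = pd.Series(pd.date_range("2024-01-01", periods=10, freq="7D"))
common_freq = dates.diff().dt.days.dropna().mode()[0]  # 7.0
# determine_data_interval(common_freq) -> "weekly"
# (1 -> "daily", 28-31 -> "monthly", anything else -> "irregular")
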
+# Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
+@st.cache_resource(show_spinner=False)
+def files_to_dataframes(uploaded_files):
+    df_dict = {}
+    for uploaded_file in uploaded_files:
+        # Extract file name without extension
+        file_name = uploaded_file.name.rsplit(".", 1)[0]
+
+        # Check for duplicate file names
+        if file_name in df_dict:
+            st.warning(
+                f"Duplicate File: {file_name}. This file will be skipped.",
+                icon="⚠️",
+            )
+            continue
+
+        # Read the file into a DataFrame
+        df = pd.read_excel(uploaded_file)
+
+        # Convert all column names to lowercase
+        df.columns = df.columns.str.lower().str.strip()
+
+        # Separate numeric and non-numeric columns
+        numeric_cols = list(df.select_dtypes(include=["number"]).columns)
+        non_numeric_cols = [
+            col
+            for col in df.select_dtypes(exclude=["number"]).columns
+            if col.lower() != "date"
+        ]
+
+        # Check for 'Date' column
+        if not (validate_date_column(df) and len(numeric_cols) > 0):
+            st.warning(
+                f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
+                icon="⚠️",
+            )
+            continue
+
+        # Check for interval
+        common_freq = (
+            pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
+        )
+        # Calculate the data interval (daily, weekly, monthly or irregular)
+        interval = determine_data_interval(common_freq)
+        if interval == "irregular":
+            st.warning(
+                f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
+                icon="⚠️",
+            )
+            continue
+
+        # Store both DataFrames in the dictionary under their respective keys
+        df_dict[file_name] = {
+            "numeric": numeric_cols,
+            "non_numeric": non_numeric_cols,
+            "interval": interval,
+            "df": df,
+        }
+
+    return df_dict
+
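
A sketch of the per-file entry this builds, with hypothetical column names standing in for a real upload:

import pandas as pd

# Mirrors what files_to_dataframes stores under each file-name key
example_entry = {
    "numeric": ["spend"],    # numeric (media/exogenous) columns
    "non_numeric": ["dma"],  # candidate DMA/Panel columns
    "interval": "weekly",    # inferred from the modal date gap
    "df": pd.DataFrame(
        {
            "date": pd.date_range("2024-01-01", periods=3, freq="7D"),
            "spend": [100.0, 80.0, 95.0],
            "dma": ["ny", "ny", "la"],
        }
    ),
}
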
+
+# Function to adjust dataframe granularity
+def adjust_dataframe_granularity(df, current_granularity, target_granularity):
+    # Set index
+    df.set_index("date", inplace=True)
+
+    # Define aggregation rules for resampling
+    aggregation_rules = {
+        col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
+        for col in df.columns
+    }
+
+    # Initialize resampled_df
+    resampled_df = df
+    if current_granularity == "daily" and target_granularity == "weekly":
+        resampled_df = df.resample("W-MON", closed="left", label="left").agg(
+            aggregation_rules
+        )
+
+    elif current_granularity == "daily" and target_granularity == "monthly":
+        resampled_df = df.resample("MS", closed="left", label="left").agg(
+            aggregation_rules
+        )
+
+    elif current_granularity == "daily" and target_granularity == "daily":
+        resampled_df = df.resample("D").agg(aggregation_rules)
+
+    elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
+        # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
+        expanded_data = []
+        for _, row in df.iterrows():
+            if current_granularity == "weekly":
+                period_range = pd.date_range(start=row.name, periods=7)
+            elif current_granularity == "monthly":
+                period_range = pd.date_range(
+                    start=row.name, periods=row.name.days_in_month
                 )
+
+            for date in period_range:
+                new_row = {}
+                for col in df.columns:
+                    if pd.api.types.is_numeric_dtype(df[col]):
+                        if current_granularity == "weekly":
+                            new_row[col] = row[col] / 7
+                        elif current_granularity == "monthly":
+                            new_row[col] = row[col] / row.name.days_in_month
+                    else:
+                        new_row[col] = row[col]
+                expanded_data.append((date, new_row))
+
+        resampled_df = pd.DataFrame(
+            [data for _, data in expanded_data],
+            index=[date for date, _ in expanded_data],
+        )
+
+    # Reset index
+    resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})
+
+    return resampled_df
+
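
A quick sketch of the weekly-to-daily branch, assuming the function above is in scope and pandas is imported (note it mutates its input via set_index, hence the copy):

weekly = pd.DataFrame({"date": [pd.Timestamp("2024-01-01")], "spend": [700.0]})
daily = adjust_dataframe_granularity(weekly.copy(), "weekly", "daily")
# daily now has 7 rows, 2024-01-01 through 2024-01-07, each with spend 100.0
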
+# Function to clean and extract unique values of DMA and Panel
+@st.cache_resource(show_spinner=False)
+def clean_and_extract_unique_values(files_dict, selections):
+    all_dma_values = set()
+    all_panel_values = set()
+
+    for file_name, file_data in files_dict.items():
+        df = file_data["df"]
+
+        # 'DMA' and 'Panel' selections
+        selected_dma = selections[file_name].get("DMA")
+        selected_panel = selections[file_name].get("Panel")
+
+        # Clean and standardize DMA column if it exists and is selected
+        if selected_dma and selected_dma != "N/A" and selected_dma in df.columns:
+            df[selected_dma] = (
+                df[selected_dma].str.lower().str.strip().str.replace("_", " ")
+            )
+            all_dma_values.update(df[selected_dma].dropna().unique())
+
+        # Clean and standardize Panel column if it exists and is selected
+        if selected_panel and selected_panel != "N/A" and selected_panel in df.columns:
+            df[selected_panel] = (
+                df[selected_panel].str.lower().str.strip().str.replace("_", " ")
+            )
+            all_panel_values.update(df[selected_panel].dropna().unique())
+
+        # Update the processed DataFrame back in the dictionary
+        files_dict[file_name]["df"] = df
+
+    return all_dma_values, all_panel_values
+
+# Function to format values for display
+@st.cache_resource(show_spinner=False)
+def format_values_for_display(values_list):
+    # Capitalize the first letter of each word and replace underscores with spaces
+    formatted_list = [value.replace("_", " ").title() for value in values_list]
+    # Join values with commas and 'and' before the last value
+    if len(formatted_list) > 1:
+        return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
+    elif formatted_list:
+        return formatted_list[0]
+    return "No values available"
+
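
For example, assuming the function above is in scope:

print(format_values_for_display(["new_york", "los angeles", "chicago"]))
# -> New York, Los Angeles, and Chicago
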
+# Function to normalize all data within files_dict to a daily granularity
+@st.cache(show_spinner=False, allow_output_mutation=True)
+def standardize_data_to_daily(files_dict, selections):
+    # Normalize all data to a daily granularity using a provided function
+    files_dict = apply_granularity_to_all(files_dict, "daily", selections)
+
+    # Update the "interval" attribute for each dataset to indicate the new granularity
+    for files_name, files_data in files_dict.items():
+        files_data["interval"] = "daily"
+
+    return files_dict
+
+
+# Function to apply granularity transformation to all DataFrames in files_dict
+@st.cache_resource(show_spinner=False)
+def apply_granularity_to_all(files_dict, granularity_selection, selections):
+    for file_name, file_data in files_dict.items():
+        df = file_data["df"].copy()
+
+        # Handling when DMA or Panel might be 'N/A'
+        selected_dma = selections[file_name].get("DMA")
+        selected_panel = selections[file_name].get("Panel")
+
+        # Correcting the segment selection logic & handling 'N/A'
+        if selected_dma != "N/A" and selected_panel != "N/A":
+            unique_combinations = df[[selected_dma, selected_panel]].drop_duplicates()
+        elif selected_dma != "N/A":
+            unique_combinations = df[[selected_dma]].drop_duplicates()
+            selected_panel = None  # Ensure Panel is ignored if N/A
+        elif selected_panel != "N/A":
+            unique_combinations = df[[selected_panel]].drop_duplicates()
+            selected_dma = None  # Ensure DMA is ignored if N/A
+        else:
+            # If both are 'N/A', process the entire dataframe as is
+            df = adjust_dataframe_granularity(
+                df, file_data["interval"], granularity_selection
             )
+            files_dict[file_name]["df"] = df
+            continue  # Skip to the next file
+
+        transformed_segments = []
+        for _, combo in unique_combinations.iterrows():
+            if selected_dma and selected_panel:
+                segment = df[
+                    (df[selected_dma] == combo[selected_dma])
+                    & (df[selected_panel] == combo[selected_panel])
+                ]
+            elif selected_dma:
+                segment = df[df[selected_dma] == combo[selected_dma]]
+            elif selected_panel:
+                segment = df[df[selected_panel] == combo[selected_panel]]
+
+            # Adjust granularity of the segment
+            transformed_segment = adjust_dataframe_granularity(
+                segment, file_data["interval"], granularity_selection
+            )
+            transformed_segments.append(transformed_segment)
+
+        # Combine all transformed segments into a single DataFrame for this file
+        transformed_df = pd.concat(transformed_segments, ignore_index=True)
+        files_dict[file_name]["df"] = transformed_df
+
+    return files_dict
+
+
+# Function to create main dataframe structure
+@st.cache_resource(show_spinner=False)
+def create_main_dataframe(
+    files_dict, all_dma_values, all_panel_values, granularity_selection
+):
+    # Determine the global start and end dates across all DataFrames
+    global_start = min(df["df"]["date"].min() for df in files_dict.values())
+    global_end = max(df["df"]["date"].max() for df in files_dict.values())
+
+    # Adjust the date_range generation based on the granularity_selection
+    if granularity_selection == "weekly":
+        # Generate a weekly range, with weeks starting on Monday
+        date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
+    elif granularity_selection == "monthly":
+        # Generate a monthly range, starting from the first day of each month
+        date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
+    else:  # Default to daily if not weekly or monthly
+        date_range = pd.date_range(start=global_start, end=global_end, freq="D")
+
+    # Collect all unique DMA and Panel values, excluding 'N/A'
+    all_dmas = all_dma_values
+    all_panels = all_panel_values
+
+    # Dynamically build the list of dimensions (Panel, DMA) to include in the main DataFrame based on availability
+    dimensions, merge_keys = [], []
+    if all_panels:
+        dimensions.append(all_panels)
+        merge_keys.append("Panel")
+    if all_dmas:
+        dimensions.append(all_dmas)
+        merge_keys.append("DMA")
+
+    dimensions.append(date_range)  # Date range is always included
+    merge_keys.append("date")  # Date range is always included
+
+    # Create a main DataFrame template with the dimensions
+    main_df = pd.MultiIndex.from_product(
+        dimensions,
+        names=[name for name, _ in zip(merge_keys, dimensions)],
+    ).to_frame(index=False)
+
+    return main_df.reset_index(drop=True)
+
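
A minimal sketch of the scaffold this produces; the panel and DMA values below are made up:

import pandas as pd

dimensions = [
    {"panel a"},                                       # all_panel_values
    {"ny", "la"},                                      # all_dma_values
    pd.date_range("2024-01-01", periods=2, freq="D"),  # date_range
]
grid = pd.MultiIndex.from_product(
    dimensions, names=["Panel", "DMA", "date"]
).to_frame(index=False)
# 1 panel x 2 DMAs x 2 dates -> 4 rows, one per combination,
# onto which each file is later left-joined
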
+# Function to prepare and merge DataFrames
+@st.cache_resource(show_spinner=False)
+def merge_into_main_df(main_df, files_dict, selections):
+    for file_name, file_data in files_dict.items():
+        df = file_data["df"].copy()
+
+        # Rename selected DMA and Panel columns if not 'N/A'
+        selected_dma = selections[file_name].get("DMA", "N/A")
+        selected_panel = selections[file_name].get("Panel", "N/A")
+        if selected_dma != "N/A":
+            df.rename(columns={selected_dma: "DMA"}, inplace=True)
+        if selected_panel != "N/A":
+            df.rename(columns={selected_panel: "Panel"}, inplace=True)
+
+        # Merge current DataFrame into main_df based on 'date', and where applicable, 'Panel' and 'DMA'
+        merge_keys = ["date"]
+        if "Panel" in df.columns:
+            merge_keys.append("Panel")
+        if "DMA" in df.columns:
+            merge_keys.append("DMA")
+        main_df = pd.merge(main_df, df, on=merge_keys, how="left")
+
+    # After all merges, sort by 'date' and reset index for cleanliness
+    sort_by = ["date"]
+    if "Panel" in main_df.columns:
+        sort_by.append("Panel")
+    if "DMA" in main_df.columns:
+        sort_by.append("DMA")
+    main_df.sort_values(by=sort_by, inplace=True)
+    main_df.reset_index(drop=True, inplace=True)
+
+    return main_df
+
+
+# Function to categorize column
+def categorize_column(column_name):
+    # Define keywords for each category
+    internal_keywords = [
+        "Price",
+        "Discount",
+        "product_price",
+        "cost",
+        "margin",
+        "inventory",
+        "sales",
+        "revenue",
+        "turnover",
+        "expense",
+    ]
+    exogenous_keywords = [
+        "GDP",
+        "Tax",
+        "Inflation",
+        "interest_rate",
+        "employment_rate",
+        "exchange_rate",
+        "consumer_spending",
+        "retail_sales",
+        "oil_prices",
+        "weather",
+    ]
+
+    # Check if the column name matches any of the keywords for Internal or Exogenous categories
+    for keyword in internal_keywords:
+        if keyword.lower() in column_name.lower():
+            return "Internal"
+    for keyword in exogenous_keywords:
+        if keyword.lower() in column_name.lower():
+            return "Exogenous"
+
+    # Default to Media if no match found
+    return "Media"
+
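
For example (the column names here are hypothetical):

print(categorize_column("product_price_index"))   # Internal  ("price" keyword)
print(categorize_column("gdp_growth"))            # Exogenous ("gdp" keyword)
print(categorize_column("facebook_impressions"))  # Media     (no keyword match)
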
+# Function to calculate missing stats and prepare for editable DataFrame
+@st.cache_resource(show_spinner=False)
+def prepare_missing_stats_df(df):
+    missing_stats = []
+    for column in df.columns:
+        if (
+            column == "date" or column == "DMA" or column == "Panel"
+        ):  # Skip Date, DMA and Panel column
+            continue
+
+        missing = df[column].isnull().sum()
+        pct_missing = round((missing / len(df)) * 100, 2)
+
+        # Dynamically assign category based on column name
+        # category = categorize_column(column)
+        category = "Media"
+
+        missing_stats.append(
+            {
+                "Column": column,
+                "Missing Values": missing,
+                "Missing Percentage": pct_missing,
+                "Impute Method": "Fill with 0",  # Default value
+                "Category": category,
+            }
+        )
+    stats_df = pd.DataFrame(missing_stats)
+
+    return stats_df
+
+
+# Function to add API DataFrame details to the files dictionary
+@st.cache_resource(show_spinner=False)
+def add_api_dataframe_to_dict(main_df, files_dict):
+    files_dict["API"] = {
+        "numeric": list(main_df.select_dtypes(include=["number"]).columns),
+        "non_numeric": [
+            col
+            for col in main_df.select_dtypes(exclude=["number"]).columns
+            if col.lower() != "date"
+        ],
+        "interval": determine_data_interval(
+            pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
+        ),
+        "df": main_df,
+    }
+
+    return files_dict
+
+
+# Function to read API data into a DataFrame, parsing specified columns as datetime
+@st.cache_resource(show_spinner=False)
+def read_API_data():
+    return pd.read_excel(r"upf_data_converted.xlsx", parse_dates=["Date"])
+
+
+# Function to set the 'DMA_Panel_Selected' session state variable to False
+def set_DMA_Panel_Selected_false():
+    st.session_state["DMA_Panel_Selected"] = False
+
+
+# Initialize 'final_df' in session state
+if "final_df" not in st.session_state:
+    st.session_state["final_df"] = pd.DataFrame()
+
+# Initialize 'bin_dict' in session state
+if "bin_dict" not in st.session_state:
+    st.session_state["bin_dict"] = {}
+
+# Initialize 'DMA_Panel_Selected' in session state
+if "DMA_Panel_Selected" not in st.session_state:
+    st.session_state["DMA_Panel_Selected"] = False
+
+# Page Title
+st.write("")  # Top padding
+st.title("Data Import")
+
+#########################################################################################################################################################
+# Create a dictionary to hold all DataFrames and collect user input to specify "DMA" and "Panel" columns for each file
+#########################################################################################################################################################
+
+# Read the Excel file, parsing 'Date' column as datetime
+main_df = read_API_data()
+
+# Convert all column names to lowercase
+main_df.columns = main_df.columns.str.lower().str.strip()
+
+# File uploader
+uploaded_files = st.file_uploader(
+    "Upload additional data",
+    type=["xlsx"],
+    accept_multiple_files=True,
+    on_change=set_DMA_Panel_Selected_false,
+)
+
+# Custom HTML for upload instructions
+recommendation_html = f"""
+<div style="text-align: justify;">
+    <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets including DMA, Panel, media, internal, and exogenous data adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
+</div>
+"""
+st.markdown(recommendation_html, unsafe_allow_html=True)
+
+# Choose Date Granularity
+st.markdown("#### Choose Date Granularity")
+# Granularity Selection
+granularity_selection = st.selectbox(
+    "Choose Date Granularity",
+    ["Daily", "Weekly", "Monthly"],
+    label_visibility="collapsed",
+    on_change=set_DMA_Panel_Selected_false,
+)
+granularity_selection = str(granularity_selection).lower()
+
+# Convert files to dataframes
+files_dict = files_to_dataframes(uploaded_files)
+
+# Add API Dataframe
+if main_df is not None:
+    files_dict = add_api_dataframe_to_dict(main_df, files_dict)
+
+# Display a warning message if no files have been uploaded and halt further execution
+if not files_dict:
+    st.warning(
+        "Please upload at least one file to proceed.",
+        icon="⚠️",
+    )
+    st.stop()  # Halts further execution until file is uploaded
+
+
+# Select DMA and Panel columns
+st.markdown("#### Select DMA and Panel columns")
+selections = {}
+with st.expander("Select DMA and Panel columns", expanded=False):
+    count = 0  # Initialize counter to manage the visibility of labels and keys
+    for file_name, file_data in files_dict.items():
+        # Determine visibility of the label based on the count
+        if count == 0:
+            label_visibility = "visible"
+        else:
+            label_visibility = "collapsed"
+
+        # Extract non-numeric columns
+        non_numeric_cols = file_data["non_numeric"]
+
+        # Prepare DMA and Panel values for dropdown, adding "N/A" as an option
+        dma_values = non_numeric_cols + ["N/A"]
+        panel_values = non_numeric_cols + ["N/A"]
+
+        # Skip if only one option is available
+        if len(dma_values) == 1 and len(panel_values) == 1:
+            selected_dma, selected_panel = "N/A", "N/A"
+            # Update the selections for DMA and Panel for the current file
+            selections[file_name] = {
+                "DMA": selected_dma,
+                "Panel": selected_panel,
+            }
+            continue
+
+        # Create layout columns for File Name, DMA, and Panel selections
+        file_name_col, DMA_col, Panel_col = st.columns([2, 4, 4])
+
+        with file_name_col:
+            # Display "File Name" label only for the first file
+            if count == 0:
+                st.write("File Name")
             else:
+                st.write("")
+            st.write(file_name)  # Display the file name
+
+        with DMA_col:
+            # Display a selectbox for DMA values
+            selected_dma = st.selectbox(
+                "Select DMA",
+                dma_values,
+                on_change=set_DMA_Panel_Selected_false,
+                label_visibility=label_visibility,  # Control visibility of the label
+                key=f"DMA_selectbox{count}",  # Ensure unique key for each selectbox
+            )
+
+        with Panel_col:
+            # Display a selectbox for Panel values
+            selected_panel = st.selectbox(
+                "Select Panel",
+                panel_values,
+                on_change=set_DMA_Panel_Selected_false,
+                label_visibility=label_visibility,  # Control visibility of the label
+                key=f"Panel_selectbox{count}",  # Ensure unique key for each selectbox
+            )
+
+        # Skip processing if the same column is selected for both Panel and DMA due to potential data integrity issues
+        if selected_panel == selected_dma and not (
+            selected_panel == "N/A" and selected_dma == "N/A"
+        ):
+            st.warning(
+                f"File: {file_name} → The same column cannot serve as both Panel and DMA. Please adjust your selections.",
+            )
+            selected_dma, selected_panel = "N/A", "N/A"
+            st.stop()
+
+        # Update the selections for DMA and Panel for the current file
+        selections[file_name] = {
+            "DMA": selected_dma,
+            "Panel": selected_panel,
+        }
+
+        count += 1  # Increment the counter after processing each file
+
+# Accept DMA and Panel selection
+if st.button("Accept and Process", use_container_width=True):
+
+    # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
+    with st.spinner("Processing..."):
+        files_dict = standardize_data_to_daily(files_dict, selections)
+
+        # Convert all data to the selected level of granularity
+        files_dict = apply_granularity_to_all(
+            files_dict, granularity_selection, selections
         )
+
+        st.session_state["files_dict"] = files_dict
+        st.session_state["DMA_Panel_Selected"] = True
+
+
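
The order here matters: every file is first flattened to daily, then rebuilt at the chosen granularity, so mixed weekly and monthly uploads land on one shared calendar. A sketch of the two passes using the helper above, assuming pandas is imported and the values are illustrative:

weekly = pd.DataFrame({"date": [pd.Timestamp("2024-01-01")], "spend": [700.0]})
daily = adjust_dataframe_granularity(weekly, "weekly", "daily")     # 7 rows of 100.0
monthly = adjust_dataframe_granularity(daily, "daily", "monthly")   # one 2024-01-01 row, spend 700.0
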
+#########################################################################################################################################################
+# Display unique DMA and Panel values
+#########################################################################################################################################################
+
+
+# Halts further execution until DMA and Panel columns are selected
+if "files_dict" in st.session_state and st.session_state["DMA_Panel_Selected"]:
+    files_dict = st.session_state["files_dict"]
+else:
+    st.stop()
+
+# Set to store unique values of DMA and Panel
+with st.spinner("Fetching DMA and Panel values..."):
+    all_dma_values, all_panel_values = clean_and_extract_unique_values(
+        files_dict, selections
+    )
+
+# List of DMA and Panel columns unique values
+list_of_all_dma_values = list(all_dma_values)
+list_of_all_panel_values = list(all_panel_values)
+
+# Format DMA and Panel values for display
+formatted_dma_values = format_values_for_display(list_of_all_dma_values)
+formatted_panel_values = format_values_for_display(list_of_all_panel_values)
+
+# Unique DMA and Panel values
+st.markdown("#### Unique DMA and Panel values")
+# Display DMA and Panel values
+with st.expander("Unique DMA and Panel values"):
+    st.write("")
+    st.markdown(
+        f"""
+        <style>
+        .justify-text {{
+            text-align: justify;
+        }}
+        </style>
+        <div class="justify-text">
+            <strong>Panel Values:</strong> {formatted_panel_values}<br>
+            <strong>DMA Values:</strong> {formatted_dma_values}
+        </div>
+        """,
+        unsafe_allow_html=True,
     )
 
+    # Display total DMA and Panel
+    st.write("")
+    st.markdown(
+        f"""
+        <div style="text-align: justify;">
+            <strong>Number of DMAs detected:</strong> {len(list_of_all_dma_values)}<br>
+            <strong>Number of Panels detected:</strong> {len(list_of_all_panel_values)}
+        </div>
+        """,
+        unsafe_allow_html=True,
+    )
+st.write("")
+
+#########################################################################################################################################################
+# Merge all DataFrames
+#########################################################################################################################################################
+
+# Merge all DataFrames selected
+main_df = create_main_dataframe(
+    files_dict, all_dma_values, all_panel_values, granularity_selection
+)
+merged_df = merge_into_main_df(main_df, files_dict, selections)
+
+# # Display the merged DataFrame
+# st.markdown("#### Merged DataFrame based on selected DMA and Panel")
+# st.dataframe(merged_df)
+
+
+#########################################################################################################################################################
+# Categorize Variables and Impute Missing Values
+#########################################################################################################################################################
+
+
+# Create an editable DataFrame in Streamlit
+st.markdown("#### Select Variables Category & Impute Missing Values")
+
+# Prepare missing stats DataFrame for editing
+missing_stats_df = prepare_missing_stats_df(merged_df)
+
+edited_stats_df = st.data_editor(
+    missing_stats_df,
+    column_config={
+        "Impute Method": st.column_config.SelectboxColumn(
+            options=[
+                "Drop Column",
+                "Fill with Mean",
+                "Fill with Median",
+                "Fill with 0",
+            ],
+            required=True,
+            default="Fill with 0",
+        ),
+        "Category": st.column_config.SelectboxColumn(
+            options=[
+                "Media",
+                "Exogenous",
+                "Internal",
+                "Response_Metric"
+            ],
+            required=True,
+            default="Media",
+        ),
+    },
+    disabled=["Column", "Missing Values", "Missing Percentage"],
+    hide_index=True,
+    use_container_width=True,
+)
+
+# Apply changes based on edited DataFrame
+for i, row in edited_stats_df.iterrows():
+    column = row["Column"]
+    if row["Impute Method"] == "Drop Column":
+        merged_df.drop(columns=[column], inplace=True)
+
+    elif row["Impute Method"] == "Fill with Mean":
+        merged_df[column].fillna(merged_df[column].mean(), inplace=True)
+
+    elif row["Impute Method"] == "Fill with Median":
+        merged_df[column].fillna(merged_df[column].median(), inplace=True)
+
+    elif row["Impute Method"] == "Fill with 0":
+        merged_df[column].fillna(0, inplace=True)
+
+# Display the Final DataFrame and exogenous variables
+st.markdown("#### Final DataFrame")
+final_df = merged_df
+st.dataframe(final_df, hide_index=True)
+
+# Initialize an empty dictionary to hold categories and their variables
+category_dict = {}
+
+# Iterate over each row in the edited DataFrame to populate the dictionary
+for i, row in edited_stats_df.iterrows():
+    column = row["Column"]
+    category = row["Category"]  # The category chosen by the user for this variable
+
+    # Check if the category already exists in the dictionary
+    if category not in category_dict:
+        # If not, initialize it with the current column as its first element
+        category_dict[category] = [column]
+    else:
+        # If it exists, append the current column to the list of variables under this category
+        category_dict[category].append(column)
+
+# Add Date, DMA and Panel in category dictionary
+category_dict.update({"Date": ["date"]})
+if "DMA" in final_df.columns:
+    category_dict["DMA"] = ["DMA"]
+
+if "Panel" in final_df.columns:
+    category_dict["Panel"] = ["Panel"]
+
+# Display the dictionary
+st.markdown("#### Variable Category")
+for category, variables in category_dict.items():
+    # Check if there are multiple variables to handle "and" insertion correctly
+    if len(variables) > 1:
+        # Join all but the last variable with ", ", then add " and " before the last variable
+        variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
+    else:
+        # If there's only one variable, no need for "and"
+        variables_str = variables[0]
+
+    # Display the category and its variables in the desired format
+    st.markdown(
+        f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
+        unsafe_allow_html=True,
+    )
+
+# Store final dataframe and bin dictionary into session state
+st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict
+
+if st.button('Save Changes'):
+
+    with open("Pickle_files/main_df", 'wb') as f:
+        pickle.dump(st.session_state["final_df"], f)
+    with open("Pickle_files/category_dict", 'wb') as c:
+        pickle.dump(st.session_state["bin_dict"], c)
+    st.success('Changes Saved!')
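
A quick sketch of reading these pickles back on a later page, assuming the same relative paths:

import pickle

with open("Pickle_files/main_df", "rb") as f:
    final_df = pickle.load(f)
with open("Pickle_files/category_dict", "rb") as c:
    bin_dict = pickle.load(c)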