BlendMMM committed on
Commit 95590cf · verified · 1 Parent(s): 1bc259c

Update Data_Import.py

Files changed (1)
  1. Data_Import.py +846 -211
Data_Import.py CHANGED
@@ -1,79 +1,58 @@
1
  # Importing necessary libraries
2
  import streamlit as st
3
- import pickle
4
 
5
  st.set_page_config(
6
- page_title="Model Build",
7
  page_icon=":shark:",
8
  layout="wide",
9
  initial_sidebar_state="collapsed",
10
  )
11
 
12
- from utilities import load_authenticator
13
- import numpy as np
14
  import pandas as pd
15
  from utilities import set_header, load_local_css
 
 
 
16
 
17
  load_local_css("styles.css")
18
  set_header()
19
 
20
 
21
  for k, v in st.session_state.items():
22
- if k not in ['logout', 'login','config'] and not k.startswith('FormSubmitter'):
 
 
23
  st.session_state[k] = v
24
-
25
- authenticator = st.session_state.get('authenticator')
26
- if authenticator is None:
27
- authenticator = load_authenticator()
28
-
29
- name, authentication_status, username = authenticator.login('Login', 'main')
30
- auth_status = st.session_state.get('authentication_status')
 
 
 
 
 
 
31
 
32
  if auth_status == True:
33
- is_state_initiaized = st.session_state.get('initialized',False)
34
- if not is_state_initiaized:
35
- a=1
36
-
37
 
38
- # Function to expand dataframe to daily
39
- @st.cache_resource(show_spinner=False)
40
- def expand_to_daily(df, granularity, start_date, end_date):
41
- # Create a new DataFrame with a row for each day
42
- all_dates = pd.date_range(start=start_date, end=end_date, freq="D")
43
- daily_df = pd.DataFrame(all_dates, columns=["Date"])
44
-
45
- if granularity == "daily":
46
- # For daily data, simply merge to fill missing dates
47
- daily_df = daily_df.merge(df, on="Date", how="left")
48
- else:
49
- # For weekly or monthly, distribute values to daily rows
50
- for column in df.columns:
51
- if column != "Date": # Skip 'Date' column
52
- daily_df[column] = np.nan # Initialize with NaNs
53
-
54
- # Group by the required frequency and distribute values
55
- freq = "W-MON" if granularity == "weekly" else "MS"
56
- for _, group in df.groupby(pd.Grouper(key="Date", freq=freq)):
57
- num_days = len(
58
- pd.date_range(group["Date"].min(), group["Date"].max(), freq="D")
59
- )
60
- for column in group.columns:
61
- if column == "Date": # Skip 'Date' column
62
- continue
63
- value = group[column].sum() / num_days
64
- date_range = pd.date_range(
65
- group["Date"].min(), periods=num_days, freq="D"
66
- )
67
- daily_df.loc[daily_df["Date"].isin(date_range), column] = value
68
-
69
- return daily_df
70
 
71
 
72
- # Function to validate date column in dataframe
73
  def validate_date_column(df):
74
  try:
75
  # Attempt to convert the 'Date' column to datetime
76
- df["Date"] = pd.to_datetime(df["Date"], format="%d-%m-%Y")
77
  return True
78
  except:
79
  return False
@@ -91,196 +70,786 @@ if auth_status == True:
91
  return "irregular"
92
 
93
 
94
- # Function to convert and fill dates in dataframe
95
- def convert_and_fill_dates(df, start_date, end_date, interval):
96
- # Create a date range for the desired period
97
- all_dates = pd.date_range(start=start_date, end=end_date, freq="D")
98
- new_df = pd.DataFrame(all_dates, columns=["Date"])
99
 
100
- # Preprocess and aggregate data based on the original interval
101
- if interval != "daily":
102
- # Resample to start of each week/month, then sum values for the same period
103
- if interval == "weekly":
104
- df = df.resample("W-MON", on="Date").sum().reset_index()
105
- elif interval == "monthly":
106
- df = df.resample("MS", on="Date").sum().reset_index()
107
 
108
- # Distribute values equally across the days in each week/month
109
- expanded_rows = []
110
- for _, row in df.iterrows():
111
- if interval == "weekly":
112
- period_dates = pd.date_range(row["Date"], periods=7)
113
- elif interval == "monthly":
114
- period_end = row["Date"] + pd.offsets.MonthEnd(1)
115
- period_dates = pd.date_range(row["Date"], period_end)
116
-
117
- for date in period_dates:
118
- new_row = row.copy()
119
- new_row["Date"] = date
120
- for col in df.columns:
121
- if col != "Date": # Skip 'Date' column
122
- new_row[col] = row[col] / len(period_dates)
123
- expanded_rows.append(new_row)
124
 
125
- # Create a DataFrame from expanded rows
126
- expanded_df = pd.DataFrame(expanded_rows)
127
- new_df = pd.merge(new_df, expanded_df, how="left", on="Date")
128
- else:
129
- # Daily data, aggregate if there are multiple entries for the same day
130
- df = df.groupby("Date").sum().reset_index()
131
- new_df = pd.merge(new_df, df, how="left", on="Date")
132
 
133
- # Ensure all dates from start to end are present, filling missing values with NaN
134
- new_df["Date"] = pd.to_datetime(new_df["Date"]) # Ensure 'Date' is datetime type
135
- new_df = new_df.set_index("Date").reindex(all_dates).reset_index()
136
- new_df.rename(columns={"index": "Date"}, inplace=True)
137
 
138
- return new_df
 
139
 
 
 
 
 
 
 
 
140
 
141
- # Function to convert a DataFrame from daily level granularity to either weekly or monthly level
142
- def convert_to_higher_granularity(df, required_granularity):
143
- if required_granularity == "daily":
144
- return df
 
 
 
145
 
146
- # Ensure 'Date' is the index and is in datetime format
147
- if not pd.api.types.is_datetime64_any_dtype(df["Date"]):
148
- df["Date"] = pd.to_datetime(df["Date"])
149
- df.set_index("Date", inplace=True)
 
 
 
 
 
 
 
 
150
 
151
- # Resample and aggregate
152
- if required_granularity == "weekly":
153
- # Resample to weekly, using 'W-MON' to indicate weeks starting on Monday
154
- df = df.resample("W-MON").sum()
155
- elif required_granularity == "monthly":
156
- # Resample to monthly, using 'MS' to indicate month start
157
- df = df.resample("MS").sum()
 
 
 
158
 
159
- # Reset index to move 'Date' back to a column
160
- df.reset_index(inplace=True)
 
 
161
 
162
- return df
 
163
 
 
 
 
 
 
 
 
 
 
 
164
 
165
- # # Read the CSV file, parsing 'Date' column as datetime
166
- main_df = pd.read_csv("Media_data_for_model_dma_level.csv", dayfirst=True, parse_dates=["Date"])
167
- # st.write(main_df)
 
 
 
168
 
169
- # Get the start date (minimum) and end date (maximum) from the 'Date' column
170
- api_start_date = main_df["Date"].min()
171
- api_end_date = main_df["Date"].max()
172
 
173
- # Infer the granularity from the most common difference between consecutive dates
174
- date_diffs = main_df["Date"].diff().dt.days.dropna()
175
- common_diff = date_diffs.mode()[0]
176
- api_granularity = determine_data_interval(common_diff)
177
 
178
- # Convert the DataFrame to daily level granularity
179
- main_df = expand_to_daily(main_df, api_granularity, api_start_date, api_end_date)
180
 
181
- # Page Title
182
- st.title("Data Import")
183
 
184
- # File uploader
185
- uploaded_files = st.file_uploader(
186
- "Upload additional data", type=["xlsx"], accept_multiple_files=True
187
- )
188
 
189
- # Custom HTML for upload instructions
190
- recommendation_html = f"""
191
- <div style="text-align: justify;">
192
- <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets including media, internal, and exogenous data adhere to the following guidelines: Each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code>, be free of missing values, and aggregated to a {api_granularity} level.
193
- </div>
194
- """
195
 
196
- st.markdown(recommendation_html, unsafe_allow_html=True)
 
197
 
198
- # Initialize a list to collect all processed DataFrames
199
- all_data_dfs = []
 
200
 
201
- if uploaded_files:
202
- for uploaded_file in uploaded_files:
203
- # Extract the file name
204
- file_name = uploaded_file.name
 
 
 
 
 
 
205
 
206
- # Load the file into a DataFrame
207
- data_df = pd.read_excel(
208
- uploaded_file,
209
- )
 
 
 
 
 
 
210
 
211
- # Identify numeric columns in the DataFrame
212
- numeric_columns = data_df.select_dtypes(include="number").columns.tolist()
213
 
214
- # Validate the 'Date' column and ensure there's at least one numeric column
215
- if validate_date_column(data_df) and len(numeric_columns) > 0:
216
- data_df = data_df[["Date"] + numeric_columns]
217
 
218
- # Ensure the 'Date' column is in datetime format and sorted
219
- data_df["Date"] = pd.to_datetime(data_df["Date"], dayfirst=True)
220
- data_df.sort_values("Date", inplace=True)
221
 
222
- # Calculate the most common day difference between dates to determine frequency
223
- common_freq = data_df["Date"].diff().dt.days.dropna().mode()[0]
224
 
225
- # Calculate the data interval (daily, weekly, monthly or irregular)
226
- interval = determine_data_interval(common_freq)
227
 
228
- if interval == "irregular":
229
- # Warn the user if the 'Date' column doesn't meet the format requirements
230
- st.warning(
231
- f"File Name: {file_name} Please upload data in daily, weekly or monthly interval."
232
- )
233
- continue
 
 
 
234
 
235
- # Convert data to specified interval and redistribute to daily
236
- data_df = convert_and_fill_dates(
237
- data_df, api_start_date, api_end_date, interval
238
- )
239
 
240
- # Add the processed DataFrame to the list
241
- all_data_dfs.append(data_df)
 
 
 
242
 
 
 
243
  else:
244
- # Warn the user if the 'Date' column doesn't meet the format requirements
245
- st.warning(
246
- f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column."
247
  )
248
-
249
- # Sequentially merge each of the other DataFrames with the main DataFrame on 'Date'
250
- for df in all_data_dfs:
251
- main_df = pd.merge(main_df, df, on="Date", how="left")
 
 
 
252
 
253
 
254
  # Function to calculate missing stats and prepare for editable DataFrame
 
 
 
255
  def prepare_missing_stats_df(df):
256
  missing_stats = []
257
  for column in df.columns:
258
  if (
259
- column == "Date" or column == "Total Approved Accounts - Revenue"
260
- ): # Skip Date and Revenue column
261
  continue
262
 
263
  missing = df[column].isnull().sum()
264
  pct_missing = round((missing / len(df)) * 100, 2)
 
 
 
 
 
265
  missing_stats.append(
266
  {
267
  "Column": column,
268
  "Missing Values": missing,
269
  "Missing Percentage": pct_missing,
270
  "Impute Method": "Fill with 0", # Default value
271
- "Category": "Media", # Default value
272
  }
273
  )
274
  stats_df = pd.DataFrame(missing_stats)
 
275
  return stats_df
276
 
277
 
278
- # Prepare missing stats DataFrame for editing
279
- missing_stats_df = prepare_missing_stats_df(main_df)
 
 
 
280
 
281
  # Create an editable DataFrame in Streamlit
282
  st.markdown("#### Select Variables Category & Impute Missing Values")
283
 
 
 
 
284
  edited_stats_df = st.data_editor(
285
  missing_stats_df,
286
  column_config={
@@ -296,12 +865,10 @@ if auth_status == True:
296
  ),
297
  "Category": st.column_config.SelectboxColumn(
298
  options=[
299
- "Date",
300
  "Media",
301
  "Exogenous",
302
  "Internal",
303
- "DMA/Panel",
304
- "Response_Metric"
305
  ],
306
  required=True,
307
  default="Media",
@@ -312,31 +879,84 @@ if auth_status == True:
312
  use_container_width=True,
313
  )
314
 
315
-
316
  # Apply changes based on edited DataFrame
317
  for i, row in edited_stats_df.iterrows():
318
  column = row["Column"]
319
  if row["Impute Method"] == "Drop Column":
320
- main_df.drop(columns=[column], inplace=True)
321
 
322
  elif row["Impute Method"] == "Fill with Mean":
323
- main_df[column].fillna(main_df[column].mean(), inplace=True)
324
 
325
  elif row["Impute Method"] == "Fill with Median":
326
- main_df[column].fillna(main_df[column].median(), inplace=True)
327
 
328
  elif row["Impute Method"] == "Fill with 0":
329
- main_df[column].fillna(0, inplace=True)
 
 
 
 
 
330
 
331
 
332
- # Convert the Final DataFrame to required granularity
333
- main_df = convert_to_higher_granularity(main_df, api_granularity)
334
 
335
- # Display the Final DataFrame and exogenous variables
336
- st.markdown("#### Final DataFrame:")
337
- st.dataframe(main_df)
338
-
339
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
 
341
  # Initialize an empty dictionary to hold categories and their variables
342
  category_dict = {}
@@ -354,8 +974,15 @@ if auth_status == True:
354
  # If it exists, append the current column to the list of variables under this category
355
  category_dict[category].append(column)
356
 
 
 
 
 
 
 
 
357
  # Display the dictionary
358
- st.markdown("#### Variable Category:")
359
  for category, variables in category_dict.items():
360
  # Check if there are multiple variables to handle "and" insertion correctly
361
  if len(variables) > 1:
@@ -366,19 +993,27 @@ if auth_status == True:
366
  variables_str = variables[0]
367
 
368
  # Display the category and its variables in the desired format
369
- st.markdown(f"**{category}:** {variables_str}\n\n", unsafe_allow_html=True)
370
-
371
- # storing maindf and categories in session_state
372
- # st.write(main_df)
373
-
374
-
375
- # st.session_state['Cleaned_data']=main_df
376
-
377
- # st.session_state['category_dict']=category_dict
378
- if st.button('Save Changes'):
379
-
380
- with open("Pickle_files/main_df", 'wb') as f:
381
- pickle.dump(main_df, f)
382
- with open("Pickle_files/category_dict",'wb') as c:
383
- pickle.dump(category_dict,c)
384
- st.success('Changes Saved!')
 
 
 
 
 
 
 
 
 
1
  # Importing necessary libraries
2
  import streamlit as st
 
3
 
4
  st.set_page_config(
5
+ page_title="Data Import",
6
  page_icon=":shark:",
7
  layout="wide",
8
  initial_sidebar_state="collapsed",
9
  )
10
 
11
+ import pickle
 
12
  import pandas as pd
13
  from utilities import set_header, load_local_css
14
+ import streamlit_authenticator as stauth
15
+ import yaml
16
+ from yaml import SafeLoader
17
 
18
  load_local_css("styles.css")
19
  set_header()
20
 
21
 
22
  for k, v in st.session_state.items():
23
+ if k not in ["logout", "login", "config"] and not k.startswith(
24
+ "FormSubmitter"
25
+ ):
26
  st.session_state[k] = v
27
+ with open("config.yaml") as file:
28
+ config = yaml.load(file, Loader=SafeLoader)
29
+ st.session_state["config"] = config
30
+ authenticator = stauth.Authenticate(
31
+ config["credentials"],
32
+ config["cookie"]["name"],
33
+ config["cookie"]["key"],
34
+ config["cookie"]["expiry_days"],
35
+ config["preauthorized"],
36
+ )
37
+ st.session_state["authenticator"] = authenticator
38
+ name, authentication_status, username = authenticator.login("Login", "main")
39
+ auth_status = st.session_state.get("authentication_status")
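For context, the stauth.Authenticate(...) call above expects config.yaml to provide credentials, cookie settings, and a preauthorized list. A rough sketch of that structure, expressed in Python and written out with yaml.dump (the key layout follows the usual streamlit-authenticator convention; every name and secret below is a placeholder, not from this repo):

import yaml

# Placeholder structure only -- not the project's real config.yaml.
config = {
    "credentials": {
        "usernames": {
            "jdoe": {
                "email": "jdoe@example.com",
                "name": "Jane Doe",
                "password": "<bcrypt-hashed password>",
            }
        }
    },
    "cookie": {"name": "mmm_app", "key": "<signature key>", "expiry_days": 30},
    "preauthorized": {"emails": ["admin@example.com"]},
}

with open("config.yaml", "w") as file:
    yaml.dump(config, file, default_flow_style=False)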
40
 
41
  if auth_status == True:
42
+ authenticator.logout("Logout", "main")
43
+ is_state_initiaized = st.session_state.get("initialized", False)
 
 
44
 
45
+ if not is_state_initiaized:
46
+
47
+ if 'session_name' not in st.session_state:
48
+ st.session_state['session_name']=None
 
 
 
49
 
50
 
51
+ # Function to validate date column in dataframe
52
  def validate_date_column(df):
53
  try:
54
  # Attempt to convert the 'Date' column to datetime
55
+ df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
56
  return True
57
  except:
58
  return False
 
70
  return "irregular"
71
 
72
 
73
+ # Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
74
+ st.cache_resource(show_spinner=False)
 
 
 
75
 
 
 
 
 
 
 
 
76
 
77
+ def files_to_dataframes(uploaded_files):
78
+ df_dict = {}
79
+ for uploaded_file in uploaded_files:
80
+ # Extract file name without extension
81
+ file_name = uploaded_file.name.rsplit(".", 1)[0]
 
 
 
 
 
 
 
 
 
 
 
82
 
83
+ # Check for duplicate file names
84
+ if file_name in df_dict:
85
+ st.warning(
86
+ f"Duplicate File: {file_name}. This file will be skipped.",
87
+ icon="⚠️",
88
+ )
89
+ continue
90
 
91
+ # Read the file into a DataFrame
92
+ df = pd.read_excel(uploaded_file)
 
 
93
 
94
+ # Convert all column names to lowercase
95
+ df.columns = df.columns.str.lower().str.strip()
96
 
97
+ # Separate numeric and non-numeric columns
98
+ numeric_cols = list(df.select_dtypes(include=["number"]).columns)
99
+ non_numeric_cols = [
100
+ col
101
+ for col in df.select_dtypes(exclude=["number"]).columns
102
+ if col.lower() != "date"
103
+ ]
104
 
105
+ # Check for 'Date' column
106
+ if not (validate_date_column(df) and len(numeric_cols) > 0):
107
+ st.warning(
108
+ f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
109
+ icon="⚠️",
110
+ )
111
+ continue
112
 
113
+ # Check for interval
114
+ common_freq = (
115
+ pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
116
+ )
117
+ # Calculate the data interval (daily, weekly, monthly or irregular)
118
+ interval = determine_data_interval(common_freq)
119
+ if interval == "irregular":
120
+ st.warning(
121
+ f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
122
+ icon="⚠️",
123
+ )
124
+ continue
125
 
126
+ # Store both DataFrames in the dictionary under their respective keys
127
+ df_dict[file_name] = {
128
+ "numeric": numeric_cols,
129
+ "non_numeric": non_numeric_cols,
130
+ "interval": interval,
131
+ "df": df,
132
+ }
133
+
134
+ return df_dict
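Illustrative only (not part of the commit): the per-file record that files_to_dataframes returns, shown with invented column names:

import pandas as pd

example_df = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=3, freq="W-MON"),
    "tv_spend": [100.0, 120.0, 90.0],       # numeric column
    "market": ["north", "south", "north"],  # non-numeric column (candidate panel)
})

files_dict_example = {
    "media_spend": {                 # file name without extension is the key
        "numeric": ["tv_spend"],
        "non_numeric": ["market"],
        "interval": "weekly",
        "df": example_df,
    }
}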
135
+
136
+
137
+ # Function to adjust dataframe granularity
138
+ def adjust_dataframe_granularity(df, current_granularity, target_granularity):
139
+ # Set index
140
+ df.set_index("date", inplace=True)
141
+
142
+ # Define aggregation rules for resampling
143
+ aggregation_rules = {
144
+ col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
145
+ for col in df.columns
146
+ }
147
+
148
+ # Initialize resampled_df
149
+ resampled_df = df
150
+ if current_granularity == "daily" and target_granularity == "weekly":
151
+ resampled_df = df.resample("W-MON", closed="left", label="left").agg(
152
+ aggregation_rules
153
+ )
154
 
155
+ elif current_granularity == "daily" and target_granularity == "monthly":
156
+ resampled_df = df.resample("MS", closed="left", label="left").agg(
157
+ aggregation_rules
158
+ )
159
 
160
+ elif current_granularity == "daily" and target_granularity == "daily":
161
+ resampled_df = df.resample("D").agg(aggregation_rules)
162
 
163
+ elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
164
+ # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
165
+ expanded_data = []
166
+ for _, row in df.iterrows():
167
+ if current_granularity == "weekly":
168
+ period_range = pd.date_range(start=row.name, periods=7)
169
+ elif current_granularity == "monthly":
170
+ period_range = pd.date_range(
171
+ start=row.name, periods=row.name.days_in_month
172
+ )
173
 
174
+ for date in period_range:
175
+ new_row = {}
176
+ for col in df.columns:
177
+ if pd.api.types.is_numeric_dtype(df[col]):
178
+ if current_granularity == "weekly":
179
+ new_row[col] = row[col] / 7
180
+ elif current_granularity == "monthly":
181
+ new_row[col] = row[col] / row.name.days_in_month
182
+ else:
183
+ new_row[col] = row[col]
184
+ expanded_data.append((date, new_row))
185
+
186
+ resampled_df = pd.DataFrame(
187
+ [data for _, data in expanded_data],
188
+ index=[date for date, _ in expanded_data],
189
+ )
190
 
191
+ # Reset index
192
+ resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})
 
193
 
194
+ return resampled_df
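A hedged usage sketch (assumes adjust_dataframe_granularity as defined above is in scope; column names are invented). Aggregating daily data up to weekly sums the numeric columns per W-MON bucket and keeps the first value of the non-numeric ones:

import pandas as pd

daily = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=14, freq="D"),
    "spend": [10.0] * 14,
    "market": ["north"] * 14,
})

weekly = adjust_dataframe_granularity(daily.copy(), "daily", "weekly")
# Numeric columns are summed within each weekly bucket; non-numeric columns
# keep their first value, so the panel identifier survives the aggregation.
print(weekly)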
 
 
 
195
 
 
 
196
 
197
+ # Function to clean and extract unique values of Panel_1 and Panel_2
198
+ st.cache_resource(show_spinner=False)
199
 
 
 
 
 
200
 
201
+ def clean_and_extract_unique_values(files_dict, selections):
202
+ all_panel1_values = set()
203
+ all_panel2_values = set()
 
 
 
204
 
205
+ for file_name, file_data in files_dict.items():
206
+ df = file_data["df"]
207
 
208
+ # 'Panel_1' and 'Panel_2' selections
209
+ selected_panel1 = selections[file_name].get("Panel_1")
210
+ selected_panel2 = selections[file_name].get("Panel_2")
211
 
212
+ # Clean and standardize Panel_1 column if it exists and is selected
213
+ if (
214
+ selected_panel1
215
+ and selected_panel1 != "N/A"
216
+ and selected_panel1 in df.columns
217
+ ):
218
+ df[selected_panel1] = (
219
+ df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
220
+ )
221
+ all_panel1_values.update(df[selected_panel1].dropna().unique())
222
 
223
+ # Clean and standardize Panel_2 column if it exists and is selected
224
+ if (
225
+ selected_panel2
226
+ and selected_panel2 != "N/A"
227
+ and selected_panel2 in df.columns
228
+ ):
229
+ df[selected_panel2] = (
230
+ df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
231
+ )
232
+ all_panel2_values.update(df[selected_panel2].dropna().unique())
233
 
234
+ # Update the processed DataFrame back in the dictionary
235
+ files_dict[file_name]["df"] = df
236
 
237
+ return all_panel1_values, all_panel2_values
 
 
238
 
 
 
 
239
 
240
+ # Function to format values for display
241
+ st.cache_resource(show_spinner=False)
242
 
 
 
243
 
244
+ def format_values_for_display(values_list):
245
+ # Capitalize the first letter of each word and replace underscores with spaces
246
+ formatted_list = [value.replace("_", " ").title() for value in values_list]
247
+ # Join values with commas and 'and' before the last value
248
+ if len(formatted_list) > 1:
249
+ return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
250
+ elif formatted_list:
251
+ return formatted_list[0]
252
+ return "No values available"
253
 
 
 
 
 
254
 
255
+ # Function to normalize all data within files_dict to a daily granularity
256
+ st.cache(show_spinner=False, allow_output_mutation=True)
257
+
258
+
259
+ def standardize_data_to_daily(files_dict, selections):
260
+ # Normalize all data to a daily granularity using a provided function
261
+ files_dict = apply_granularity_to_all(files_dict, "daily", selections)
262
+
263
+ # Update the "interval" attribute for each dataset to indicate the new granularity
264
+ for files_name, files_data in files_dict.items():
265
+ files_data["interval"] = "daily"
266
+
267
+ return files_dict
268
+
269
 
270
+ # Function to apply granularity transformation to all DataFrames in files_dict
271
+ st.cache_resource(show_spinner=False)
272
+
273
+
274
+ def apply_granularity_to_all(files_dict, granularity_selection, selections):
275
+ for file_name, file_data in files_dict.items():
276
+ df = file_data["df"].copy()
277
+
278
+ # Handling when Panel_1 or Panel_2 might be 'N/A'
279
+ selected_panel1 = selections[file_name].get("Panel_1")
280
+ selected_panel2 = selections[file_name].get("Panel_2")
281
+
282
+ # Correcting the segment selection logic & handling 'N/A'
283
+ if selected_panel1 != "N/A" and selected_panel2 != "N/A":
284
+ unique_combinations = df[
285
+ [selected_panel1, selected_panel2]
286
+ ].drop_duplicates()
287
+ elif selected_panel1 != "N/A":
288
+ unique_combinations = df[[selected_panel1]].drop_duplicates()
289
+ selected_panel2 = None # Ensure Panel_2 is ignored if N/A
290
+ elif selected_panel2 != "N/A":
291
+ unique_combinations = df[[selected_panel2]].drop_duplicates()
292
+ selected_panel1 = None # Ensure Panel_1 is ignored if N/A
293
  else:
294
+ # If both are 'N/A', process the entire dataframe as is
295
+ df = adjust_dataframe_granularity(
296
+ df, file_data["interval"], granularity_selection
297
  )
298
+ files_dict[file_name]["df"] = df
299
+ continue # Skip to the next file
300
+
301
+ transformed_segments = []
302
+ for _, combo in unique_combinations.iterrows():
303
+ if selected_panel1 and selected_panel2:
304
+ segment = df[
305
+ (df[selected_panel1] == combo[selected_panel1])
306
+ & (df[selected_panel2] == combo[selected_panel2])
307
+ ]
308
+ elif selected_panel1:
309
+ segment = df[df[selected_panel1] == combo[selected_panel1]]
310
+ elif selected_panel2:
311
+ segment = df[df[selected_panel2] == combo[selected_panel2]]
312
+
313
+ # Adjust granularity of the segment
314
+ transformed_segment = adjust_dataframe_granularity(
315
+ segment, file_data["interval"], granularity_selection
316
+ )
317
+ transformed_segments.append(transformed_segment)
318
+
319
+ # Combine all transformed segments into a single DataFrame for this file
320
+ transformed_df = pd.concat(transformed_segments, ignore_index=True)
321
+ files_dict[file_name]["df"] = transformed_df
322
+
323
+ return files_dict
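Hedged usage sketch (file and column names are invented; assumes the helpers above and a files_dict built by files_to_dataframes). The granularity change is applied per panel segment so each market keeps its own totals:

# selections maps each file to the columns chosen as Panel_1 / Panel_2 ("N/A" if none)
selections_example = {"media_spend": {"Panel_1": "market", "Panel_2": "N/A"}}

# files_dict["media_spend"]["df"] is split by market, each segment is resampled
# with adjust_dataframe_granularity, and the segments are concatenated back together.
files_dict = apply_granularity_to_all(files_dict, "weekly", selections_example)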
324
+
325
+
326
+ # Function to create main dataframe structure
327
+ st.cache_resource(show_spinner=False)
328
+
329
+
330
+ def create_main_dataframe(
331
+ files_dict, all_panel1_values, all_panel2_values, granularity_selection
332
+ ):
333
+ # Determine the global start and end dates across all DataFrames
334
+ global_start = min(df["df"]["date"].min() for df in files_dict.values())
335
+ global_end = max(df["df"]["date"].max() for df in files_dict.values())
336
+
337
+ # Adjust the date_range generation based on the granularity_selection
338
+ if granularity_selection == "weekly":
339
+ # Generate a weekly range, with weeks starting on Monday
340
+ date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
341
+ elif granularity_selection == "monthly":
342
+ # Generate a monthly range, starting from the first day of each month
343
+ date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
344
+ else: # Default to daily if not weekly or monthly
345
+ date_range = pd.date_range(start=global_start, end=global_end, freq="D")
346
+
347
+ # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
348
+ all_panel1s = all_panel1_values
349
+ all_panel2s = all_panel2_values
350
+
351
+ # Dynamically build the list of dimensions (Panel_1, Panel_2) to include in the main DataFrame based on availability
352
+ dimensions, merge_keys = [], []
353
+ if all_panel1s:
354
+ dimensions.append(all_panel1s)
355
+ merge_keys.append("Panel_1")
356
+ if all_panel2s:
357
+ dimensions.append(all_panel2s)
358
+ merge_keys.append("Panel_2")
359
+
360
+ dimensions.append(date_range) # Date range is always included
361
+ merge_keys.append("date") # Date range is always included
362
+
363
+ # Create a main DataFrame template with the dimensions
364
+ main_df = pd.MultiIndex.from_product(
365
+ dimensions,
366
+ names=[name for name, _ in zip(merge_keys, dimensions)],
367
+ ).to_frame(index=False)
368
+
369
+ return main_df.reset_index(drop=True)
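For intuition (illustrative, not from the commit): the scaffold built here is just the cross product of the panel values and the date range, which is exactly what pd.MultiIndex.from_product(...).to_frame(index=False) produces.

import pandas as pd

panels = ["north", "south"]
dates = pd.date_range("2024-01-01", "2024-01-15", freq="W-MON")

scaffold = pd.MultiIndex.from_product(
    [panels, dates], names=["Panel_1", "date"]
).to_frame(index=False)
print(scaffold)  # one row per (Panel_1, date) pair, ready for left joins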
370
+
371
+
372
+ # Function to prepare and merge DataFrames
373
+ st.cache_resource(show_spinner=False)
374
+
375
+
376
+ def merge_into_main_df(main_df, files_dict, selections):
377
+ for file_name, file_data in files_dict.items():
378
+ df = file_data["df"].copy()
379
+
380
+ # Rename selected Panel_1 and Panel_2 columns if not 'N/A'
381
+ selected_panel1 = selections[file_name].get("Panel_1", "N/A")
382
+ selected_panel2 = selections[file_name].get("Panel_2", "N/A")
383
+ if selected_panel1 != "N/A":
384
+ df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
385
+ if selected_panel2 != "N/A":
386
+ df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)
387
+
388
+ # Merge current DataFrame into main_df based on 'date', and where applicable, 'Panel_1' and 'Panel_2'
389
+ merge_keys = ["date"]
390
+ if "Panel_1" in df.columns:
391
+ merge_keys.append("Panel_1")
392
+ if "Panel_2" in df.columns:
393
+ merge_keys.append("Panel_2")
394
+ main_df = pd.merge(main_df, df, on=merge_keys, how="left")
395
+
396
+ # After all merges, sort by 'date' and reset index for cleanliness
397
+ sort_by = ["date"]
398
+ if "Panel_1" in main_df.columns:
399
+ sort_by.append("Panel_1")
400
+ if "Panel_2" in main_df.columns:
401
+ sort_by.append("Panel_2")
402
+ main_df.sort_values(by=sort_by, inplace=True)
403
+ main_df.reset_index(drop=True, inplace=True)
404
+
405
+ return main_df
406
+
407
+
408
+ # Function to categorize column
409
+ def categorize_column(column_name):
410
+ # Define keywords for each category
411
+ internal_keywords = [
412
+ "Price",
413
+ "Discount",
414
+ "product_price",
415
+ "cost",
416
+ "margin",
417
+ "inventory",
418
+ "sales",
419
+ "revenue",
420
+ "turnover",
421
+ "expense",
422
+ ]
423
+ exogenous_keywords = [
424
+ "GDP",
425
+ "Tax",
426
+ "Inflation",
427
+ "interest_rate",
428
+ "employment_rate",
429
+ "exchange_rate",
430
+ "consumer_spending",
431
+ "retail_sales",
432
+ "oil_prices",
433
+ "weather",
434
+ ]
435
+
436
+ # Check if the column name matches any of the keywords for Internal or Exogenous categories
437
+ for keyword in internal_keywords:
438
+ if keyword.lower() in column_name.lower():
439
+ return "Internal"
440
+ for keyword in exogenous_keywords:
441
+ if keyword.lower() in column_name.lower():
442
+ return "Exogenous"
443
+
444
+ # Default to Media if no match found
445
+ return "Media"
446
 
447
 
448
  # Function to calculate missing stats and prepare for editable DataFrame
449
+ st.cache_resource(show_spinner=False)
450
+
451
+
452
  def prepare_missing_stats_df(df):
453
  missing_stats = []
454
  for column in df.columns:
455
  if (
456
+ column == "date" or column == "Panel_2" or column == "Panel_1"
457
+ ): # Skip Date, Panel_1 and Panel_2 column
458
  continue
459
 
460
  missing = df[column].isnull().sum()
461
  pct_missing = round((missing / len(df)) * 100, 2)
462
+
463
+ # Dynamically assign category based on column name
464
+ category = categorize_column(column)
465
+ # category = "Media" # Keep default bin as Media
466
+
467
  missing_stats.append(
468
  {
469
  "Column": column,
470
  "Missing Values": missing,
471
  "Missing Percentage": pct_missing,
472
  "Impute Method": "Fill with 0", # Default value
473
+ "Category": category,
474
  }
475
  )
476
  stats_df = pd.DataFrame(missing_stats)
477
+
478
  return stats_df
479
 
480
 
481
+ # Function to add API DataFrame details to the files dictionary
482
+ st.cache_resource(show_spinner=False)
483
+
484
+
485
+ def add_api_dataframe_to_dict(main_df, files_dict):
486
+ files_dict["API"] = {
487
+ "numeric": list(main_df.select_dtypes(include=["number"]).columns),
488
+ "non_numeric": [
489
+ col
490
+ for col in main_df.select_dtypes(exclude=["number"]).columns
491
+ if col.lower() != "date"
492
+ ],
493
+ "interval": determine_data_interval(
494
+ pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
495
+ ),
496
+ "df": main_df,
497
+ }
498
+
499
+ return files_dict
500
+
501
+
502
+ # Function to read the API data from an Excel file into a DataFrame, parsing specified columns as datetime
503
+ @st.cache_resource(show_spinner=False)
504
+ def read_API_data():
505
+ return pd.read_excel(r".\upf_data_converted_randomized_resp_metrics.xlsx", parse_dates=["Date"])
506
+
507
+
508
+ # Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
509
+ def set_Panel_1_Panel_2_Selected_false():
510
+ st.session_state["Panel_1_Panel_2_Selected"] = False
511
+
512
+
513
+ # Function to serialize and save the objects into a pickle file
514
+ @st.cache_resource(show_spinner=False)
515
+ def save_to_pickle(file_path, final_df, bin_dict):
516
+ # Open the file in write-binary mode and dump the objects
517
+ with open(file_path, "wb") as f:
518
+ pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
519
+ # Data is now saved to file
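How a downstream page could read that file back (a sketch; it only assumes the dictionary layout written by save_to_pickle above):

import pickle

with open("data_import.pkl", "rb") as f:
    payload = pickle.load(f)

final_df = payload["final_df"]   # cleaned, merged DataFrame
bin_dict = payload["bin_dict"]   # mapping of category -> list of column names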
520
+
521
+
522
+ # Function to process the merged_df DataFrame based on operations defined in edited_df
523
+ @st.cache_resource(show_spinner=False)
524
+ def process_dataframes(merged_df, edited_df, edited_stats_df):
525
+ # Ensure there are operations defined by the user
526
+ if edited_df.empty:
527
+ return merged_df, edited_stats_df # No operations to apply
528
+
529
+ # Perform operations as defined by the user
530
+ for index, row in edited_df.iterrows():
531
+ result_column_name = f"{row['Column 1']}{row['Operator']}{row['Column 2']}"
532
+ col1 = row["Column 1"]
533
+ col2 = row["Column 2"]
534
+ op = row["Operator"]
535
+
536
+ # Apply the specified operation
537
+ if op == "+":
538
+ merged_df[result_column_name] = merged_df[col1] + merged_df[col2]
539
+ elif op == "-":
540
+ merged_df[result_column_name] = merged_df[col1] - merged_df[col2]
541
+ elif op == "*":
542
+ merged_df[result_column_name] = merged_df[col1] * merged_df[col2]
543
+ elif op == "/":
544
+ merged_df[result_column_name] = merged_df[col1] / merged_df[col2].replace(
545
+ 0, 1e-9
546
+ )
547
+
548
+ # Add summary of operation to edited_stats_df
549
+ new_row = {
550
+ "Column": result_column_name,
551
+ "Missing Values": None,
552
+ "Missing Percentage": None,
553
+ "Impute Method": None,
554
+ "Category": row["Category"],
555
+ }
556
+ new_row_df = pd.DataFrame([new_row])
557
+
558
+ # Use pd.concat to add the new_row_df to edited_stats_df
559
+ edited_stats_df = pd.concat(
560
+ [edited_stats_df, new_row_df], ignore_index=True, axis=0
561
+ )
562
+
563
+ # Combine column names from edited_df for cleanup
564
+ combined_columns = set(edited_df["Column 1"]).union(set(edited_df["Column 2"]))
565
+
566
+ # Filter out rows in edited_stats_df and drop columns from merged_df
567
+ edited_stats_df = edited_stats_df[~edited_stats_df["Column"].isin(combined_columns)]
568
+ merged_df.drop(columns=list(combined_columns), errors="ignore", inplace=True)
569
+
570
+ return merged_df, edited_stats_df
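A worked example of the feature-engineering step above (column names invented; assumes process_dataframes is in scope). A single edited row combining two spend columns creates a derived column named from the operands and drops the originals from both the data and the stats table:

import pandas as pd

merged_df_example = pd.DataFrame({"tv_spend": [10.0, 20.0], "radio_spend": [1.0, 2.0]})
edited_df_example = pd.DataFrame([
    {"Column 1": "tv_spend", "Operator": "+", "Column 2": "radio_spend", "Category": "Media"}
])
stats_df_example = pd.DataFrame([
    {"Column": "tv_spend", "Missing Values": 0, "Missing Percentage": 0.0,
     "Impute Method": "Fill with 0", "Category": "Media"},
    {"Column": "radio_spend", "Missing Values": 0, "Missing Percentage": 0.0,
     "Impute Method": "Fill with 0", "Category": "Media"},
])

out_df, out_stats = process_dataframes(merged_df_example, edited_df_example, stats_df_example)
print(out_df.columns.tolist())       # -> ['tv_spend+radio_spend']
print(out_stats["Column"].tolist())  # -> ['tv_spend+radio_spend']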
571
+
572
+
573
+ # Function to prepare a list of numeric column names and initialize an empty DataFrame with predefined structure
574
+ st.cache_resource(show_spinner=False)
575
+
576
+
577
+ def prepare_numeric_columns_and_default_df(merged_df, edited_stats_df):
578
+ # Get columns categorized as 'Response Metrics'
579
+ columns_response_metrics = edited_stats_df[
580
+ edited_stats_df["Category"] == "Response Metrics"
581
+ ]["Column"].tolist()
582
+
583
+ # Filter numeric columns, excluding those categorized as 'Response Metrics'
584
+ numeric_columns = [
585
+ col
586
+ for col in merged_df.select_dtypes(include=["number"]).columns
587
+ if col not in columns_response_metrics
588
+ ]
589
+
590
+ # Define the structure of the empty DataFrame
591
+ data = {
592
+ "Column 1": pd.Series([], dtype="str"),
593
+ "Operator": pd.Series([], dtype="str"),
594
+ "Column 2": pd.Series([], dtype="str"),
595
+ "Category": pd.Series([], dtype="str"),
596
+ }
597
+ default_df = pd.DataFrame(data)
598
+
599
+ return numeric_columns, default_df
600
+
601
+
602
+ # Initialize 'final_df' in session state
603
+ if "final_df" not in st.session_state:
604
+ st.session_state["final_df"] = pd.DataFrame()
605
+
606
+ # Initialize 'bin_dict' in session state
607
+ if "bin_dict" not in st.session_state:
608
+ st.session_state["bin_dict"] = {}
609
+
610
+ # Initialize 'Panel_1_Panel_2_Selected' in session state
611
+ if "Panel_1_Panel_2_Selected" not in st.session_state:
612
+ st.session_state["Panel_1_Panel_2_Selected"] = False
613
+
614
+
615
+ # Page Title
616
+ st.write("") # Top padding
617
+ st.title("Data Import")
618
+
619
+
620
+ #########################################################################################################################################################
621
+ # Create a dictionary to hold all DataFrames and collect user input to specify "Panel_2" and "Panel_1" columns for each file
622
+ #########################################################################################################################################################
623
+
624
+
625
+ # Read the Excel file, parsing 'Date' column as datetime
626
+ main_df = read_API_data()
627
+
628
+ # Convert all column names to lowercase
629
+ main_df.columns = main_df.columns.str.lower().str.strip()
630
+
631
+ # File uploader
632
+ uploaded_files = st.file_uploader(
633
+ "Upload additional data",
634
+ type=["xlsx"],
635
+ accept_multiple_files=True,
636
+ on_change=set_Panel_1_Panel_2_Selected_false,
637
+ )
638
+
639
+ # Custom HTML for upload instructions
640
+ recommendation_html = f"""
641
+ <div style="text-align: justify;">
642
+ <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets including panel, media, internal, and exogenous data adhere to the following guidelines: Each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code>, be free of missing values.
643
+ </div>
644
+ """
645
+ st.markdown(recommendation_html, unsafe_allow_html=True)
646
+
647
+ # Choose Desired Granularity
648
+ st.markdown("#### Choose Desired Granularity")
649
+ # Granularity Selection
650
+ granularity_selection = st.selectbox(
651
+ "Choose Date Granularity",
652
+ ["Daily", "Weekly", "Monthly"],
653
+ label_visibility="collapsed",
654
+ on_change=set_Panel_1_Panel_2_Selected_false,
655
+ )
656
+ granularity_selection = str(granularity_selection).lower()
657
+
658
+ # Convert files to dataframes
659
+ files_dict = files_to_dataframes(uploaded_files)
660
+
661
+ # Add API Dataframe
662
+ if main_df is not None:
663
+ files_dict = add_api_dataframe_to_dict(main_df, files_dict)
664
+
665
+ # Display a warning message if no files have been uploaded and halt further execution
666
+ if not files_dict:
667
+ st.warning(
668
+ "Please upload at least one file to proceed.",
669
+ icon="⚠️",
670
+ )
671
+ st.stop() # Halts further execution until file is uploaded
672
+
673
+
674
+ # Select Panel_1 and Panel_2 columns
675
+ st.markdown("#### Select Panel columns")
676
+ selections = {}
677
+ with st.expander("Select Panel columns", expanded=False):
678
+ count = 0 # Initialize counter to manage the visibility of labels and keys
679
+ for file_name, file_data in files_dict.items():
680
+ # Determine visibility of the label based on the count
681
+ if count == 0:
682
+ label_visibility = "visible"
683
+ else:
684
+ label_visibility = "collapsed"
685
+
686
+ # Extract non-numeric columns
687
+ non_numeric_cols = file_data["non_numeric"]
688
+
689
+ # Prepare Panel_1 and Panel_2 values for dropdown, adding "N/A" as an option
690
+ panel1_values = non_numeric_cols + ["N/A"]
691
+ panel2_values = non_numeric_cols + ["N/A"]
692
+
693
+ # Skip if only one option is available
694
+ if len(panel1_values) == 1 and len(panel2_values) == 1:
695
+ selected_panel1, selected_panel2 = "N/A", "N/A"
696
+ # Update the selections for Panel_1 and Panel_2 for the current file
697
+ selections[file_name] = {
698
+ "Panel_1": selected_panel1,
699
+ "Panel_2": selected_panel2,
700
+ }
701
+ continue
702
+
703
+ # Create layout columns for File Name, Panel_2, and Panel_1 selections
704
+ file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])
705
+
706
+ with file_name_col:
707
+ # Display "File Name" label only for the first file
708
+ if count == 0:
709
+ st.write("File Name")
710
+ else:
711
+ st.write("")
712
+ st.write(file_name) # Display the file name
713
+
714
+ with Panel_1_col:
715
+ # Display a selectbox for Panel_1 values
716
+ selected_panel1 = st.selectbox(
717
+ "Select Panel Level 1",
718
+ panel1_values,
719
+ on_change=set_Panel_1_Panel_2_Selected_false,
720
+ label_visibility=label_visibility, # Control visibility of the label
721
+ key=f"Panel_1_selectbox{count}", # Ensure unique key for each selectbox
722
+ )
723
+
724
+ with Panel_2_col:
725
+ # Display a selectbox for Panel_2 values
726
+ selected_panel2 = st.selectbox(
727
+ "Select Panel Level 2",
728
+ panel2_values,
729
+ on_change=set_Panel_1_Panel_2_Selected_false,
730
+ label_visibility=label_visibility, # Control visibility of the label
731
+ key=f"Panel_2_selectbox{count}", # Ensure unique key for each selectbox
732
+ )
733
+
734
+ # Skip processing if the same column is selected for both Panel_1 and Panel_2 due to potential data integrity issues
735
+ if selected_panel2 == selected_panel1 and not (
736
+ selected_panel2 == "N/A" and selected_panel1 == "N/A"
737
+ ):
738
+ st.warning(
739
+ f"File: {file_name} → The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
740
+ )
741
+ selected_panel1, selected_panel2 = "N/A", "N/A"
742
+ st.stop()
743
+
744
+ # Update the selections for Panel_1 and Panel_2 for the current file
745
+ selections[file_name] = {
746
+ "Panel_1": selected_panel1,
747
+ "Panel_2": selected_panel2,
748
+ }
749
+
750
+ count += 1 # Increment the counter after processing each file
751
+
752
+ # Accept Panel_1 and Panel_2 selection
753
+ if st.button("Accept and Process", use_container_width=True):
754
+
755
+ # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
756
+ with st.spinner("Processing..."):
757
+ files_dict = standardize_data_to_daily(files_dict, selections)
758
+
759
+ # Convert all data to the selected level of granularity
760
+ files_dict = apply_granularity_to_all(
761
+ files_dict, granularity_selection, selections
762
+ )
763
+
764
+ # Update the 'files_dict' in the session state
765
+ st.session_state["files_dict"] = files_dict
766
+
767
+ # Set a flag in the session state to indicate that selection has been made
768
+ st.session_state["Panel_1_Panel_2_Selected"] = True
769
+
770
+
771
+ #########################################################################################################################################################
772
+ # Display unique Panel_1 and Panel_2 values
773
+ #########################################################################################################################################################
774
+
775
+
776
+ # Halts further execution until Panel_1 and Panel_2 columns are selected
777
+ if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
778
+ files_dict = st.session_state["files_dict"]
779
+ else:
780
+ st.stop()
781
+
782
+ # Set to store unique values of Panel_1 and Panel_2
783
+ with st.spinner("Fetching Panel values..."):
784
+ all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
785
+ files_dict, selections
786
+ )
787
+
788
+ # List of Panel_1 and Panel_2 columns unique values
789
+ list_of_all_panel1_values = list(all_panel1_values)
790
+ list_of_all_panel2_values = list(all_panel2_values)
791
+
792
+ # Format Panel_1 and Panel_2 values for display
793
+ formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
794
+ formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)
795
+
796
+ # Unique Panel_1 and Panel_2 values
797
+ st.markdown("#### Unique Panel values")
798
+ # Display Panel_1 and Panel_2 values
799
+ with st.expander("Unique Panel values"):
800
+ st.write("")
801
+ st.markdown(
802
+ f"""
803
+ <style>
804
+ .justify-text {{
805
+ text-align: justify;
806
+ }}
807
+ </style>
808
+ <div class="justify-text">
809
+ <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
810
+ <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
811
+ </div>
812
+ """,
813
+ unsafe_allow_html=True,
814
+ )
815
+
816
+ # Display total Panel_1 and Panel_2
817
+ st.write("")
818
+ st.markdown(
819
+ f"""
820
+ <div style="text-align: justify;">
821
+ <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
822
+ <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
823
+ </div>
824
+ """,
825
+ unsafe_allow_html=True,
826
+ )
827
+ st.write("")
828
+
829
+
830
+ #########################################################################################################################################################
831
+ # Merge all DataFrames
832
+ #########################################################################################################################################################
833
+
834
+
835
+ # Merge all DataFrames selected
836
+ main_df = create_main_dataframe(
837
+ files_dict, all_panel1_values, all_panel2_values, granularity_selection
838
+ )
839
+ merged_df = merge_into_main_df(main_df, files_dict, selections)
840
+
841
+
842
+ #########################################################################################################################################################
843
+ # Categorize Variables and Impute Missing Values
844
+ #########################################################################################################################################################
845
+
846
 
847
  # Create an editable DataFrame in Streamlit
848
  st.markdown("#### Select Variables Category & Impute Missing Values")
849
 
850
+ # Prepare missing stats DataFrame for editing
851
+ missing_stats_df = prepare_missing_stats_df(merged_df)
852
+
853
  edited_stats_df = st.data_editor(
854
  missing_stats_df,
855
  column_config={
 
865
  ),
866
  "Category": st.column_config.SelectboxColumn(
867
  options=[
 
868
  "Media",
869
  "Exogenous",
870
  "Internal",
871
+ "Response Metrics",
 
872
  ],
873
  required=True,
874
  default="Media",
 
879
  use_container_width=True,
880
  )
881
 
 
882
  # Apply changes based on edited DataFrame
883
  for i, row in edited_stats_df.iterrows():
884
  column = row["Column"]
885
  if row["Impute Method"] == "Drop Column":
886
+ merged_df.drop(columns=[column], inplace=True)
887
 
888
  elif row["Impute Method"] == "Fill with Mean":
889
+ merged_df[column].fillna(merged_df[column].mean(), inplace=True)
890
 
891
  elif row["Impute Method"] == "Fill with Median":
892
+ merged_df[column].fillna(merged_df[column].median(), inplace=True)
893
 
894
  elif row["Impute Method"] == "Fill with 0":
895
+ merged_df[column].fillna(0, inplace=True)
896
+
897
+
898
+ #########################################################################################################################################################
899
+ # Group columns
900
+ #########################################################################################################################################################
901
 
902
 
903
+ # Display Group columns header
904
+ st.markdown("#### Feature engineering")
905
 
906
+ # Prepare the numeric columns and an empty DataFrame for user input
907
+ numeric_columns, default_df = prepare_numeric_columns_and_default_df(
908
+ merged_df, edited_stats_df
909
+ )
910
+
911
+ # Display editable Dataframe
912
+ edited_df = st.data_editor(
913
+ default_df,
914
+ column_config={
915
+ "Column 1": st.column_config.SelectboxColumn(
916
+ options=numeric_columns,
917
+ required=True,
918
+ default=numeric_columns[0],
919
+ width=400,
920
+ ),
921
+ "Operator": st.column_config.SelectboxColumn(
922
+ options=["+", "-", "*", "/"],
923
+ required=True,
924
+ default="+",
925
+ width=100,
926
+ ),
927
+ "Column 2": st.column_config.SelectboxColumn(
928
+ options=numeric_columns,
929
+ required=True,
930
+ default=numeric_columns[0],
931
+ width=400,
932
+ ),
933
+ "Category": st.column_config.SelectboxColumn(
934
+ options=[
935
+ "Media",
936
+ "Exogenous",
937
+ "Internal",
938
+ "Response Metrics",
939
+ ],
940
+ required=True,
941
+ default="Media",
942
+ width=200,
943
+ ),
944
+ },
945
+ num_rows="dynamic",
946
+ )
947
+
948
+ # Process the DataFrame based on user inputs and operations specified in edited_df
949
+ final_df, edited_stats_df = process_dataframes(merged_df, edited_df, edited_stats_df)
950
+
951
+
952
+ #########################################################################################################################################################
953
+ # Display the Final DataFrame and variables
954
+ #########################################################################################################################################################
955
+
956
+
957
+ # Display the Final DataFrame and variables
958
+ st.markdown("#### Final DataFrame")
959
+ st.dataframe(final_df, hide_index=True)
960
 
961
  # Initialize an empty dictionary to hold categories and their variables
962
  category_dict = {}
 
974
  # If it exists, append the current column to the list of variables under this category
975
  category_dict[category].append(column)
976
 
977
+ # Add Date, Panel_1 and Panel_2 to the category dictionary
978
+ category_dict.update({"Date": ["date"]})
979
+ if "Panel_1" in final_df.columns:
980
+ category_dict["Panel Level 1"] = ["Panel_1"]
981
+ if "Panel_2" in final_df.columns:
982
+ category_dict["Panel Level 2"] = ["Panel_2"]
983
+
984
  # Display the dictionary
985
+ st.markdown("#### Variable Category")
986
  for category, variables in category_dict.items():
987
  # Check if there are multiple variables to handle "and" insertion correctly
988
  if len(variables) > 1:
 
993
  variables_str = variables[0]
994
 
995
  # Display the category and its variables in the desired format
996
+ st.markdown(
997
+ f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
998
+ unsafe_allow_html=True,
999
+ )
1000
+
1001
+ # Check that a Response Metrics column has been selected
1002
+ st.write("")
1003
+ response_metrics_col = category_dict.get("Response Metrics", [])
1004
+ if len(response_metrics_col) == 0:
1005
+ st.warning("Please select Response Metrics column", icon="⚠️")
1006
+ st.stop()
1007
+ # elif len(response_metrics_col) > 1:
1008
+ # st.warning("Please select only one Response Metrics column", icon="⚠️")
1009
+ # st.stop()
1010
+
1011
+ # Store final dataframe and bin dictionary into session state
1012
+ st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict
1013
+
1014
+ # Save the DataFrame and dictionary from the session state to the pickle file
1015
+ if st.button("Accept and Save", use_container_width=True):
1016
+ save_to_pickle(
1017
+ "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
1018
+ )
1019
+ st.toast("💾 Saved Successfully!")