BlendMMM committed
Commit 4043b0d · verified · 1 parent: 646c087

Delete Data_Import.py

Files changed (1)
  1. Data_Import.py +0 -851
Data_Import.py DELETED
@@ -1,851 +0,0 @@
# Importing necessary libraries
import streamlit as st

st.set_page_config(
    page_title="Model Build",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state="collapsed",
)

import pickle
import numpy as np
import pandas as pd
from utilities import set_header, load_local_css, load_authenticator

load_local_css("styles.css")
set_header()

authenticator = st.session_state.get("authenticator")
if authenticator is None:
    authenticator = load_authenticator()

name, authentication_status, username = authenticator.login("Login", "main")
auth_status = st.session_state.get("authentication_status")

# Check the authentication status; halt if the user is not authenticated
if auth_status is not True:
    st.stop()


# Function to validate the date column in a dataframe
def validate_date_column(df):
    try:
        # Attempt to convert the 'date' column to datetime
        df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
        return True
    except Exception:
        return False


# Function to determine data interval
def determine_data_interval(common_freq):
    if common_freq == 1:
        return "daily"
    elif common_freq == 7:
        return "weekly"
    elif 28 <= common_freq <= 31:
        return "monthly"
    else:
        return "irregular"


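# Illustrative sketch (not part of the original file): the common_freq argument
# is the modal gap, in days, between consecutive unique dates, e.g.:
#
#     dates = pd.Series(pd.date_range("2024-01-01", periods=5, freq="7D"))
#     common_freq = dates.diff().dt.days.dropna().mode()[0]  # -> 7.0
#     determine_data_interval(common_freq)                   # -> "weekly"

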
# Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
@st.cache_resource(show_spinner=False)
def files_to_dataframes(uploaded_files):
    df_dict = {}
    for uploaded_file in uploaded_files:
        # Extract file name without extension
        file_name = uploaded_file.name.rsplit(".", 1)[0]

        # Check for duplicate file names
        if file_name in df_dict:
            st.warning(
                f"Duplicate File: {file_name}. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Read the file into a DataFrame
        df = pd.read_excel(uploaded_file)

        # Convert all column names to lowercase and strip whitespace
        df.columns = df.columns.str.lower().str.strip()

        # Separate numeric and non-numeric columns
        numeric_cols = list(df.select_dtypes(include=["number"]).columns)
        non_numeric_cols = [
            col
            for col in df.select_dtypes(exclude=["number"]).columns
            if col.lower() != "date"
        ]

        # Check for a valid 'date' column and at least one numeric column
        if not (validate_date_column(df) and len(numeric_cols) > 0):
            st.warning(
                f"File Name: {file_name} ➜ Please upload data with a Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Find the most common gap (in days) between consecutive unique dates
        common_freq = (
            pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
        )
        # Calculate the data interval (daily, weekly, monthly or irregular)
        interval = determine_data_interval(common_freq)
        if interval == "irregular":
            st.warning(
                f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly intervals. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Store the DataFrame and its metadata in the dictionary under the file name
        df_dict[file_name] = {
            "numeric": numeric_cols,
            "non_numeric": non_numeric_cols,
            "interval": interval,
            "df": df,
        }

    return df_dict


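# Shape of the dictionary returned above (illustrative file and column names):
#
#     {
#         "tv_spend": {
#             "numeric": ["spend", "impressions"],
#             "non_numeric": ["region"],
#             "interval": "weekly",
#             "df": <DataFrame>,
#         },
#         ...
#     }

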
# Function to adjust dataframe granularity
def adjust_dataframe_granularity(df, current_granularity, target_granularity):
    # Set index
    df.set_index("date", inplace=True)

    # Define aggregation rules for resampling: sum numeric columns, keep the first value otherwise
    aggregation_rules = {
        col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
        for col in df.columns
    }

    # Initialize resampled_df
    resampled_df = df
    if current_granularity == "daily" and target_granularity == "weekly":
        resampled_df = df.resample("W-MON", closed="left", label="left").agg(
            aggregation_rules
        )

    elif current_granularity == "daily" and target_granularity == "monthly":
        resampled_df = df.resample("MS", closed="left", label="left").agg(
            aggregation_rules
        )

    elif current_granularity == "daily" and target_granularity == "daily":
        resampled_df = df.resample("D").agg(aggregation_rules)

    elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
        # For higher to lower granularity, distribute numeric values equally and replicate non-numeric values across the new period
        expanded_data = []
        for _, row in df.iterrows():
            if current_granularity == "weekly":
                period_range = pd.date_range(start=row.name, periods=7)
            elif current_granularity == "monthly":
                period_range = pd.date_range(
                    start=row.name, periods=row.name.days_in_month
                )

            for date in period_range:
                new_row = {}
                for col in df.columns:
                    if pd.api.types.is_numeric_dtype(df[col]):
                        if current_granularity == "weekly":
                            new_row[col] = row[col] / 7
                        elif current_granularity == "monthly":
                            new_row[col] = row[col] / row.name.days_in_month
                    else:
                        new_row[col] = row[col]
                expanded_data.append((date, new_row))

        resampled_df = pd.DataFrame(
            [data for _, data in expanded_data],
            index=[date for date, _ in expanded_data],
        )

    # Reset index
    resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})

    return resampled_df


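# Illustrative use (not part of the original file): a single weekly row is
# expanded into seven daily rows with numeric values split evenly, e.g.:
#
#     weekly = pd.DataFrame({"date": pd.to_datetime(["2024-01-01"]), "spend": [70.0]})
#     daily = adjust_dataframe_granularity(weekly.copy(), "weekly", "daily")
#     # daily has 7 rows (01 Jan through 07 Jan), each with spend == 10.0

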
# Function to clean and extract unique values of Panel_1 and Panel_2
@st.cache_resource(show_spinner=False)
def clean_and_extract_unique_values(files_dict, selections):
    all_panel1_values = set()
    all_panel2_values = set()

    for file_name, file_data in files_dict.items():
        df = file_data["df"]

        # 'Panel_1' and 'Panel_2' selections
        selected_panel1 = selections[file_name].get("Panel_1")
        selected_panel2 = selections[file_name].get("Panel_2")

        # Clean and standardize the Panel_1 column if it exists and is selected
        if (
            selected_panel1
            and selected_panel1 != "N/A"
            and selected_panel1 in df.columns
        ):
            df[selected_panel1] = (
                df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
            )
            all_panel1_values.update(df[selected_panel1].dropna().unique())

        # Clean and standardize the Panel_2 column if it exists and is selected
        if (
            selected_panel2
            and selected_panel2 != "N/A"
            and selected_panel2 in df.columns
        ):
            df[selected_panel2] = (
                df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
            )
            all_panel2_values.update(df[selected_panel2].dropna().unique())

        # Update the processed DataFrame back in the dictionary
        files_dict[file_name]["df"] = df

    return all_panel1_values, all_panel2_values


# Function to format values for display
@st.cache_resource(show_spinner=False)
def format_values_for_display(values_list):
    # Capitalize the first letter of each word and replace underscores with spaces
    formatted_list = [value.replace("_", " ").title() for value in values_list]
    # Join values with commas and 'and' before the last value
    if len(formatted_list) > 1:
        return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
    elif formatted_list:
        return formatted_list[0]
    return "No values available"


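# Example of the display formatting above (illustrative values):
#
#     format_values_for_display(["north_region", "south_region", "east_region"])
#     # -> "North Region, South Region, and East Region"

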
# Function to normalize all data within files_dict to a daily granularity
@st.cache_resource(show_spinner=False)
def standardize_data_to_daily(files_dict, selections):
    # Normalize all data to a daily granularity using a provided function
    files_dict = apply_granularity_to_all(files_dict, "daily", selections)

    # Update the "interval" attribute for each dataset to indicate the new granularity
    for file_name, file_data in files_dict.items():
        file_data["interval"] = "daily"

    return files_dict


# Function to apply granularity transformation to all DataFrames in files_dict
@st.cache_resource(show_spinner=False)
def apply_granularity_to_all(files_dict, granularity_selection, selections):
    for file_name, file_data in files_dict.items():
        df = file_data["df"].copy()

        # Handle cases where Panel_1 or Panel_2 might be 'N/A'
        selected_panel1 = selections[file_name].get("Panel_1")
        selected_panel2 = selections[file_name].get("Panel_2")

        # Build the set of unique panel combinations, handling 'N/A'
        if selected_panel1 != "N/A" and selected_panel2 != "N/A":
            unique_combinations = df[
                [selected_panel1, selected_panel2]
            ].drop_duplicates()
        elif selected_panel1 != "N/A":
            unique_combinations = df[[selected_panel1]].drop_duplicates()
            selected_panel2 = None  # Ensure Panel_2 is ignored if N/A
        elif selected_panel2 != "N/A":
            unique_combinations = df[[selected_panel2]].drop_duplicates()
            selected_panel1 = None  # Ensure Panel_1 is ignored if N/A
        else:
            # If both are 'N/A', process the entire dataframe as is
            df = adjust_dataframe_granularity(
                df, file_data["interval"], granularity_selection
            )
            files_dict[file_name]["df"] = df
            continue  # Skip to the next file

        # Transform each panel segment separately, then recombine
        transformed_segments = []
        for _, combo in unique_combinations.iterrows():
            if selected_panel1 and selected_panel2:
                segment = df[
                    (df[selected_panel1] == combo[selected_panel1])
                    & (df[selected_panel2] == combo[selected_panel2])
                ]
            elif selected_panel1:
                segment = df[df[selected_panel1] == combo[selected_panel1]]
            elif selected_panel2:
                segment = df[df[selected_panel2] == combo[selected_panel2]]

            # Adjust granularity of the segment
            transformed_segment = adjust_dataframe_granularity(
                segment, file_data["interval"], granularity_selection
            )
            transformed_segments.append(transformed_segment)

        # Combine all transformed segments into a single DataFrame for this file
        transformed_df = pd.concat(transformed_segments, ignore_index=True)
        files_dict[file_name]["df"] = transformed_df

    return files_dict


# Function to create the main dataframe structure
@st.cache_resource(show_spinner=False)
def create_main_dataframe(
    files_dict, all_panel1_values, all_panel2_values, granularity_selection
):
    # Determine the global start and end dates across all DataFrames
    global_start = min(df["df"]["date"].min() for df in files_dict.values())
    global_end = max(df["df"]["date"].max() for df in files_dict.values())

    # Adjust the date_range generation based on the granularity_selection
    if granularity_selection == "weekly":
        # Generate a weekly range, with weeks starting on Monday
        date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
    elif granularity_selection == "monthly":
        # Generate a monthly range, starting from the first day of each month
        date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
    else:  # Default to daily if not weekly or monthly
        date_range = pd.date_range(start=global_start, end=global_end, freq="D")

    # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
    all_panel1s = all_panel1_values
    all_panel2s = all_panel2_values

    # Dynamically build the list of dimensions (Panel_1, Panel_2) to include in the main DataFrame based on availability
    dimensions, merge_keys = [], []
    if all_panel1s:
        dimensions.append(all_panel1s)
        merge_keys.append("Panel_1")
    if all_panel2s:
        dimensions.append(all_panel2s)
        merge_keys.append("Panel_2")

    dimensions.append(date_range)  # Date range is always included
    merge_keys.append("date")  # Date range is always included

    # Create a main DataFrame template with the dimensions
    main_df = pd.MultiIndex.from_product(
        dimensions,
        names=[name for name, _ in zip(merge_keys, dimensions)],
    ).to_frame(index=False)

    return main_df.reset_index(drop=True)


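# Illustrative sketch (not part of the original file): the scaffold produced
# above is the Cartesian product of panel values and dates. For two panels and
# three weekly dates, pd.MultiIndex.from_product yields a 6-row frame:
#
#     dims = [{"north", "south"}, pd.date_range("2024-01-01", periods=3, freq="W-MON")]
#     scaffold = pd.MultiIndex.from_product(dims, names=["Panel_1", "date"]).to_frame(index=False)
#     # 2 panels x 3 dates -> 6 rows, one slot for every (panel, date) pair

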
# Function to prepare and merge DataFrames
@st.cache_resource(show_spinner=False)
def merge_into_main_df(main_df, files_dict, selections):
    for file_name, file_data in files_dict.items():
        df = file_data["df"].copy()

        # Rename the selected Panel_1 and Panel_2 columns if not 'N/A'
        selected_panel1 = selections[file_name].get("Panel_1", "N/A")
        selected_panel2 = selections[file_name].get("Panel_2", "N/A")
        if selected_panel1 != "N/A":
            df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
        if selected_panel2 != "N/A":
            df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)

        # Merge the current DataFrame into main_df based on 'date', and where applicable, 'Panel_1' and 'Panel_2'
        merge_keys = ["date"]
        if "Panel_1" in df.columns:
            merge_keys.append("Panel_1")
        if "Panel_2" in df.columns:
            merge_keys.append("Panel_2")
        main_df = pd.merge(main_df, df, on=merge_keys, how="left")

    # After all merges, sort by 'date' and reset the index for cleanliness
    sort_by = ["date"]
    if "Panel_1" in main_df.columns:
        sort_by.append("Panel_1")
    if "Panel_2" in main_df.columns:
        sort_by.append("Panel_2")
    main_df.sort_values(by=sort_by, inplace=True)
    main_df.reset_index(drop=True, inplace=True)

    return main_df


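# Illustrative sketch (not part of the original file): each file is left-joined
# onto the scaffold, so dates a file does not cover simply stay NaN, e.g.:
#
#     scaffold = pd.DataFrame({"date": pd.date_range("2024-01-01", periods=3)})
#     spend = pd.DataFrame({"date": pd.to_datetime(["2024-01-02"]), "spend": [100]})
#     pd.merge(scaffold, spend, on=["date"], how="left")  # spend is NaN except on 2024-01-02

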
# Function to categorize a column by keywords in its name
def categorize_column(column_name):
    # Define keywords for each category
    internal_keywords = [
        "Price",
        "Discount",
        "product_price",
        "cost",
        "margin",
        "inventory",
        "sales",
        "revenue",
        "turnover",
        "expense",
    ]
    exogenous_keywords = [
        "GDP",
        "Tax",
        "Inflation",
        "interest_rate",
        "employment_rate",
        "exchange_rate",
        "consumer_spending",
        "retail_sales",
        "oil_prices",
        "weather",
    ]

    # Check if the column name matches any of the keywords for the Internal or Exogenous categories
    for keyword in internal_keywords:
        if keyword.lower() in column_name.lower():
            return "Internal"
    for keyword in exogenous_keywords:
        if keyword.lower() in column_name.lower():
            return "Exogenous"

    # Default to Media if no match is found
    return "Media"


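# Example of the keyword matching above (illustrative column names):
#
#     categorize_column("product_price_eur")   # -> "Internal" ("price" matches)
#     categorize_column("gdp_growth_rate")     # -> "Exogenous" ("gdp" matches)
#     categorize_column("tv_impressions")      # -> "Media" (no keyword match)

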
# Function to calculate missing stats and prepare an editable DataFrame
@st.cache_resource(show_spinner=False)
def prepare_missing_stats_df(df):
    missing_stats = []
    for column in df.columns:
        if (
            column == "date" or column == "Panel_2" or column == "Panel_1"
        ):  # Skip the date, Panel_1 and Panel_2 columns
            continue

        missing = df[column].isnull().sum()
        pct_missing = round((missing / len(df)) * 100, 2)

        # Dynamically assign a category based on the column name
        category = categorize_column(column)
        # category = "Media"  # Keep default bin as Media

        missing_stats.append(
            {
                "Column": column,
                "Missing Values": missing,
                "Missing Percentage": pct_missing,
                "Impute Method": "Fill with 0",  # Default value
                "Category": category,
            }
        )
    stats_df = pd.DataFrame(missing_stats)

    return stats_df


# Function to add the API DataFrame details to the files dictionary
@st.cache_resource(show_spinner=False)
def add_api_dataframe_to_dict(main_df, files_dict):
    files_dict["API"] = {
        "numeric": list(main_df.select_dtypes(include=["number"]).columns),
        "non_numeric": [
            col
            for col in main_df.select_dtypes(exclude=["number"]).columns
            if col.lower() != "date"
        ],
        "interval": determine_data_interval(
            pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
        ),
        "df": main_df,
    }

    return files_dict


# Function to read the API data into a DataFrame, parsing specified columns as datetime
@st.cache_resource(show_spinner=False)
def read_API_data():
    return pd.read_excel("upf_data_converted.xlsx", parse_dates=["Date"])


# Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
def set_Panel_1_Panel_2_Selected_false():
    st.session_state["Panel_1_Panel_2_Selected"] = False


# Function to serialize and save the objects into a pickle file
@st.cache_resource(show_spinner=False)
def save_to_pickle(file_path, final_df, bin_dict):
    # Open the file in write-binary mode and dump the objects
    with open(file_path, "wb") as f:
        pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
    # Data is now saved to file


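# Illustrative counterpart (not part of the original file): downstream pages
# can restore the saved objects with the standard pickle API:
#
#     with open("data_import.pkl", "rb") as f:
#         saved = pickle.load(f)
#     final_df, bin_dict = saved["final_df"], saved["bin_dict"]

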
# Initialize 'final_df' in session state
if "final_df" not in st.session_state:
    st.session_state["final_df"] = pd.DataFrame()

# Initialize 'bin_dict' in session state
if "bin_dict" not in st.session_state:
    st.session_state["bin_dict"] = {}

# Initialize 'Panel_1_Panel_2_Selected' in session state
if "Panel_1_Panel_2_Selected" not in st.session_state:
    st.session_state["Panel_1_Panel_2_Selected"] = False

# Page Title
st.write("")  # Top padding
st.title("Data Import")


#########################################################################################################################################################
# Create a dictionary to hold all DataFrames and collect user input to specify "Panel_2" and "Panel_1" columns for each file
#########################################################################################################################################################


# Read the Excel file, parsing the 'Date' column as datetime
main_df = read_API_data()

# Convert all column names to lowercase
main_df.columns = main_df.columns.str.lower().str.strip()

# File uploader
uploaded_files = st.file_uploader(
    "Upload additional data",
    type=["xlsx"],
    accept_multiple_files=True,
    on_change=set_Panel_1_Panel_2_Selected_false,
)

# Custom HTML for upload instructions
recommendation_html = """
<div style="text-align: justify;">
    <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets, including panel, media, internal, and exogenous data, adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
</div>
"""
st.markdown(recommendation_html, unsafe_allow_html=True)

# Choose Desired Granularity
st.markdown("#### Choose Desired Granularity")
# Granularity Selection
granularity_selection = st.selectbox(
    "Choose Date Granularity",
    ["Daily", "Weekly", "Monthly"],
    label_visibility="collapsed",
    on_change=set_Panel_1_Panel_2_Selected_false,
)
granularity_selection = str(granularity_selection).lower()

# Convert files to dataframes
files_dict = files_to_dataframes(uploaded_files)

# Add API Dataframe
if main_df is not None:
    files_dict = add_api_dataframe_to_dict(main_df, files_dict)

# Display a warning message if no files have been uploaded and halt further execution
if not files_dict:
    st.warning(
        "Please upload at least one file to proceed.",
        icon="⚠️",
    )
    st.stop()  # Halts further execution until a file is uploaded


# Select Panel_1 and Panel_2 columns
st.markdown("#### Select Panel columns")
selections = {}
with st.expander("Select Panel columns", expanded=False):
    count = 0  # Initialize counter to manage the visibility of labels and keys
    for file_name, file_data in files_dict.items():
        # Determine visibility of the label based on the count
        if count == 0:
            label_visibility = "visible"
        else:
            label_visibility = "collapsed"

        # Extract non-numeric columns
        non_numeric_cols = file_data["non_numeric"]

        # Prepare Panel_1 and Panel_2 values for the dropdowns, adding "N/A" as an option
        panel1_values = non_numeric_cols + ["N/A"]
        panel2_values = non_numeric_cols + ["N/A"]

        # Skip if only one option is available
        if len(panel1_values) == 1 and len(panel2_values) == 1:
            selected_panel1, selected_panel2 = "N/A", "N/A"
            # Update the selections for Panel_1 and Panel_2 for the current file
            selections[file_name] = {
                "Panel_1": selected_panel1,
                "Panel_2": selected_panel2,
            }
            continue

        # Create layout columns for File Name, Panel_1, and Panel_2 selections
        file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])

        with file_name_col:
            # Display the "File Name" label only for the first file
            if count == 0:
                st.write("File Name")
            else:
                st.write("")
            st.write(file_name)  # Display the file name

        with Panel_1_col:
            # Display a selectbox for Panel_1 values
            selected_panel1 = st.selectbox(
                "Select Panel Level 1",
                panel1_values,
                on_change=set_Panel_1_Panel_2_Selected_false,
                label_visibility=label_visibility,  # Control visibility of the label
                key=f"Panel_1_selectbox{count}",  # Ensure a unique key for each selectbox
            )

        with Panel_2_col:
            # Display a selectbox for Panel_2 values
            selected_panel2 = st.selectbox(
                "Select Panel Level 2",
                panel2_values,
                on_change=set_Panel_1_Panel_2_Selected_false,
                label_visibility=label_visibility,  # Control visibility of the label
                key=f"Panel_2_selectbox{count}",  # Ensure a unique key for each selectbox
            )

        # Halt processing if the same column is selected for both Panel_1 and Panel_2, to avoid data integrity issues
        if selected_panel2 == selected_panel1 and not (
            selected_panel2 == "N/A" and selected_panel1 == "N/A"
        ):
            st.warning(
                f"File: {file_name} → The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
            )
            st.stop()

        # Update the selections for Panel_1 and Panel_2 for the current file
        selections[file_name] = {
            "Panel_1": selected_panel1,
            "Panel_2": selected_panel2,
        }

        count += 1  # Increment the counter after processing each file

# Accept Panel_1 and Panel_2 selection
if st.button("Accept and Process", use_container_width=True):

    # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
    with st.spinner("Processing..."):
        files_dict = standardize_data_to_daily(files_dict, selections)

        # Convert all data to the selected level of granularity
        files_dict = apply_granularity_to_all(
            files_dict, granularity_selection, selections
        )

        st.session_state["files_dict"] = files_dict
        st.session_state["Panel_1_Panel_2_Selected"] = True


#########################################################################################################################################################
# Display unique Panel_1 and Panel_2 values
#########################################################################################################################################################


# Halt further execution until the Panel_1 and Panel_2 columns are selected
if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
    files_dict = st.session_state["files_dict"]
else:
    st.stop()

# Sets to store unique values of Panel_1 and Panel_2
with st.spinner("Fetching Panel values..."):
    all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
        files_dict, selections
    )

# Lists of unique Panel_1 and Panel_2 column values
list_of_all_panel1_values = list(all_panel1_values)
list_of_all_panel2_values = list(all_panel2_values)

# Format Panel_1 and Panel_2 values for display
formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)

# Unique Panel_1 and Panel_2 values
st.markdown("#### Unique Panel values")
# Display Panel_1 and Panel_2 values
with st.expander("Unique Panel values"):
    st.write("")
    st.markdown(
        f"""
        <style>
        .justify-text {{
            text-align: justify;
        }}
        </style>
        <div class="justify-text">
            <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
            <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
        </div>
        """,
        unsafe_allow_html=True,
    )

    # Display total Panel_1 and Panel_2 counts
    st.write("")
    st.markdown(
        f"""
        <div style="text-align: justify;">
            <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
            <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
        </div>
        """,
        unsafe_allow_html=True,
    )
    st.write("")


#########################################################################################################################################################
# Merge all DataFrames
#########################################################################################################################################################


# Merge all selected DataFrames
main_df = create_main_dataframe(
    files_dict, all_panel1_values, all_panel2_values, granularity_selection
)
merged_df = merge_into_main_df(main_df, files_dict, selections)

# # Display the merged DataFrame
# st.markdown("#### Merged DataFrame based on selected Panel_1 and Panel_2")
# st.dataframe(merged_df)


#########################################################################################################################################################
# Categorize Variables and Impute Missing Values
#########################################################################################################################################################


# Create an editable DataFrame in Streamlit
st.markdown("#### Select Variables Category & Impute Missing Values")

# Prepare missing stats DataFrame for editing
missing_stats_df = prepare_missing_stats_df(merged_df)

edited_stats_df = st.data_editor(
    missing_stats_df,
    column_config={
        "Impute Method": st.column_config.SelectboxColumn(
            options=[
                "Drop Column",
                "Fill with Mean",
                "Fill with Median",
                "Fill with 0",
            ],
            required=True,
            default="Fill with 0",
        ),
        "Category": st.column_config.SelectboxColumn(
            options=[
                "Media",
                "Exogenous",
                "Internal",
                "Response Metrics",
            ],
            required=True,
            default="Media",
        ),
    },
    disabled=["Column", "Missing Values", "Missing Percentage"],
    hide_index=True,
    use_container_width=True,
)

# Apply changes based on the edited DataFrame
for i, row in edited_stats_df.iterrows():
    column = row["Column"]
    if row["Impute Method"] == "Drop Column":
        merged_df.drop(columns=[column], inplace=True)

    elif row["Impute Method"] == "Fill with Mean":
        merged_df[column] = merged_df[column].fillna(merged_df[column].mean())

    elif row["Impute Method"] == "Fill with Median":
        merged_df[column] = merged_df[column].fillna(merged_df[column].median())

    elif row["Impute Method"] == "Fill with 0":
        merged_df[column] = merged_df[column].fillna(0)


# Display the final DataFrame and exogenous variables
st.markdown("#### Final DataFrame")
final_df = merged_df
st.dataframe(final_df, hide_index=True)

# Initialize an empty dictionary to hold categories and their variables
category_dict = {}

# Iterate over each row in the edited DataFrame to populate the dictionary
for i, row in edited_stats_df.iterrows():
    column = row["Column"]
    category = row["Category"]  # The category chosen by the user for this variable

    # Check if the category already exists in the dictionary
    if category not in category_dict:
        # If not, initialize it with the current column as its first element
        category_dict[category] = [column]
    else:
        # If it exists, append the current column to the list of variables under this category
        category_dict[category].append(column)

# Add date, Panel_1 and Panel_2 to the category dictionary
category_dict.update({"Date": ["date"]})
if "Panel_1" in final_df.columns:
    category_dict["Panel Level 1"] = ["Panel_1"]
if "Panel_2" in final_df.columns:
    category_dict["Panel Level 2"] = ["Panel_2"]

# Display the dictionary
st.markdown("#### Variable Category")
for category, variables in category_dict.items():
    # Check if there are multiple variables to handle "and" insertion correctly
    if len(variables) > 1:
        # Join all but the last variable with ", ", then add " and " before the last variable
        variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
    else:
        # If there's only one variable, no need for "and"
        variables_str = variables[0]

    # Display the category and its variables in the desired format
    st.markdown(
        f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
        unsafe_allow_html=True,
    )

# Store the final dataframe and bin dictionary in session state
st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict

# Save the DataFrame and dictionary from the session state to the pickle file
st.write("")
if st.button("Accept and Save", use_container_width=True):
    save_to_pickle(
        "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
    )
    st.toast("💾 Saved Successfully!")