BlendMMM committed on
Commit 56ff7b3 · verified · 1 Parent(s): 5d7d0c6

Delete pages/1_Data_Import 2.py

Files changed (1)
  1. pages/1_Data_Import 2.py +0 -891
pages/1_Data_Import 2.py DELETED
@@ -1,891 +0,0 @@
- # Importing necessary libraries
- import streamlit as st
-
- st.set_page_config(
-     page_title="Model Build",
-     page_icon=":shark:",
-     layout="wide",
-     initial_sidebar_state="collapsed",
- )
-
- import numpy as np
- import pandas as pd
- from utilities import set_header, load_local_css, load_authenticator
- import pickle
-
-
- load_local_css("styles.css")
- set_header()
-
- authenticator = st.session_state.get("authenticator")
- if authenticator is None:
-     authenticator = load_authenticator()
-
- name, authentication_status, username = authenticator.login("Login", "main")
- auth_status = st.session_state.get("authentication_status")
-
- # Halt execution if the user is not authenticated
- if auth_status is not True:
-     st.stop()
-
-
- # Function to validate the date column in a dataframe
- def validate_date_column(df):
-     try:
-         # Attempt to convert the 'date' column to datetime
-         df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
-         return True
-     except (ValueError, TypeError, KeyError):
-         return False
-
-
- # Function to determine the data interval from the most common day gap
- def determine_data_interval(common_freq):
-     if common_freq == 1:
-         return "daily"
-     elif common_freq == 7:
-         return "weekly"
-     elif 28 <= common_freq <= 31:
-         return "monthly"
-     else:
-         return "irregular"
-
-
- # Function to read each uploaded Excel file into a pandas DataFrame and store it in a dictionary
- @st.cache_resource(show_spinner=False)
- def files_to_dataframes(uploaded_files):
-     df_dict = {}
-     for uploaded_file in uploaded_files:
-         # Extract the file name without its extension
-         file_name = uploaded_file.name.rsplit(".", 1)[0]
-
-         # Check for duplicate file names
-         if file_name in df_dict:
-             st.warning(
-                 f"Duplicate File: {file_name}. This file will be skipped.",
-                 icon="⚠️",
-             )
-             continue
-
-         # Read the file into a DataFrame
-         df = pd.read_excel(uploaded_file)
-
-         # Convert all column names to lowercase
-         df.columns = df.columns.str.lower().str.strip()
-
-         # Separate numeric and non-numeric columns
-         numeric_cols = list(df.select_dtypes(include=["number"]).columns)
-         non_numeric_cols = [
-             col
-             for col in df.select_dtypes(exclude=["number"]).columns
-             if col.lower() != "date"
-         ]
-
-         # Check for a valid 'date' column and at least one numeric column
-         if not (validate_date_column(df) and len(numeric_cols) > 0):
-             st.warning(
-                 f"File Name: {file_name} ➜ Please upload data with a Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
-                 icon="⚠️",
-             )
-             continue
-
-         # Most common gap (in days) between consecutive dates
-         common_freq = (
-             pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
-         )
-         # Calculate the data interval (daily, weekly, monthly or irregular)
-         interval = determine_data_interval(common_freq)
-         if interval == "irregular":
-             st.warning(
-                 f"File Name: {file_name} ➜ Please upload data at a daily, weekly or monthly interval. This file will be skipped.",
-                 icon="⚠️",
-             )
-             continue
-
-         # Store the DataFrame and its metadata in the dictionary under the file name
-         df_dict[file_name] = {
-             "numeric": numeric_cols,
-             "non_numeric": non_numeric_cols,
-             "interval": interval,
-             "df": df,
-         }
-
-     return df_dict
-
-
- # Function to adjust dataframe granularity
- def adjust_dataframe_granularity(df, current_granularity, target_granularity):
-     # Set index
-     df.set_index("date", inplace=True)
-
-     # Define aggregation rules for resampling
-     aggregation_rules = {
-         col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
-         for col in df.columns
-     }
-
-     # Initialize resampled_df
-     resampled_df = df
-     if current_granularity == "daily" and target_granularity == "weekly":
-         resampled_df = df.resample("W-MON", closed="left", label="left").agg(
-             aggregation_rules
-         )
-
-     elif current_granularity == "daily" and target_granularity == "monthly":
-         resampled_df = df.resample("MS", closed="left", label="left").agg(
-             aggregation_rules
-         )
-
-     elif current_granularity == "daily" and target_granularity == "daily":
-         resampled_df = df.resample("D").agg(aggregation_rules)
-
-     elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
-         # For higher-to-lower granularity, distribute numeric values equally and replicate non-numeric values across the new period
-         expanded_data = []
-         for _, row in df.iterrows():
-             if current_granularity == "weekly":
-                 period_range = pd.date_range(start=row.name, periods=7)
-             elif current_granularity == "monthly":
-                 period_range = pd.date_range(
-                     start=row.name, periods=row.name.days_in_month
-                 )
-
-             for date in period_range:
-                 new_row = {}
-                 for col in df.columns:
-                     if pd.api.types.is_numeric_dtype(df[col]):
-                         if current_granularity == "weekly":
-                             new_row[col] = row[col] / 7
-                         elif current_granularity == "monthly":
-                             new_row[col] = row[col] / row.name.days_in_month
-                     else:
-                         new_row[col] = row[col]
-                 expanded_data.append((date, new_row))
-
-         resampled_df = pd.DataFrame(
-             [data for _, data in expanded_data],
-             index=[date for date, _ in expanded_data],
-         )
-
-     # Reset index
-     resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})
-
-     return resampled_df
-
-
- # Function to clean and extract unique values of DMA and Panel
- @st.cache_resource(show_spinner=False)
- def clean_and_extract_unique_values(files_dict, selections):
-     all_dma_values = set()
-     all_panel_values = set()
-
-     for file_name, file_data in files_dict.items():
-         df = file_data["df"]
-
-         # 'DMA' and 'Panel' selections
-         selected_dma = selections[file_name].get("DMA")
-         selected_panel = selections[file_name].get("Panel")
-
-         # Clean and standardize the DMA column if it exists and is selected
-         if selected_dma and selected_dma != "N/A" and selected_dma in df.columns:
-             df[selected_dma] = (
-                 df[selected_dma].str.lower().str.strip().str.replace("_", " ")
-             )
-             all_dma_values.update(df[selected_dma].dropna().unique())
-
-         # Clean and standardize the Panel column if it exists and is selected
-         if selected_panel and selected_panel != "N/A" and selected_panel in df.columns:
-             df[selected_panel] = (
-                 df[selected_panel].str.lower().str.strip().str.replace("_", " ")
-             )
-             all_panel_values.update(df[selected_panel].dropna().unique())
-
-         # Update the processed DataFrame back in the dictionary
-         files_dict[file_name]["df"] = df
-
-     return all_dma_values, all_panel_values
-
-
- # Function to format values for display
- @st.cache_resource(show_spinner=False)
- def format_values_for_display(values_list):
-     # Capitalize the first letter of each word and replace underscores with spaces
-     formatted_list = [value.replace("_", " ").title() for value in values_list]
-     # Join values with commas and 'and' before the last value
-     if len(formatted_list) > 1:
-         return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
-     elif formatted_list:
-         return formatted_list[0]
-     return "No values available"
-
-
- # Function to normalize all data within files_dict to a daily granularity
- @st.cache_resource(show_spinner=False)
- def standardize_data_to_daily(files_dict, selections):
-     # Normalize all data to a daily granularity using the granularity helper
-     files_dict = apply_granularity_to_all(files_dict, "daily", selections)
-
-     # Update the "interval" attribute for each dataset to indicate the new granularity
-     for file_name, file_data in files_dict.items():
-         file_data["interval"] = "daily"
-
-     return files_dict
-
-
- # Function to apply granularity transformation to all DataFrames in files_dict
- @st.cache_resource(show_spinner=False)
- def apply_granularity_to_all(files_dict, granularity_selection, selections):
-     for file_name, file_data in files_dict.items():
-         df = file_data["df"].copy()
-
-         # Handle cases where DMA or Panel might be 'N/A'
-         selected_dma = selections[file_name].get("DMA")
-         selected_panel = selections[file_name].get("Panel")
-
-         # Build the unique segment combinations, handling 'N/A' selections
-         if selected_dma != "N/A" and selected_panel != "N/A":
-             unique_combinations = df[[selected_dma, selected_panel]].drop_duplicates()
-         elif selected_dma != "N/A":
-             unique_combinations = df[[selected_dma]].drop_duplicates()
-             selected_panel = None  # Ensure Panel is ignored if N/A
-         elif selected_panel != "N/A":
-             unique_combinations = df[[selected_panel]].drop_duplicates()
-             selected_dma = None  # Ensure DMA is ignored if N/A
-         else:
-             # If both are 'N/A', process the entire dataframe as is
-             df = adjust_dataframe_granularity(
-                 df, file_data["interval"], granularity_selection
-             )
-             files_dict[file_name]["df"] = df
-             continue  # Skip to the next file
-
-         transformed_segments = []
-         for _, combo in unique_combinations.iterrows():
-             if selected_dma and selected_panel:
-                 segment = df[
-                     (df[selected_dma] == combo[selected_dma])
-                     & (df[selected_panel] == combo[selected_panel])
-                 ]
-             elif selected_dma:
-                 segment = df[df[selected_dma] == combo[selected_dma]]
-             elif selected_panel:
-                 segment = df[df[selected_panel] == combo[selected_panel]]
-
-             # Adjust the granularity of the segment
-             transformed_segment = adjust_dataframe_granularity(
-                 segment, file_data["interval"], granularity_selection
-             )
-             transformed_segments.append(transformed_segment)
-
-         # Combine all transformed segments into a single DataFrame for this file
-         transformed_df = pd.concat(transformed_segments, ignore_index=True)
-         files_dict[file_name]["df"] = transformed_df
-
-     return files_dict
-
-
- # Function to create the main dataframe structure
- @st.cache_resource(show_spinner=False)
- def create_main_dataframe(
-     files_dict, all_dma_values, all_panel_values, granularity_selection
- ):
-     # Determine the global start and end dates across all DataFrames
-     global_start = min(file_data["df"]["date"].min() for file_data in files_dict.values())
-     global_end = max(file_data["df"]["date"].max() for file_data in files_dict.values())
-
-     # Adjust the date_range generation based on the granularity_selection
-     if granularity_selection == "weekly":
-         # Generate a weekly range, with weeks starting on Monday
-         date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
-     elif granularity_selection == "monthly":
-         # Generate a monthly range, starting from the first day of each month
-         date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
-     else:  # Default to daily if not weekly or monthly
-         date_range = pd.date_range(start=global_start, end=global_end, freq="D")
-
-     # Collect all unique DMA and Panel values, excluding 'N/A'
-     all_dmas = all_dma_values
-     all_panels = all_panel_values
-
-     # Dynamically build the list of dimensions (Panel, DMA) to include in the main DataFrame based on availability
-     dimensions, merge_keys = [], []
-     if all_panels:
-         dimensions.append(all_panels)
-         merge_keys.append("Panel")
-     if all_dmas:
-         dimensions.append(all_dmas)
-         merge_keys.append("DMA")
-
-     dimensions.append(date_range)  # Date range is always included
-     merge_keys.append("date")
-
-     # Create a main DataFrame template with the dimensions
-     main_df = pd.MultiIndex.from_product(
-         dimensions,
-         names=merge_keys,
-     ).to_frame(index=False)
-
-     return main_df.reset_index(drop=True)
-
-
- # Function to prepare and merge DataFrames
- @st.cache_resource(show_spinner=False)
- def merge_into_main_df(main_df, files_dict, selections):
-     for file_name, file_data in files_dict.items():
-         df = file_data["df"].copy()
-
-         # Rename selected DMA and Panel columns if not 'N/A'
-         selected_dma = selections[file_name].get("DMA", "N/A")
-         selected_panel = selections[file_name].get("Panel", "N/A")
-         if selected_dma != "N/A":
-             df.rename(columns={selected_dma: "DMA"}, inplace=True)
-         if selected_panel != "N/A":
-             df.rename(columns={selected_panel: "Panel"}, inplace=True)
-
-         # Merge current DataFrame into main_df based on 'date', and where applicable, 'Panel' and 'DMA'
-         merge_keys = ["date"]
-         if "Panel" in df.columns:
-             merge_keys.append("Panel")
-         if "DMA" in df.columns:
-             merge_keys.append("DMA")
-         main_df = pd.merge(main_df, df, on=merge_keys, how="left")
-
-     # After all merges, sort by 'date' and reset index for cleanliness
-     sort_by = ["date"]
-     if "Panel" in main_df.columns:
-         sort_by.append("Panel")
-     if "DMA" in main_df.columns:
-         sort_by.append("DMA")
-     main_df.sort_values(by=sort_by, inplace=True)
-     main_df.reset_index(drop=True, inplace=True)
-
-     return main_df
-
-
- # Function to categorize column
- def categorize_column(column_name):
-     # Define keywords for each category
-     internal_keywords = [
-         "Price",
-         "Discount",
-         "product_price",
-         "cost",
-         "margin",
-         "inventory",
-         "sales",
-         "revenue",
-         "turnover",
-         "expense",
-     ]
-     exogenous_keywords = [
-         "GDP",
-         "Tax",
-         "Inflation",
-         "interest_rate",
-         "employment_rate",
-         "exchange_rate",
-         "consumer_spending",
-         "retail_sales",
-         "oil_prices",
-         "weather",
-     ]
-
-     # Check if the column name matches any of the keywords for Internal or Exogenous categories
-     for keyword in internal_keywords:
-         if keyword.lower() in column_name.lower():
-             return "Internal"
-     for keyword in exogenous_keywords:
-         if keyword.lower() in column_name.lower():
-             return "Exogenous"
-
-     # Default to Media if no match is found
-     return "Media"
-
-
- # Function to calculate missing stats and prepare the editable DataFrame
- @st.cache_resource(show_spinner=False)
- def prepare_missing_stats_df(df):
-     missing_stats = []
-     for column in df.columns:
-         # Skip the date, DMA and Panel columns
-         if column in ("date", "DMA", "Panel"):
-             continue
-
-         missing = df[column].isnull().sum()
-         pct_missing = round((missing / len(df)) * 100, 2)
-
-         # Dynamically assign category based on column name
-         # category = categorize_column(column)
-         category = "Media"
-
-         missing_stats.append(
-             {
-                 "Column": column,
-                 "Missing Values": missing,
-                 "Missing Percentage": pct_missing,
-                 "Impute Method": "Fill with 0",  # Default value
-                 "Category": category,
-             }
-         )
-     stats_df = pd.DataFrame(missing_stats)
-
-     return stats_df
-
-
- # Function to add API DataFrame details to the files dictionary
- @st.cache_resource(show_spinner=False)
- def add_api_dataframe_to_dict(main_df, files_dict):
-     files_dict["API"] = {
-         "numeric": list(main_df.select_dtypes(include=["number"]).columns),
-         "non_numeric": [
-             col
-             for col in main_df.select_dtypes(exclude=["number"]).columns
-             if col.lower() != "date"
-         ],
-         "interval": determine_data_interval(
-             pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
-         ),
-         "df": main_df,
-     }
-
-     return files_dict
-
-
- # Function to read the API data into a DataFrame, parsing the 'Date' column as datetime
- @st.cache_resource(show_spinner=False)
- def read_API_data():
-     return pd.read_excel(r"upf_data_converted.xlsx", parse_dates=["Date"])
-
-
- # Function to set the 'DMA_Panel_Selected' session state variable to False
- def set_DMA_Panel_Selected_false():
-     st.session_state["DMA_Panel_Selected"] = False
-
-
- # Initialize 'final_df' in session state
- if "final_df" not in st.session_state:
-     st.session_state["final_df"] = pd.DataFrame()
-
- # Initialize 'bin_dict' in session state
- if "bin_dict" not in st.session_state:
-     st.session_state["bin_dict"] = {}
-
- # Initialize 'DMA_Panel_Selected' in session state
- if "DMA_Panel_Selected" not in st.session_state:
-     st.session_state["DMA_Panel_Selected"] = False
-
- # Page Title
- st.write("")  # Top padding
- st.title("Data Import")
-
-
- #######################################################################################################################################################
- # Create a dictionary to hold all DataFrames and collect user input to specify "DMA" and "Panel" columns for each file
- #######################################################################################################################################################
-
-
- # Read the Excel file, parsing the 'Date' column as datetime
- main_df = read_API_data()
-
- # Convert all column names to lowercase
- main_df.columns = main_df.columns.str.lower().str.strip()
-
- # File uploader
- uploaded_files = st.file_uploader(
-     "Upload additional data",
-     type=["xlsx"],
-     accept_multiple_files=True,
-     on_change=set_DMA_Panel_Selected_false,
- )
-
- # Custom HTML for upload instructions
- recommendation_html = """
- <div style="text-align: justify;">
- <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets, including DMA, Panel, media, internal, and exogenous data, adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
- </div>
- """
- st.markdown(recommendation_html, unsafe_allow_html=True)
-
- # Choose Date Granularity
- st.markdown("#### Choose Date Granularity")
- # Granularity Selection
- granularity_selection = st.selectbox(
-     "Choose Date Granularity",
-     ["Daily", "Weekly", "Monthly"],
-     label_visibility="collapsed",
-     on_change=set_DMA_Panel_Selected_false,
- )
- granularity_selection = str(granularity_selection).lower()
-
- # Convert files to dataframes
- files_dict = files_to_dataframes(uploaded_files)
-
- # Add the API DataFrame
- if main_df is not None:
-     files_dict = add_api_dataframe_to_dict(main_df, files_dict)
-
- # Display a warning message if no files have been uploaded and halt further execution
- if not files_dict:
-     st.warning(
-         "Please upload at least one file to proceed.",
-         icon="⚠️",
-     )
-     st.stop()  # Halts further execution until a file is uploaded
-
-
- # Select DMA and Panel columns
- st.markdown("#### Select DMA and Panel columns")
- selections = {}
- with st.expander("Select DMA and Panel columns", expanded=False):
-     count = 0  # Initialize counter to manage the visibility of labels and keys
-     for file_name, file_data in files_dict.items():
-         # Determine visibility of the label based on the count
-         if count == 0:
-             label_visibility = "visible"
-         else:
-             label_visibility = "collapsed"
-
-         # Extract non-numeric columns
-         non_numeric_cols = file_data["non_numeric"]
-
-         # Prepare DMA and Panel values for the dropdowns, adding "N/A" as an option
-         dma_values = non_numeric_cols + ["N/A"]
-         panel_values = non_numeric_cols + ["N/A"]
-
-         # Skip if only one option is available
-         if len(dma_values) == 1 and len(panel_values) == 1:
-             selected_dma, selected_panel = "N/A", "N/A"
-             # Update the selections for DMA and Panel for the current file
-             selections[file_name] = {
-                 "DMA": selected_dma,
-                 "Panel": selected_panel,
-             }
-             continue
-
-         # Create layout columns for File Name, DMA, and Panel selections
-         file_name_col, DMA_col, Panel_col = st.columns([2, 4, 4])
-
-         with file_name_col:
-             # Display the "File Name" label only for the first file
-             if count == 0:
-                 st.write("File Name")
-             else:
-                 st.write("")
-             st.write(file_name)  # Display the file name
-
-         with DMA_col:
-             # Display a selectbox for DMA values
-             selected_dma = st.selectbox(
-                 "Select DMA",
-                 dma_values,
-                 on_change=set_DMA_Panel_Selected_false,
-                 label_visibility=label_visibility,  # Control visibility of the label
-                 key=f"DMA_selectbox{count}",  # Ensure a unique key for each selectbox
-             )
-
-         with Panel_col:
-             # Display a selectbox for Panel values
-             selected_panel = st.selectbox(
-                 "Select Panel",
-                 panel_values,
-                 on_change=set_DMA_Panel_Selected_false,
-                 label_visibility=label_visibility,  # Control visibility of the label
-                 key=f"Panel_selectbox{count}",  # Ensure a unique key for each selectbox
-             )
-
-         # Halt processing if the same column is selected for both Panel and DMA, due to potential data integrity issues
-         if selected_panel == selected_dma and not (
-             selected_panel == "N/A" and selected_dma == "N/A"
-         ):
-             st.warning(
-                 f"File: {file_name} → The same column cannot serve as both Panel and DMA. Please adjust your selections.",
-             )
-             selected_dma, selected_panel = "N/A", "N/A"
-             st.stop()
-
-         # Update the selections for DMA and Panel for the current file
-         selections[file_name] = {
-             "DMA": selected_dma,
-             "Panel": selected_panel,
-         }
-
-         count += 1  # Increment the counter after processing each file
-
- # Accept DMA and Panel selection
- if st.button("Accept and Process", use_container_width=True):
-     # Normalize all data to a daily granularity first; this standardization simplifies subsequent conversions to other levels of granularity
-     with st.spinner("Processing..."):
-         files_dict = standardize_data_to_daily(files_dict, selections)
-
-         # Convert all data to the selected level of granularity
-         files_dict = apply_granularity_to_all(
-             files_dict, granularity_selection, selections
-         )
-
-         st.session_state["files_dict"] = files_dict
-         st.session_state["DMA_Panel_Selected"] = True
-
-
- #######################################################################################################################################################
- # Display unique DMA and Panel values
- #######################################################################################################################################################
-
-
- # Halt further execution until DMA and Panel columns are selected
- if "files_dict" in st.session_state and st.session_state["DMA_Panel_Selected"]:
-     files_dict = st.session_state["files_dict"]
- else:
-     st.stop()
-
- # Sets to store the unique values of DMA and Panel
- with st.spinner("Fetching DMA and Panel values..."):
-     all_dma_values, all_panel_values = clean_and_extract_unique_values(
-         files_dict, selections
-     )
-
- # Lists of the unique DMA and Panel column values
- list_of_all_dma_values = list(all_dma_values)
- list_of_all_panel_values = list(all_panel_values)
-
- # Format DMA and Panel values for display
- formatted_dma_values = format_values_for_display(list_of_all_dma_values)
- formatted_panel_values = format_values_for_display(list_of_all_panel_values)
-
- # Unique DMA and Panel values
- st.markdown("#### Unique DMA and Panel values")
- # Display DMA and Panel values
- with st.expander("Unique DMA and Panel values"):
-     st.write("")
-     st.markdown(
-         f"""
-         <style>
-         .justify-text {{
-             text-align: justify;
-         }}
-         </style>
-         <div class="justify-text">
-             <strong>Panel Values:</strong> {formatted_panel_values}<br>
-             <strong>DMA Values:</strong> {formatted_dma_values}
-         </div>
-         """,
-         unsafe_allow_html=True,
-     )
-
-     # Display the total DMA and Panel counts
-     st.write("")
-     st.markdown(
-         f"""
-         <div style="text-align: justify;">
-             <strong>Number of DMAs detected:</strong> {len(list_of_all_dma_values)}<br>
-             <strong>Number of Panels detected:</strong> {len(list_of_all_panel_values)}
-         </div>
-         """,
-         unsafe_allow_html=True,
-     )
-     st.write("")
-
-
- #######################################################################################################################################################
- # Merge all DataFrames
- #######################################################################################################################################################
-
-
- # Merge all DataFrames selected
- main_df = create_main_dataframe(
-     files_dict, all_dma_values, all_panel_values, granularity_selection
- )
- merged_df = merge_into_main_df(main_df, files_dict, selections)
-
- # # Display the merged DataFrame
- # st.markdown("#### Merged DataFrame based on selected DMA and Panel")
- # st.dataframe(merged_df)
-
-
- #######################################################################################################################################################
- # Categorize Variables and Impute Missing Values
- #######################################################################################################################################################
-
-
- # Create an editable DataFrame in Streamlit
- st.markdown("#### Select Variables Category & Impute Missing Values")
-
- # Prepare missing stats DataFrame for editing
- missing_stats_df = prepare_missing_stats_df(merged_df)
-
- edited_stats_df = st.data_editor(
-     missing_stats_df,
-     column_config={
-         "Impute Method": st.column_config.SelectboxColumn(
-             options=[
-                 "Drop Column",
-                 "Fill with Mean",
-                 "Fill with Median",
-                 "Fill with 0",
-             ],
-             required=True,
-             default="Fill with 0",
-         ),
-         "Category": st.column_config.SelectboxColumn(
-             options=[
-                 "Media",
-                 "Exogenous",
-                 "Internal",
-                 "Response_Metric",
-             ],
-             required=True,
-             default="Media",
-         ),
-     },
-     disabled=["Column", "Missing Values", "Missing Percentage"],
-     hide_index=True,
-     use_container_width=True,
- )
-
- # Apply changes based on the edited DataFrame
- for i, row in edited_stats_df.iterrows():
-     column = row["Column"]
-     if row["Impute Method"] == "Drop Column":
-         merged_df.drop(columns=[column], inplace=True)
-
-     elif row["Impute Method"] == "Fill with Mean":
-         merged_df[column].fillna(merged_df[column].mean(), inplace=True)
-
-     elif row["Impute Method"] == "Fill with Median":
-         merged_df[column].fillna(merged_df[column].median(), inplace=True)
-
-     elif row["Impute Method"] == "Fill with 0":
-         merged_df[column].fillna(0, inplace=True)
-
- # Display the final DataFrame
- st.markdown("#### Final DataFrame")
- final_df = merged_df
- st.dataframe(final_df, hide_index=True)
-
- # Initialize an empty dictionary to hold categories and their variables
- category_dict = {}
-
- # Iterate over each row in the edited DataFrame to populate the dictionary
- for i, row in edited_stats_df.iterrows():
-     column = row["Column"]
-     category = row["Category"]  # The category chosen by the user for this variable
-
-     # Check if the category already exists in the dictionary
-     if category not in category_dict:
-         # If not, initialize it with the current column as its first element
-         category_dict[category] = [column]
-     else:
-         # If it exists, append the current column to the list of variables under this category
-         category_dict[category].append(column)
-
- # Add Date, DMA and Panel to the category dictionary
- category_dict.update({"Date": ["date"]})
- if "DMA" in final_df.columns:
-     category_dict["DMA"] = ["DMA"]
-
- if "Panel" in final_df.columns:
-     category_dict["Panel"] = ["Panel"]
-
- # Display the dictionary
- st.markdown("#### Variable Category")
- for category, variables in category_dict.items():
-     # Check if there are multiple variables to handle "and" insertion correctly
-     if len(variables) > 1:
-         # Join all but the last variable with ", ", then add " and " before the last variable
-         variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
-     else:
-         # If there's only one variable, no need for "and"
-         variables_str = variables[0]
-
-     # Display the category and its variables in the desired format
-     st.markdown(
-         f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
-         unsafe_allow_html=True,
-     )
-
- # Store the final dataframe and bin dictionary in session state
- st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict
-
- if st.button("Save Changes"):
-     with open("Pickle_files/main_df", "wb") as f:
-         pickle.dump(st.session_state["final_df"], f)
-     with open("Pickle_files/category_dict", "wb") as c:
-         pickle.dump(st.session_state["bin_dict"], c)
-     st.success("Changes Saved!")