BlendMMM committed on
Commit 07855e7 · verified · 1 Parent(s): 13a2588

Delete Data_Import (1).py

Files changed (1):
  1. Data_Import (1).py +0 -995
Data_Import (1).py DELETED
@@ -1,995 +0,0 @@
# Importing necessary libraries
import streamlit as st

st.set_page_config(
    page_title="Data Import",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state="collapsed",
)

import pickle
import pandas as pd
from utilities import set_header, load_local_css, authentication

load_local_css("styles.css")
set_header()


# Check for authentication status
authenticator, name, authentication_status, username = authentication()
if authentication_status is not True:
    st.stop()
else:
    authenticator.logout("Logout", "main")

# Function to validate the date column in a dataframe
def validate_date_column(df):
    try:
        # Attempt to convert the 'date' column to datetime
        df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
        return True
    except Exception:
        return False

# Function to determine data interval
def determine_data_interval(common_freq):
    if common_freq == 1:
        return "daily"
    elif common_freq == 7:
        return "weekly"
    elif 28 <= common_freq <= 31:
        return "monthly"
    else:
        return "irregular"

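# Illustrative examples (hypothetical inputs): the modal day-gap between
# consecutive dates drives the classification, e.g.
#   determine_data_interval(1)   # -> "daily"
#   determine_data_interval(7)   # -> "weekly"
#   determine_data_interval(30)  # -> "monthly"
#   determine_data_interval(10)  # -> "irregular"
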
# Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
@st.cache_resource(show_spinner=False)
def files_to_dataframes(uploaded_files):
    df_dict = {}
    for uploaded_file in uploaded_files:
        # Extract file name without extension
        file_name = uploaded_file.name.rsplit(".", 1)[0]

        # Check for duplicate file names
        if file_name in df_dict:
            st.warning(
                f"Duplicate File: {file_name}. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Read the file into a DataFrame
        df = pd.read_excel(uploaded_file)

        # Convert all column names to lowercase
        df.columns = df.columns.str.lower().str.strip()

        # Separate numeric and non-numeric columns
        numeric_cols = list(df.select_dtypes(include=["number"]).columns)
        non_numeric_cols = [
            col
            for col in df.select_dtypes(exclude=["number"]).columns
            if col.lower() != "date"
        ]

        # Check for a valid 'date' column and at least one numeric column
        if not (validate_date_column(df) and len(numeric_cols) > 0):
            st.warning(
                f"File Name: {file_name} ➜ Please upload data with a Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Determine the most common gap (in days) between consecutive dates
        common_freq = (
            pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
        )
        # Calculate the data interval (daily, weekly, monthly or irregular)
        interval = determine_data_interval(common_freq)
        if interval == "irregular":
            st.warning(
                f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly intervals. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Store the DataFrame and its metadata in the dictionary under the file name
        df_dict[file_name] = {
            "numeric": numeric_cols,
            "non_numeric": non_numeric_cols,
            "interval": interval,
            "df": df,
        }

    return df_dict

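# For reference, a sketch of the resulting dictionary shape (hypothetical file
# "tv_spend.xlsx" with a weekly date column, one numeric and one text column):
#   df_dict = {
#       "tv_spend": {
#           "numeric": ["spend"],
#           "non_numeric": ["region"],
#           "interval": "weekly",
#           "df": <pandas DataFrame>,
#       },
#   }
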
# Function to adjust dataframe granularity
def adjust_dataframe_granularity(df, current_granularity, target_granularity):
    # Set index
    df.set_index("date", inplace=True)

    # Define aggregation rules for resampling: sum numeric columns, keep the first value otherwise
    aggregation_rules = {
        col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
        for col in df.columns
    }

    # Initialize resampled_df
    resampled_df = df
    if current_granularity == "daily" and target_granularity == "weekly":
        resampled_df = df.resample("W-MON", closed="left", label="left").agg(
            aggregation_rules
        )

    elif current_granularity == "daily" and target_granularity == "monthly":
        resampled_df = df.resample("MS", closed="left", label="left").agg(
            aggregation_rules
        )

    elif current_granularity == "daily" and target_granularity == "daily":
        resampled_df = df.resample("D").agg(aggregation_rules)

    elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
        # For higher to lower granularity, distribute numeric values equally and replicate non-numeric values across the new period
        expanded_data = []
        for _, row in df.iterrows():
            if current_granularity == "weekly":
                period_range = pd.date_range(start=row.name, periods=7)
            elif current_granularity == "monthly":
                period_range = pd.date_range(
                    start=row.name, periods=row.name.days_in_month
                )

            for date in period_range:
                new_row = {}
                for col in df.columns:
                    if pd.api.types.is_numeric_dtype(df[col]):
                        if current_granularity == "weekly":
                            new_row[col] = row[col] / 7
                        elif current_granularity == "monthly":
                            new_row[col] = row[col] / row.name.days_in_month
                    else:
                        new_row[col] = row[col]
                expanded_data.append((date, new_row))

        resampled_df = pd.DataFrame(
            [data for _, data in expanded_data],
            index=[date for date, _ in expanded_data],
        )

    # Reset index
    resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})

    return resampled_df

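# Illustrative example (hypothetical values): a weekly row dated 2024-01-01 with
# spend=70 expands to seven daily rows (2024-01-01 .. 2024-01-07) with spend=10
# each, while non-numeric values such as a region label are replicated as-is.
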
# Function to clean and extract unique values of Panel_1 and Panel_2
@st.cache_resource(show_spinner=False)
def clean_and_extract_unique_values(files_dict, selections):
    all_panel1_values = set()
    all_panel2_values = set()

    for file_name, file_data in files_dict.items():
        df = file_data["df"]

        # 'Panel_1' and 'Panel_2' selections
        selected_panel1 = selections[file_name].get("Panel_1")
        selected_panel2 = selections[file_name].get("Panel_2")

        # Clean and standardize the Panel_1 column if it exists and is selected
        if (
            selected_panel1
            and selected_panel1 != "N/A"
            and selected_panel1 in df.columns
        ):
            df[selected_panel1] = (
                df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
            )
            all_panel1_values.update(df[selected_panel1].dropna().unique())

        # Clean and standardize the Panel_2 column if it exists and is selected
        if (
            selected_panel2
            and selected_panel2 != "N/A"
            and selected_panel2 in df.columns
        ):
            df[selected_panel2] = (
                df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
            )
            all_panel2_values.update(df[selected_panel2].dropna().unique())

        # Update the processed DataFrame back in the dictionary
        files_dict[file_name]["df"] = df

    return all_panel1_values, all_panel2_values

# Function to format values for display
@st.cache_resource(show_spinner=False)
def format_values_for_display(values_list):
    # Capitalize the first letter of each word and replace underscores with spaces
    formatted_list = [value.replace("_", " ").title() for value in values_list]
    # Join values with commas and 'and' before the last value
    if len(formatted_list) > 1:
        return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
    elif formatted_list:
        return formatted_list[0]
    return "No values available"

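# Illustrative examples (hypothetical values):
#   format_values_for_display(["north_zone", "south_zone"])  # -> "North Zone, and South Zone"
#   format_values_for_display(["north_zone"])                # -> "North Zone"
#   format_values_for_display([])                            # -> "No values available"
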
# Function to normalize all data within files_dict to a daily granularity
@st.cache(show_spinner=False, allow_output_mutation=True)
def standardize_data_to_daily(files_dict, selections):
    # Normalize all data to a daily granularity using a provided function
    files_dict = apply_granularity_to_all(files_dict, "daily", selections)

    # Update the "interval" attribute for each dataset to indicate the new granularity
    for file_name, file_data in files_dict.items():
        file_data["interval"] = "daily"

    return files_dict

# Function to apply granularity transformation to all DataFrames in files_dict
@st.cache_resource(show_spinner=False)
def apply_granularity_to_all(files_dict, granularity_selection, selections):
    for file_name, file_data in files_dict.items():
        df = file_data["df"].copy()

        # Handle cases where Panel_1 or Panel_2 might be 'N/A'
        selected_panel1 = selections[file_name].get("Panel_1")
        selected_panel2 = selections[file_name].get("Panel_2")

        # Segment selection logic, handling 'N/A'
        if selected_panel1 != "N/A" and selected_panel2 != "N/A":
            unique_combinations = df[
                [selected_panel1, selected_panel2]
            ].drop_duplicates()
        elif selected_panel1 != "N/A":
            unique_combinations = df[[selected_panel1]].drop_duplicates()
            selected_panel2 = None  # Ensure Panel_2 is ignored if N/A
        elif selected_panel2 != "N/A":
            unique_combinations = df[[selected_panel2]].drop_duplicates()
            selected_panel1 = None  # Ensure Panel_1 is ignored if N/A
        else:
            # If both are 'N/A', process the entire dataframe as is
            df = adjust_dataframe_granularity(
                df, file_data["interval"], granularity_selection
            )
            files_dict[file_name]["df"] = df
            continue  # Skip to the next file

        transformed_segments = []
        for _, combo in unique_combinations.iterrows():
            if selected_panel1 and selected_panel2:
                segment = df[
                    (df[selected_panel1] == combo[selected_panel1])
                    & (df[selected_panel2] == combo[selected_panel2])
                ]
            elif selected_panel1:
                segment = df[df[selected_panel1] == combo[selected_panel1]]
            elif selected_panel2:
                segment = df[df[selected_panel2] == combo[selected_panel2]]

            # Adjust granularity of the segment
            transformed_segment = adjust_dataframe_granularity(
                segment, file_data["interval"], granularity_selection
            )
            transformed_segments.append(transformed_segment)

        # Combine all transformed segments into a single DataFrame for this file
        transformed_df = pd.concat(transformed_segments, ignore_index=True)
        files_dict[file_name]["df"] = transformed_df

    return files_dict

# Function to create the main dataframe structure
@st.cache_resource(show_spinner=False)
def create_main_dataframe(
    files_dict, all_panel1_values, all_panel2_values, granularity_selection
):
    # Determine the global start and end dates across all DataFrames
    global_start = min(file_data["df"]["date"].min() for file_data in files_dict.values())
    global_end = max(file_data["df"]["date"].max() for file_data in files_dict.values())

    # Adjust the date_range generation based on the granularity_selection
    if granularity_selection == "weekly":
        # Generate a weekly range, with weeks starting on Monday
        date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
    elif granularity_selection == "monthly":
        # Generate a monthly range, starting from the first day of each month
        date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
    else:  # Default to daily if not weekly or monthly
        date_range = pd.date_range(start=global_start, end=global_end, freq="D")

    # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
    all_panel1s = all_panel1_values
    all_panel2s = all_panel2_values

    # Dynamically build the list of dimensions (Panel_1, Panel_2) to include in the main DataFrame based on availability
    dimensions, merge_keys = [], []
    if all_panel1s:
        dimensions.append(all_panel1s)
        merge_keys.append("Panel_1")
    if all_panel2s:
        dimensions.append(all_panel2s)
        merge_keys.append("Panel_2")

    dimensions.append(date_range)  # Date range is always included
    merge_keys.append("date")

    # Create a main DataFrame template as the cartesian product of the dimensions
    main_df = pd.MultiIndex.from_product(
        dimensions,
        names=[name for name, _ in zip(merge_keys, dimensions)],
    ).to_frame(index=False)

    return main_df.reset_index(drop=True)

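# Illustrative sketch (hypothetical values): with Panel_1 values {"a", "b"}, no
# Panel_2 values, and a two-day daily range, the cartesian product yields a
# four-row skeleton (set ordering may vary), e.g.
#   Panel_1        date
#   a        2024-01-01
#   a        2024-01-02
#   b        2024-01-01
#   b        2024-01-02
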
# Function to prepare and merge DataFrames
@st.cache_resource(show_spinner=False)
def merge_into_main_df(main_df, files_dict, selections):
    for file_name, file_data in files_dict.items():
        df = file_data["df"].copy()

        # Rename the selected Panel_1 and Panel_2 columns if not 'N/A'
        selected_panel1 = selections[file_name].get("Panel_1", "N/A")
        selected_panel2 = selections[file_name].get("Panel_2", "N/A")
        if selected_panel1 != "N/A":
            df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
        if selected_panel2 != "N/A":
            df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)

        # Merge the current DataFrame into main_df on 'date' and, where applicable, 'Panel_1' and 'Panel_2'
        merge_keys = ["date"]
        if "Panel_1" in df.columns:
            merge_keys.append("Panel_1")
        if "Panel_2" in df.columns:
            merge_keys.append("Panel_2")
        main_df = pd.merge(main_df, df, on=merge_keys, how="left")

    # After all merges, sort by 'date' and reset the index for cleanliness
    sort_by = ["date"]
    if "Panel_1" in main_df.columns:
        sort_by.append("Panel_1")
    if "Panel_2" in main_df.columns:
        sort_by.append("Panel_2")
    main_df.sort_values(by=sort_by, inplace=True)
    main_df.reset_index(drop=True, inplace=True)

    return main_df

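# Illustrative note: because each file is left-merged onto the skeleton, a file
# with no Panel columns joins on 'date' alone, so (hypothetically) a national-level
# metric is repeated across every Panel_1/Panel_2 combination for that date.
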
# Function to categorize a column
def categorize_column(column_name):
    # Define keywords for each category
    internal_keywords = [
        "Price",
        "Discount",
        "product_price",
        "cost",
        "margin",
        "inventory",
        "sales",
        "revenue",
        "turnover",
        "expense",
    ]
    exogenous_keywords = [
        "GDP",
        "Tax",
        "Inflation",
        "interest_rate",
        "employment_rate",
        "exchange_rate",
        "consumer_spending",
        "retail_sales",
        "oil_prices",
        "weather",
    ]

    # Check if the column name matches any of the keywords for the Internal or Exogenous categories
    for keyword in internal_keywords:
        if keyword.lower() in column_name.lower():
            return "Internal"
    for keyword in exogenous_keywords:
        if keyword.lower() in column_name.lower():
            return "Exogenous"

    # Default to Media if no match is found
    return "Media"

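# Illustrative examples (hypothetical column names); matching is a
# case-insensitive substring search:
#   categorize_column("Product_Price_USD")  # -> "Internal"  (matches "price")
#   categorize_column("gdp_index")          # -> "Exogenous" (matches "gdp")
#   categorize_column("tv_impressions")     # -> "Media"     (no keyword match)
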
# Function to calculate missing stats and prepare the editable DataFrame
@st.cache_resource(show_spinner=False)
def prepare_missing_stats_df(df):
    missing_stats = []
    for column in df.columns:
        if (
            column == "date" or column == "Panel_2" or column == "Panel_1"
        ):  # Skip the date, Panel_1 and Panel_2 columns
            continue

        missing = df[column].isnull().sum()
        pct_missing = round((missing / len(df)) * 100, 2)

        # Dynamically assign a category based on the column name
        category = categorize_column(column)
        # category = "Media"  # Keep default bin as Media

        missing_stats.append(
            {
                "Column": column,
                "Missing Values": missing,
                "Missing Percentage": pct_missing,
                "Impute Method": "Fill with 0",  # Default value
                "Category": category,
            }
        )
    stats_df = pd.DataFrame(missing_stats)

    return stats_df

# Function to add the API DataFrame details to the files dictionary
@st.cache_resource(show_spinner=False)
def add_api_dataframe_to_dict(main_df, files_dict):
    files_dict["API"] = {
        "numeric": list(main_df.select_dtypes(include=["number"]).columns),
        "non_numeric": [
            col
            for col in main_df.select_dtypes(exclude=["number"]).columns
            if col.lower() != "date"
        ],
        "interval": determine_data_interval(
            pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
        ),
        "df": main_df,
    }

    return files_dict

# Function to read API data into a DataFrame, parsing the 'Date' column as datetime
@st.cache_resource(show_spinner=False)
def read_API_data():
    return pd.read_excel(r".\upf_data_converted.xlsx", parse_dates=["Date"])

# Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
def set_Panel_1_Panel_2_Selected_false():
    st.session_state["Panel_1_Panel_2_Selected"] = False

# Function to serialize and save the objects into a pickle file
@st.cache_resource(show_spinner=False)
def save_to_pickle(file_path, final_df, bin_dict):
    # Open the file in write-binary mode and dump the objects
    with open(file_path, "wb") as f:
        pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
    # Data is now saved to file

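# A minimal sketch of reading the saved objects back (assumes the same
# "data_import.pkl" layout written above; not part of the original page):
#   with open("data_import.pkl", "rb") as f:
#       saved = pickle.load(f)
#   final_df, bin_dict = saved["final_df"], saved["bin_dict"]
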
# Function to process the merged_df DataFrame based on operations defined in edited_df
@st.cache_resource(show_spinner=False)
def process_dataframes(merged_df, edited_df, edited_stats_df):
    # Ensure there are operations defined by the user
    if edited_df.empty:
        return merged_df, edited_stats_df  # No operations to apply

    # Perform operations as defined by the user
    for index, row in edited_df.iterrows():
        result_column_name = f"{row['Column 1']}{row['Operator']}{row['Column 2']}"
        col1 = row["Column 1"]
        col2 = row["Column 2"]
        op = row["Operator"]

        # Apply the specified operation
        if op == "+":
            merged_df[result_column_name] = merged_df[col1] + merged_df[col2]
        elif op == "-":
            merged_df[result_column_name] = merged_df[col1] - merged_df[col2]
        elif op == "*":
            merged_df[result_column_name] = merged_df[col1] * merged_df[col2]
        elif op == "/":
            # Avoid division by zero by replacing 0 with a tiny value
            merged_df[result_column_name] = merged_df[col1] / merged_df[col2].replace(
                0, 1e-9
            )

        # Add a summary of the operation to edited_stats_df
        new_row = {
            "Column": result_column_name,
            "Missing Values": None,
            "Missing Percentage": None,
            "Impute Method": None,
            "Category": row["Category"],
        }
        new_row_df = pd.DataFrame([new_row])

        # Use pd.concat to add the new_row_df to edited_stats_df
        edited_stats_df = pd.concat(
            [edited_stats_df, new_row_df], ignore_index=True, axis=0
        )

    # Combine column names from edited_df for cleanup
    combined_columns = set(edited_df["Column 1"]).union(set(edited_df["Column 2"]))

    # Filter out rows in edited_stats_df and drop the source columns from merged_df
    edited_stats_df = edited_stats_df[~edited_stats_df["Column"].isin(combined_columns)]
    merged_df.drop(columns=list(combined_columns), errors="ignore", inplace=True)

    return merged_df, edited_stats_df

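# Illustrative example (hypothetical row): {"Column 1": "clicks", "Operator": "/",
# "Column 2": "impressions", "Category": "Media"} creates a "clicks/impressions"
# column, appends it to the stats table, and then drops the two source columns
# from merged_df.
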
# Function to prepare a list of numeric column names and initialize an empty DataFrame with a predefined structure
@st.cache_resource(show_spinner=False)
def prepare_numeric_columns_and_default_df(merged_df, edited_stats_df):
    # Get columns categorized as 'Response Metrics'
    columns_response_metrics = edited_stats_df[
        edited_stats_df["Category"] == "Response Metrics"
    ]["Column"].tolist()

    # Filter numeric columns, excluding those categorized as 'Response Metrics'
    numeric_columns = [
        col
        for col in merged_df.select_dtypes(include=["number"]).columns
        if col not in columns_response_metrics
    ]

    # Define the structure of the empty DataFrame
    data = {
        "Column 1": pd.Series([], dtype="str"),
        "Operator": pd.Series([], dtype="str"),
        "Column 2": pd.Series([], dtype="str"),
        "Category": pd.Series([], dtype="str"),
    }
    default_df = pd.DataFrame(data)

    return numeric_columns, default_df

# Initialize 'final_df' in session state
if "final_df" not in st.session_state:
    st.session_state["final_df"] = pd.DataFrame()

# Initialize 'bin_dict' in session state
if "bin_dict" not in st.session_state:
    st.session_state["bin_dict"] = {}

# Initialize 'Panel_1_Panel_2_Selected' in session state
if "Panel_1_Panel_2_Selected" not in st.session_state:
    st.session_state["Panel_1_Panel_2_Selected"] = False


# Page Title
st.write("")  # Top padding
st.title("Data Import")

#########################################################################################################################################################
# Create a dictionary to hold all DataFrames and collect user input to specify "Panel_2" and "Panel_1" columns for each file
#########################################################################################################################################################


# Read the Excel file, parsing the 'Date' column as datetime
main_df = read_API_data()

# Convert all column names to lowercase
main_df.columns = main_df.columns.str.lower().str.strip()

# File uploader
uploaded_files = st.file_uploader(
    "Upload additional data",
    type=["xlsx"],
    accept_multiple_files=True,
    on_change=set_Panel_1_Panel_2_Selected_false,
)

# Custom HTML for upload instructions
recommendation_html = """
<div style="text-align: justify;">
    <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets, including panel, media, internal, and exogenous data, adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
</div>
"""
st.markdown(recommendation_html, unsafe_allow_html=True)

# Choose Desired Granularity
st.markdown("#### Choose Desired Granularity")
# Granularity Selection
granularity_selection = st.selectbox(
    "Choose Date Granularity",
    ["Daily", "Weekly", "Monthly"],
    label_visibility="collapsed",
    on_change=set_Panel_1_Panel_2_Selected_false,
)
granularity_selection = str(granularity_selection).lower()

# Convert files to dataframes
files_dict = files_to_dataframes(uploaded_files)

# Add API Dataframe
if main_df is not None:
    files_dict = add_api_dataframe_to_dict(main_df, files_dict)

# Display a warning message if no files have been uploaded and halt further execution
if not files_dict:
    st.warning(
        "Please upload at least one file to proceed.",
        icon="⚠️",
    )
    st.stop()  # Halts further execution until a file is uploaded

# Select Panel_1 and Panel_2 columns
st.markdown("#### Select Panel columns")
selections = {}
with st.expander("Select Panel columns", expanded=False):
    count = 0  # Initialize counter to manage the visibility of labels and keys
    for file_name, file_data in files_dict.items():
        # Determine visibility of the label based on the count
        if count == 0:
            label_visibility = "visible"
        else:
            label_visibility = "collapsed"

        # Extract non-numeric columns
        non_numeric_cols = file_data["non_numeric"]

        # Prepare Panel_1 and Panel_2 values for the dropdowns, adding "N/A" as an option
        panel1_values = non_numeric_cols + ["N/A"]
        panel2_values = non_numeric_cols + ["N/A"]

        # Skip if only one option is available
        if len(panel1_values) == 1 and len(panel2_values) == 1:
            selected_panel1, selected_panel2 = "N/A", "N/A"
            # Update the selections for Panel_1 and Panel_2 for the current file
            selections[file_name] = {
                "Panel_1": selected_panel1,
                "Panel_2": selected_panel2,
            }
            continue

        # Create layout columns for the File Name, Panel_1, and Panel_2 selections
        file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])

        with file_name_col:
            # Display the "File Name" label only for the first file
            if count == 0:
                st.write("File Name")
            else:
                st.write("")
            st.write(file_name)  # Display the file name

        with Panel_1_col:
            # Display a selectbox for Panel_1 values
            selected_panel1 = st.selectbox(
                "Select Panel Level 1",
                panel1_values,
                on_change=set_Panel_1_Panel_2_Selected_false,
                label_visibility=label_visibility,  # Control visibility of the label
                key=f"Panel_1_selectbox{count}",  # Ensure a unique key for each selectbox
            )

        with Panel_2_col:
            # Display a selectbox for Panel_2 values
            selected_panel2 = st.selectbox(
                "Select Panel Level 2",
                panel2_values,
                on_change=set_Panel_1_Panel_2_Selected_false,
                label_visibility=label_visibility,  # Control visibility of the label
                key=f"Panel_2_selectbox{count}",  # Ensure a unique key for each selectbox
            )

        # Halt processing if the same column is selected for both Panel_1 and Panel_2, due to potential data integrity issues
        if selected_panel2 == selected_panel1 and not (
            selected_panel2 == "N/A" and selected_panel1 == "N/A"
        ):
            st.warning(
                f"File: {file_name} → The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
            )
            selected_panel1, selected_panel2 = "N/A", "N/A"
            st.stop()

        # Update the selections for Panel_1 and Panel_2 for the current file
        selections[file_name] = {
            "Panel_1": selected_panel1,
            "Panel_2": selected_panel2,
        }

        count += 1  # Increment the counter after processing each file

# Accept Panel_1 and Panel_2 selection
if st.button("Accept and Process", use_container_width=True):

    # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
    with st.spinner("Processing..."):
        files_dict = standardize_data_to_daily(files_dict, selections)

        # Convert all data to the selected level of granularity
        files_dict = apply_granularity_to_all(
            files_dict, granularity_selection, selections
        )

        # Update 'files_dict' in the session state
        st.session_state["files_dict"] = files_dict

        # Set a flag in the session state to indicate that the selection has been made
        st.session_state["Panel_1_Panel_2_Selected"] = True

#########################################################################################################################################################
# Display unique Panel_1 and Panel_2 values
#########################################################################################################################################################


# Halts further execution until Panel_1 and Panel_2 columns are selected
if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
    files_dict = st.session_state["files_dict"]
else:
    st.stop()

# Sets to store unique values of Panel_1 and Panel_2
with st.spinner("Fetching Panel values..."):
    all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
        files_dict, selections
    )

# Lists of unique Panel_1 and Panel_2 column values
list_of_all_panel1_values = list(all_panel1_values)
list_of_all_panel2_values = list(all_panel2_values)

# Format Panel_1 and Panel_2 values for display
formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)

# Unique Panel_1 and Panel_2 values
st.markdown("#### Unique Panel values")
# Display Panel_1 and Panel_2 values
with st.expander("Unique Panel values"):
    st.write("")
    st.markdown(
        f"""
        <style>
        .justify-text {{
            text-align: justify;
        }}
        </style>
        <div class="justify-text">
            <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
            <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
        </div>
        """,
        unsafe_allow_html=True,
    )

    # Display the total number of Panel_1 and Panel_2 values
    st.write("")
    st.markdown(
        f"""
        <div style="text-align: justify;">
            <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
            <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
        </div>
        """,
        unsafe_allow_html=True,
    )
    st.write("")

#########################################################################################################################################################
# Merge all DataFrames
#########################################################################################################################################################


# Merge all selected DataFrames
main_df = create_main_dataframe(
    files_dict, all_panel1_values, all_panel2_values, granularity_selection
)
merged_df = merge_into_main_df(main_df, files_dict, selections)

#########################################################################################################################################################
# Categorize Variables and Impute Missing Values
#########################################################################################################################################################


# Create an editable DataFrame in Streamlit
st.markdown("#### Select Variables Category & Impute Missing Values")

# Prepare the missing stats DataFrame for editing
missing_stats_df = prepare_missing_stats_df(merged_df)

edited_stats_df = st.data_editor(
    missing_stats_df,
    column_config={
        "Impute Method": st.column_config.SelectboxColumn(
            options=[
                "Drop Column",
                "Fill with Mean",
                "Fill with Median",
                "Fill with 0",
            ],
            required=True,
            default="Fill with 0",
        ),
        "Category": st.column_config.SelectboxColumn(
            options=[
                "Media",
                "Exogenous",
                "Internal",
                "Response Metrics",
            ],
            required=True,
            default="Media",
        ),
    },
    disabled=["Column", "Missing Values", "Missing Percentage"],
    hide_index=True,
    use_container_width=True,
)

# Apply changes based on the edited DataFrame
for i, row in edited_stats_df.iterrows():
    column = row["Column"]
    if row["Impute Method"] == "Drop Column":
        merged_df.drop(columns=[column], inplace=True)

    elif row["Impute Method"] == "Fill with Mean":
        merged_df[column].fillna(merged_df[column].mean(), inplace=True)

    elif row["Impute Method"] == "Fill with Median":
        merged_df[column].fillna(merged_df[column].median(), inplace=True)

    elif row["Impute Method"] == "Fill with 0":
        merged_df[column].fillna(0, inplace=True)

#########################################################################################################################################################
# Group columns
#########################################################################################################################################################


# Display the Group columns header
st.markdown("#### Feature engineering")

# Prepare the numeric columns and an empty DataFrame for user input
numeric_columns, default_df = prepare_numeric_columns_and_default_df(
    merged_df, edited_stats_df
)

# Display the editable DataFrame
edited_df = st.data_editor(
    default_df,
    column_config={
        "Column 1": st.column_config.SelectboxColumn(
            options=numeric_columns,
            required=True,
            default=numeric_columns[0],
            width=400,
        ),
        "Operator": st.column_config.SelectboxColumn(
            options=["+", "-", "*", "/"],
            required=True,
            default="+",
            width=100,
        ),
        "Column 2": st.column_config.SelectboxColumn(
            options=numeric_columns,
            required=True,
            default=numeric_columns[0],
            width=400,
        ),
        "Category": st.column_config.SelectboxColumn(
            options=[
                "Media",
                "Exogenous",
                "Internal",
                "Response Metrics",
            ],
            required=True,
            default="Media",
            width=200,
        ),
    },
    num_rows="dynamic",
)

# Process the DataFrame based on user inputs and the operations specified in edited_df
final_df, edited_stats_df = process_dataframes(merged_df, edited_df, edited_stats_df)

#########################################################################################################################################################
# Display the Final DataFrame and variables
#########################################################################################################################################################


# Display the Final DataFrame and variables
st.markdown("#### Final DataFrame")
st.dataframe(final_df, hide_index=True)

# Initialize an empty dictionary to hold categories and their variables
category_dict = {}

# Iterate over each row in the edited DataFrame to populate the dictionary
for i, row in edited_stats_df.iterrows():
    column = row["Column"]
    category = row["Category"]  # The category chosen by the user for this variable

    # Check if the category already exists in the dictionary
    if category not in category_dict:
        # If not, initialize it with the current column as its first element
        category_dict[category] = [column]
    else:
        # If it exists, append the current column to the list of variables under this category
        category_dict[category].append(column)

# Add Date, Panel_1 and Panel_2 to the category dictionary
category_dict.update({"Date": ["date"]})
if "Panel_1" in final_df.columns:
    category_dict["Panel Level 1"] = ["Panel_1"]
if "Panel_2" in final_df.columns:
    category_dict["Panel Level 2"] = ["Panel_2"]

# Display the dictionary
st.markdown("#### Variable Category")
for category, variables in category_dict.items():
    # Check if there are multiple variables to handle "and" insertion correctly
    if len(variables) > 1:
        # Join all but the last variable with ", ", then add " and " before the last variable
        variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
    else:
        # If there's only one variable, no need for "and"
        variables_str = variables[0]

    # Display the category and its variables in the desired format
    st.markdown(
        f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
        unsafe_allow_html=True,
    )

# Check that a Response Metrics column has been selected
st.write("")
response_metrics_col = category_dict.get("Response Metrics", [])
if len(response_metrics_col) == 0:
    st.warning("Please select a Response Metrics column", icon="⚠️")
    st.stop()
# elif len(response_metrics_col) > 1:
#     st.warning("Please select only one Response Metrics column", icon="⚠️")
#     st.stop()

# Store the final DataFrame and the bin dictionary in the session state
st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict

# Save the DataFrame and dictionary from the session state to a pickle file
if st.button("Accept and Save", use_container_width=True):
    save_to_pickle(
        "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
    )
    st.toast("💾 Saved Successfully!")