BlendMMM committed
Commit 13a2588 · verified · 1 Parent(s): 4043b0d

Delete Data_Import .py

Files changed (1)
  1. Data_Import .py +0 -1019
Data_Import .py DELETED
@@ -1,1019 +0,0 @@
# Importing necessary libraries
import streamlit as st

st.set_page_config(
    page_title="Data Import",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state="collapsed",
)

import pickle
import pandas as pd
from utilities import set_header, load_local_css
import streamlit_authenticator as stauth
import yaml
from yaml import SafeLoader

load_local_css("styles.css")
set_header()


# Re-save session state values so they persist across page switches
for k, v in st.session_state.items():
    if k not in ["logout", "login", "config"] and not k.startswith("FormSubmitter"):
        st.session_state[k] = v

with open("config.yaml") as file:
    config = yaml.load(file, Loader=SafeLoader)
    st.session_state["config"] = config

authenticator = stauth.Authenticate(
    config["credentials"],
    config["cookie"]["name"],
    config["cookie"]["key"],
    config["cookie"]["expiry_days"],
    config["preauthorized"],
)
st.session_state["authenticator"] = authenticator
name, authentication_status, username = authenticator.login("Login", "main")
auth_status = st.session_state.get("authentication_status")

if auth_status:
    authenticator.logout("Logout", "main")
    is_state_initialized = st.session_state.get("initialized", False)

    if not is_state_initialized:
        if "session_name" not in st.session_state:
            st.session_state["session_name"] = None

    # Function to validate the date column in a dataframe
    def validate_date_column(df):
        try:
            # Attempt to convert the 'date' column to datetime
            df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
            return True
        except Exception:
            return False
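
    # Illustrative check (hypothetical one-column frames) of the DD-MM-YYYY rule:
    # >>> validate_date_column(pd.DataFrame({"date": ["01-01-2023", "08-01-2023"]}))
    # True
    # >>> validate_date_column(pd.DataFrame({"date": ["2023/01/01"]}))
    # False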


    # Function to determine data interval
    def determine_data_interval(common_freq):
        if common_freq == 1:
            return "daily"
        elif common_freq == 7:
            return "weekly"
        elif 28 <= common_freq <= 31:
            return "monthly"
        else:
            return "irregular"
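
    # Illustrative mapping (doctest-style), based on the modal day-gap between dates:
    # >>> determine_data_interval(7)
    # 'weekly'
    # >>> determine_data_interval(3)
    # 'irregular'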


    # Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
    @st.cache_resource(show_spinner=False)
    def files_to_dataframes(uploaded_files):
        df_dict = {}
        for uploaded_file in uploaded_files:
            # Extract file name without extension
            file_name = uploaded_file.name.rsplit(".", 1)[0]

            # Check for duplicate file names
            if file_name in df_dict:
                st.warning(
                    f"Duplicate File: {file_name}. This file will be skipped.",
                    icon="⚠️",
                )
                continue

            # Read the file into a DataFrame
            df = pd.read_excel(uploaded_file)

            # Convert all column names to lowercase
            df.columns = df.columns.str.lower().str.strip()

            # Separate numeric and non-numeric columns
            numeric_cols = list(df.select_dtypes(include=["number"]).columns)
            non_numeric_cols = [
                col
                for col in df.select_dtypes(exclude=["number"]).columns
                if col.lower() != "date"
            ]

            # Check for a valid 'date' column and at least one numeric column
            if not (validate_date_column(df) and len(numeric_cols) > 0):
                st.warning(
                    f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
                    icon="⚠️",
                )
                continue

            # Determine the modal gap (in days) between consecutive dates
            common_freq = (
                pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
            )
            # Calculate the data interval (daily, weekly, monthly or irregular)
            interval = determine_data_interval(common_freq)
            if interval == "irregular":
                st.warning(
                    f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
                    icon="⚠️",
                )
                continue

            # Store the DataFrame and its metadata in the dictionary under the file name
            df_dict[file_name] = {
                "numeric": numeric_cols,
                "non_numeric": non_numeric_cols,
                "interval": interval,
                "df": df,
            }

        return df_dict


    # Function to adjust dataframe granularity
    def adjust_dataframe_granularity(df, current_granularity, target_granularity):
        # Set index
        df.set_index("date", inplace=True)

        # Define aggregation rules for resampling
        aggregation_rules = {
            col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
            for col in df.columns
        }

        # Initialize resampled_df
        resampled_df = df
        if current_granularity == "daily" and target_granularity == "weekly":
            resampled_df = df.resample("W-MON", closed="left", label="left").agg(
                aggregation_rules
            )

        elif current_granularity == "daily" and target_granularity == "monthly":
            resampled_df = df.resample("MS", closed="left", label="left").agg(
                aggregation_rules
            )

        elif current_granularity == "daily" and target_granularity == "daily":
            resampled_df = df.resample("D").agg(aggregation_rules)

        elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
            # For higher to lower granularity, distribute numeric values equally and replicate non-numeric values across the new period
            expanded_data = []
            for _, row in df.iterrows():
                if current_granularity == "weekly":
                    period_range = pd.date_range(start=row.name, periods=7)
                elif current_granularity == "monthly":
                    period_range = pd.date_range(
                        start=row.name, periods=row.name.days_in_month
                    )

                for date in period_range:
                    new_row = {}
                    for col in df.columns:
                        if pd.api.types.is_numeric_dtype(df[col]):
                            if current_granularity == "weekly":
                                new_row[col] = row[col] / 7
                            elif current_granularity == "monthly":
                                new_row[col] = row[col] / row.name.days_in_month
                        else:
                            new_row[col] = row[col]
                    expanded_data.append((date, new_row))

            resampled_df = pd.DataFrame(
                [data for _, data in expanded_data],
                index=[date for date, _ in expanded_data],
            )

        # Reset index
        resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})

        return resampled_df
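
    # Illustrative behaviour (hypothetical single-column frame):
    # daily -> weekly sums each Monday-anchored bin, so fourteen daily rows of 5
    # collapse to two "W-MON" rows of 35; weekly -> daily expands one weekly row
    # of 70 into seven daily rows of 10 (non-numeric values are replicated).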


    # Function to clean and extract unique values of Panel_1 and Panel_2
    @st.cache_resource(show_spinner=False)
    def clean_and_extract_unique_values(files_dict, selections):
        all_panel1_values = set()
        all_panel2_values = set()

        for file_name, file_data in files_dict.items():
            df = file_data["df"]

            # 'Panel_1' and 'Panel_2' selections
            selected_panel1 = selections[file_name].get("Panel_1")
            selected_panel2 = selections[file_name].get("Panel_2")

            # Clean and standardize Panel_1 column if it exists and is selected
            if (
                selected_panel1
                and selected_panel1 != "N/A"
                and selected_panel1 in df.columns
            ):
                df[selected_panel1] = (
                    df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
                )
                all_panel1_values.update(df[selected_panel1].dropna().unique())

            # Clean and standardize Panel_2 column if it exists and is selected
            if (
                selected_panel2
                and selected_panel2 != "N/A"
                and selected_panel2 in df.columns
            ):
                df[selected_panel2] = (
                    df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
                )
                all_panel2_values.update(df[selected_panel2].dropna().unique())

            # Update the processed DataFrame back in the dictionary
            files_dict[file_name]["df"] = df

        return all_panel1_values, all_panel2_values


    # Function to format values for display
    @st.cache_resource(show_spinner=False)
    def format_values_for_display(values_list):
        # Capitalize the first letter of each word and replace underscores with spaces
        formatted_list = [value.replace("_", " ").title() for value in values_list]
        # Join values with commas and 'and' before the last value
        if len(formatted_list) > 1:
            return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
        elif formatted_list:
            return formatted_list[0]
        return "No values available"


    # Function to normalize all data within files_dict to a daily granularity
    @st.cache(show_spinner=False, allow_output_mutation=True)
    def standardize_data_to_daily(files_dict, selections):
        # Normalize all data to a daily granularity using a provided function
        files_dict = apply_granularity_to_all(files_dict, "daily", selections)

        # Update the "interval" attribute for each dataset to indicate the new granularity
        for files_name, files_data in files_dict.items():
            files_data["interval"] = "daily"

        return files_dict


    # Function to apply granularity transformation to all DataFrames in files_dict
    @st.cache_resource(show_spinner=False)
    def apply_granularity_to_all(files_dict, granularity_selection, selections):
        for file_name, file_data in files_dict.items():
            df = file_data["df"].copy()

            # Handling when Panel_1 or Panel_2 might be 'N/A'
            selected_panel1 = selections[file_name].get("Panel_1")
            selected_panel2 = selections[file_name].get("Panel_2")

            # Correcting the segment selection logic & handling 'N/A'
            if selected_panel1 != "N/A" and selected_panel2 != "N/A":
                unique_combinations = df[
                    [selected_panel1, selected_panel2]
                ].drop_duplicates()
            elif selected_panel1 != "N/A":
                unique_combinations = df[[selected_panel1]].drop_duplicates()
                selected_panel2 = None  # Ensure Panel_2 is ignored if N/A
            elif selected_panel2 != "N/A":
                unique_combinations = df[[selected_panel2]].drop_duplicates()
                selected_panel1 = None  # Ensure Panel_1 is ignored if N/A
            else:
                # If both are 'N/A', process the entire dataframe as is
                df = adjust_dataframe_granularity(
                    df, file_data["interval"], granularity_selection
                )
                files_dict[file_name]["df"] = df
                continue  # Skip to the next file

            transformed_segments = []
            for _, combo in unique_combinations.iterrows():
                if selected_panel1 and selected_panel2:
                    segment = df[
                        (df[selected_panel1] == combo[selected_panel1])
                        & (df[selected_panel2] == combo[selected_panel2])
                    ]
                elif selected_panel1:
                    segment = df[df[selected_panel1] == combo[selected_panel1]]
                elif selected_panel2:
                    segment = df[df[selected_panel2] == combo[selected_panel2]]

                # Adjust granularity of the segment
                transformed_segment = adjust_dataframe_granularity(
                    segment, file_data["interval"], granularity_selection
                )
                transformed_segments.append(transformed_segment)

            # Combine all transformed segments into a single DataFrame for this file
            transformed_df = pd.concat(transformed_segments, ignore_index=True)
            files_dict[file_name]["df"] = transformed_df

        return files_dict


    # Function to create the main dataframe structure
    @st.cache_resource(show_spinner=False)
    def create_main_dataframe(
        files_dict, all_panel1_values, all_panel2_values, granularity_selection
    ):
        # Determine the global start and end dates across all DataFrames
        global_start = min(
            file_data["df"]["date"].min() for file_data in files_dict.values()
        )
        global_end = max(
            file_data["df"]["date"].max() for file_data in files_dict.values()
        )

        # Adjust the date_range generation based on the granularity_selection
        if granularity_selection == "weekly":
            # Generate a weekly range, with weeks starting on Monday
            date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
        elif granularity_selection == "monthly":
            # Generate a monthly range, starting from the first day of each month
            date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
        else:  # Default to daily if not weekly or monthly
            date_range = pd.date_range(start=global_start, end=global_end, freq="D")

        # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
        all_panel1s = all_panel1_values
        all_panel2s = all_panel2_values

        # Dynamically build the list of dimensions (Panel_1, Panel_2) to include in the main DataFrame based on availability
        dimensions, merge_keys = [], []
        if all_panel1s:
            dimensions.append(all_panel1s)
            merge_keys.append("Panel_1")
        if all_panel2s:
            dimensions.append(all_panel2s)
            merge_keys.append("Panel_2")

        dimensions.append(date_range)  # Date range is always included
        merge_keys.append("date")

        # Create a main DataFrame template with the dimensions
        main_df = pd.MultiIndex.from_product(
            dimensions,
            names=[name for name, _ in zip(merge_keys, dimensions)],
        ).to_frame(index=False)

        return main_df.reset_index(drop=True)
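
    # Illustrative scaffold (hypothetical values): two Panel_1 values and three dates
    # yield a six-row template covering every (Panel_1, date) combination:
    # >>> pd.MultiIndex.from_product(
    # ...     [{"a", "b"}, pd.date_range("2023-01-02", periods=3)],
    # ...     names=["Panel_1", "date"],
    # ... ).to_frame(index=False).shape
    # (6, 2)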


    # Function to prepare and merge DataFrames
    @st.cache_resource(show_spinner=False)
    def merge_into_main_df(main_df, files_dict, selections):
        for file_name, file_data in files_dict.items():
            df = file_data["df"].copy()

            # Rename selected Panel_1 and Panel_2 columns if not 'N/A'
            selected_panel1 = selections[file_name].get("Panel_1", "N/A")
            selected_panel2 = selections[file_name].get("Panel_2", "N/A")
            if selected_panel1 != "N/A":
                df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
            if selected_panel2 != "N/A":
                df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)

            # Merge current DataFrame into main_df based on 'date', and where applicable, 'Panel_1' and 'Panel_2'
            merge_keys = ["date"]
            if "Panel_1" in df.columns:
                merge_keys.append("Panel_1")
            if "Panel_2" in df.columns:
                merge_keys.append("Panel_2")
            main_df = pd.merge(main_df, df, on=merge_keys, how="left")

        # After all merges, sort by 'date' and reset index for cleanliness
        sort_by = ["date"]
        if "Panel_1" in main_df.columns:
            sort_by.append("Panel_1")
        if "Panel_2" in main_df.columns:
            sort_by.append("Panel_2")
        main_df.sort_values(by=sort_by, inplace=True)
        main_df.reset_index(drop=True, inplace=True)

        return main_df
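
    # Illustrative behaviour (hypothetical file): a dataset whose panel column was
    # renamed to Panel_1 left-joins on ["date", "Panel_1"], so any (date, panel)
    # combination missing from that file shows up as NaNs in the template rather
    # than dropping rows from it.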


    # Function to categorize column
    def categorize_column(column_name):
        # Define keywords for each category
        internal_keywords = [
            "Price",
            "Discount",
            "product_price",
            "cost",
            "margin",
            "inventory",
            "sales",
            "revenue",
            "turnover",
            "expense",
        ]
        exogenous_keywords = [
            "GDP",
            "Tax",
            "Inflation",
            "interest_rate",
            "employment_rate",
            "exchange_rate",
            "consumer_spending",
            "retail_sales",
            "oil_prices",
            "weather",
        ]

        # Check if the column name matches any of the keywords for Internal or Exogenous categories
        for keyword in internal_keywords:
            if keyword.lower() in column_name.lower():
                return "Internal"
        for keyword in exogenous_keywords:
            if keyword.lower() in column_name.lower():
                return "Exogenous"

        # Default to Media if no match is found
        return "Media"
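
    # Illustrative matches (hypothetical column names); matching is a
    # case-insensitive substring check:
    # >>> categorize_column("Product_Price_USD")
    # 'Internal'
    # >>> categorize_column("gdp_index")
    # 'Exogenous'
    # >>> categorize_column("tv_impressions")
    # 'Media'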


    # Function to calculate missing stats and prepare an editable DataFrame
    @st.cache_resource(show_spinner=False)
    def prepare_missing_stats_df(df):
        missing_stats = []
        for column in df.columns:
            # Skip the date, Panel_1 and Panel_2 columns
            if column in ("date", "Panel_1", "Panel_2"):
                continue

            missing = df[column].isnull().sum()
            pct_missing = round((missing / len(df)) * 100, 2)

            # Dynamically assign category based on column name
            category = categorize_column(column)
            # category = "Media"  # Keep default bin as Media

            missing_stats.append(
                {
                    "Column": column,
                    "Missing Values": missing,
                    "Missing Percentage": pct_missing,
                    "Impute Method": "Fill with 0",  # Default value
                    "Category": category,
                }
            )
        stats_df = pd.DataFrame(missing_stats)

        return stats_df


    # Function to add API DataFrame details to the files dictionary
    @st.cache_resource(show_spinner=False)
    def add_api_dataframe_to_dict(main_df, files_dict):
        files_dict["API"] = {
            "numeric": list(main_df.select_dtypes(include=["number"]).columns),
            "non_numeric": [
                col
                for col in main_df.select_dtypes(exclude=["number"]).columns
                if col.lower() != "date"
            ],
            "interval": determine_data_interval(
                pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
            ),
            "df": main_df,
        }

        return files_dict


    # Function to read API data into a DataFrame, parsing specified columns as datetime
    @st.cache_resource(show_spinner=False)
    def read_API_data():
        return pd.read_excel(
            r".\upf_data_converted_randomized_resp_metrics.xlsx", parse_dates=["Date"]
        )


    # Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
    def set_Panel_1_Panel_2_Selected_false():
        st.session_state["Panel_1_Panel_2_Selected"] = False


    # Function to serialize and save the objects into a pickle file
    @st.cache_resource(show_spinner=False)
    def save_to_pickle(file_path, final_df, bin_dict):
        # Open the file in write-binary mode and dump the objects
        with open(file_path, "wb") as f:
            pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
        # Data is now saved to file
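
    # Illustrative round-trip (hypothetical downstream page): the saved pickle
    # can be restored with
    # >>> with open("data_import.pkl", "rb") as f:
    # ...     saved = pickle.load(f)
    # >>> sorted(saved.keys())
    # ['bin_dict', 'final_df']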


    # Function to process the merged_df DataFrame based on operations defined in edited_df
    @st.cache_resource(show_spinner=False)
    def process_dataframes(merged_df, edited_df, edited_stats_df):
        # Ensure there are operations defined by the user
        if edited_df.empty:
            return merged_df, edited_stats_df  # No operations to apply

        # Perform operations as defined by the user
        for index, row in edited_df.iterrows():
            result_column_name = f"{row['Column 1']}{row['Operator']}{row['Column 2']}"
            col1 = row["Column 1"]
            col2 = row["Column 2"]
            op = row["Operator"]

            # Apply the specified operation
            if op == "+":
                merged_df[result_column_name] = merged_df[col1] + merged_df[col2]
            elif op == "-":
                merged_df[result_column_name] = merged_df[col1] - merged_df[col2]
            elif op == "*":
                merged_df[result_column_name] = merged_df[col1] * merged_df[col2]
            elif op == "/":
                merged_df[result_column_name] = merged_df[col1] / merged_df[col2].replace(
                    0, 1e-9
                )

            # Add a summary of the operation to edited_stats_df
            new_row = {
                "Column": result_column_name,
                "Missing Values": None,
                "Missing Percentage": None,
                "Impute Method": None,
                "Category": row["Category"],
            }
            new_row_df = pd.DataFrame([new_row])

            # Use pd.concat to add the new_row_df to edited_stats_df
            edited_stats_df = pd.concat(
                [edited_stats_df, new_row_df], ignore_index=True, axis=0
            )

        # Combine column names from edited_df for cleanup
        combined_columns = set(edited_df["Column 1"]).union(set(edited_df["Column 2"]))

        # Filter out rows in edited_stats_df and drop columns from merged_df
        edited_stats_df = edited_stats_df[
            ~edited_stats_df["Column"].isin(combined_columns)
        ]
        merged_df.drop(columns=list(combined_columns), errors="ignore", inplace=True)

        return merged_df, edited_stats_df
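
    # Illustrative operation (hypothetical columns): a row
    # {"Column 1": "clicks", "Operator": "+", "Column 2": "views", "Category": "Media"}
    # creates a new "clicks+views" column and appends a stats row for it; the
    # source columns "clicks" and "views" are then dropped from both frames.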


    # Function to prepare a list of numeric column names and initialize an empty DataFrame with a predefined structure
    @st.cache_resource(show_spinner=False)
    def prepare_numeric_columns_and_default_df(merged_df, edited_stats_df):
        # Get columns categorized as 'Response Metrics'
        columns_response_metrics = edited_stats_df[
            edited_stats_df["Category"] == "Response Metrics"
        ]["Column"].tolist()

        # Filter numeric columns, excluding those categorized as 'Response Metrics'
        numeric_columns = [
            col
            for col in merged_df.select_dtypes(include=["number"]).columns
            if col not in columns_response_metrics
        ]

        # Define the structure of the empty DataFrame
        data = {
            "Column 1": pd.Series([], dtype="str"),
            "Operator": pd.Series([], dtype="str"),
            "Column 2": pd.Series([], dtype="str"),
            "Category": pd.Series([], dtype="str"),
        }
        default_df = pd.DataFrame(data)

        return numeric_columns, default_df


    # Initialize 'final_df' in session state
    if "final_df" not in st.session_state:
        st.session_state["final_df"] = pd.DataFrame()

    # Initialize 'bin_dict' in session state
    if "bin_dict" not in st.session_state:
        st.session_state["bin_dict"] = {}

    # Initialize 'Panel_1_Panel_2_Selected' in session state
    if "Panel_1_Panel_2_Selected" not in st.session_state:
        st.session_state["Panel_1_Panel_2_Selected"] = False


    # Page Title
    st.write("")  # Top padding
    st.title("Data Import")


    #########################################################################################################################################################
    # Create a dictionary to hold all DataFrames and collect user input to specify "Panel_1" and "Panel_2" columns for each file
    #########################################################################################################################################################


    # Read the Excel file, parsing the 'Date' column as datetime
    main_df = read_API_data()

    # Convert all column names to lowercase
    main_df.columns = main_df.columns.str.lower().str.strip()

    # File uploader
    uploaded_files = st.file_uploader(
        "Upload additional data",
        type=["xlsx"],
        accept_multiple_files=True,
        on_change=set_Panel_1_Panel_2_Selected_false,
    )

    # Custom HTML for upload instructions
    recommendation_html = """
    <div style="text-align: justify;">
        <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets, including panel, media, internal, and exogenous data, adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
    </div>
    """
    st.markdown(recommendation_html, unsafe_allow_html=True)

    # Choose Desired Granularity
    st.markdown("#### Choose Desired Granularity")
    # Granularity Selection
    granularity_selection = st.selectbox(
        "Choose Date Granularity",
        ["Daily", "Weekly", "Monthly"],
        label_visibility="collapsed",
        on_change=set_Panel_1_Panel_2_Selected_false,
    )
    granularity_selection = str(granularity_selection).lower()

    # Convert files to dataframes
    files_dict = files_to_dataframes(uploaded_files)

    # Add API Dataframe
    if main_df is not None:
        files_dict = add_api_dataframe_to_dict(main_df, files_dict)

    # Display a warning message if no files have been uploaded and halt further execution
    if not files_dict:
        st.warning(
            "Please upload at least one file to proceed.",
            icon="⚠️",
        )
        st.stop()  # Halts further execution until a file is uploaded


    # Select Panel_1 and Panel_2 columns
    st.markdown("#### Select Panel columns")
    selections = {}
    with st.expander("Select Panel columns", expanded=False):
        count = 0  # Initialize counter to manage the visibility of labels and keys
        for file_name, file_data in files_dict.items():
            # Determine visibility of the label based on the count
            if count == 0:
                label_visibility = "visible"
            else:
                label_visibility = "collapsed"

            # Extract non-numeric columns
            non_numeric_cols = file_data["non_numeric"]

            # Prepare Panel_1 and Panel_2 values for the dropdowns, adding "N/A" as an option
            panel1_values = non_numeric_cols + ["N/A"]
            panel2_values = non_numeric_cols + ["N/A"]

            # Skip if only one option is available
            if len(panel1_values) == 1 and len(panel2_values) == 1:
                selected_panel1, selected_panel2 = "N/A", "N/A"
                # Update the selections for Panel_1 and Panel_2 for the current file
                selections[file_name] = {
                    "Panel_1": selected_panel1,
                    "Panel_2": selected_panel2,
                }
                continue

            # Create layout columns for File Name, Panel_1, and Panel_2 selections
            file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])

            with file_name_col:
                # Display the "File Name" label only for the first file
                if count == 0:
                    st.write("File Name")
                else:
                    st.write("")
                st.write(file_name)  # Display the file name

            with Panel_1_col:
                # Display a selectbox for Panel_1 values
                selected_panel1 = st.selectbox(
                    "Select Panel Level 1",
                    panel1_values,
                    on_change=set_Panel_1_Panel_2_Selected_false,
                    label_visibility=label_visibility,  # Control visibility of the label
                    key=f"Panel_1_selectbox{count}",  # Ensure unique key for each selectbox
                )

            with Panel_2_col:
                # Display a selectbox for Panel_2 values
                selected_panel2 = st.selectbox(
                    "Select Panel Level 2",
                    panel2_values,
                    on_change=set_Panel_1_Panel_2_Selected_false,
                    label_visibility=label_visibility,  # Control visibility of the label
                    key=f"Panel_2_selectbox{count}",  # Ensure unique key for each selectbox
                )

            # Stop processing if the same column is selected for both Panel_1 and Panel_2, to avoid data integrity issues
            if selected_panel2 == selected_panel1 and not (
                selected_panel2 == "N/A" and selected_panel1 == "N/A"
            ):
                st.warning(
                    f"File: {file_name} → The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
                )
                selected_panel1, selected_panel2 = "N/A", "N/A"
                st.stop()

            # Update the selections for Panel_1 and Panel_2 for the current file
            selections[file_name] = {
                "Panel_1": selected_panel1,
                "Panel_2": selected_panel2,
            }

            count += 1  # Increment the counter after processing each file

    # Accept Panel_1 and Panel_2 selection
    if st.button("Accept and Process", use_container_width=True):
        # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
        with st.spinner("Processing..."):
            files_dict = standardize_data_to_daily(files_dict, selections)

            # Convert all data to the selected level of granularity
            files_dict = apply_granularity_to_all(
                files_dict, granularity_selection, selections
            )

            # Update the 'files_dict' in the session state
            st.session_state["files_dict"] = files_dict

            # Set a flag in the session state to indicate that the selection has been made
            st.session_state["Panel_1_Panel_2_Selected"] = True
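
    # Illustrative flow (hypothetical upload): a weekly file with a "monthly"
    # target is first expanded to daily rows (weekly values split across 7 days),
    # then resampled into "MS" bins, so mixed-granularity uploads stay consistent.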
770
-
771
- #########################################################################################################################################################
772
- # Display unique Panel_1 and Panel_2 values
773
- #########################################################################################################################################################
774
-
775
-
776
- # Halts further execution until Panel_1 and Panel_2 columns are selected
777
- if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
778
- files_dict = st.session_state["files_dict"]
779
- else:
780
- st.stop()
781
-
782
- # Set to store unique values of Panel_1 and Panel_2
783
- with st.spinner("Fetching Panel values..."):
784
- all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
785
- files_dict, selections
786
- )
787
-
788
- # List of Panel_1 and Panel_2 columns unique values
789
- list_of_all_panel1_values = list(all_panel1_values)
790
- list_of_all_panel2_values = list(all_panel2_values)
791
-
792
- # Format Panel_1 and Panel_2 values for display
793
- formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
794
- formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)
795
-
796
- # Unique Panel_1 and Panel_2 values
797
- st.markdown("#### Unique Panel values")
798
- # Display Panel_1 and Panel_2 values
799
- with st.expander("Unique Panel values"):
800
- st.write("")
801
- st.markdown(
802
- f"""
803
- <style>
804
- .justify-text {{
805
- text-align: justify;
806
- }}
807
- </style>
808
- <div class="justify-text">
809
- <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
810
- <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
811
- </div>
812
- """,
813
- unsafe_allow_html=True,
814
- )
815
-
816
- # Display total Panel_1 and Panel_2
817
- st.write("")
818
- st.markdown(
819
- f"""
820
- <div style="text-align: justify;">
821
- <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
822
- <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
823
- </div>
824
- """,
825
- unsafe_allow_html=True,
826
- )
827
- st.write("")
828
-


    #########################################################################################################################################################
    # Merge all DataFrames
    #########################################################################################################################################################


    # Merge all selected DataFrames
    main_df = create_main_dataframe(
        files_dict, all_panel1_values, all_panel2_values, granularity_selection
    )
    merged_df = merge_into_main_df(main_df, files_dict, selections)


    #########################################################################################################################################################
    # Categorize Variables and Impute Missing Values
    #########################################################################################################################################################


    # Create an editable DataFrame in Streamlit
    st.markdown("#### Select Variables Category & Impute Missing Values")

    # Prepare the missing-stats DataFrame for editing
    missing_stats_df = prepare_missing_stats_df(merged_df)

    edited_stats_df = st.data_editor(
        missing_stats_df,
        column_config={
            "Impute Method": st.column_config.SelectboxColumn(
                options=[
                    "Drop Column",
                    "Fill with Mean",
                    "Fill with Median",
                    "Fill with 0",
                ],
                required=True,
                default="Fill with 0",
            ),
            "Category": st.column_config.SelectboxColumn(
                options=[
                    "Media",
                    "Exogenous",
                    "Internal",
                    "Response Metrics",
                ],
                required=True,
                default="Media",
            ),
        },
        disabled=["Column", "Missing Values", "Missing Percentage"],
        hide_index=True,
        use_container_width=True,
    )

    # Apply changes based on the edited DataFrame
    for i, row in edited_stats_df.iterrows():
        column = row["Column"]
        if row["Impute Method"] == "Drop Column":
            merged_df.drop(columns=[column], inplace=True)

        elif row["Impute Method"] == "Fill with Mean":
            merged_df[column].fillna(merged_df[column].mean(), inplace=True)

        elif row["Impute Method"] == "Fill with Median":
            merged_df[column].fillna(merged_df[column].median(), inplace=True)

        elif row["Impute Method"] == "Fill with 0":
            merged_df[column].fillna(0, inplace=True)
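
    # Illustrative effect (hypothetical column): with "Fill with Mean", values
    # [1.0, NaN, 3.0] become [1.0, 2.0, 3.0]; "Drop Column" removes the column
    # from merged_df altogether.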


    #########################################################################################################################################################
    # Group columns
    #########################################################################################################################################################


    # Display the Feature engineering header
    st.markdown("#### Feature engineering")

    # Prepare the numeric columns and an empty DataFrame for user input
    numeric_columns, default_df = prepare_numeric_columns_and_default_df(
        merged_df, edited_stats_df
    )

    # Display an editable DataFrame
    edited_df = st.data_editor(
        default_df,
        column_config={
            "Column 1": st.column_config.SelectboxColumn(
                options=numeric_columns,
                required=True,
                default=numeric_columns[0],
                width=400,
            ),
            "Operator": st.column_config.SelectboxColumn(
                options=["+", "-", "*", "/"],
                required=True,
                default="+",
                width=100,
            ),
            "Column 2": st.column_config.SelectboxColumn(
                options=numeric_columns,
                required=True,
                default=numeric_columns[0],
                width=400,
            ),
            "Category": st.column_config.SelectboxColumn(
                options=[
                    "Media",
                    "Exogenous",
                    "Internal",
                    "Response Metrics",
                ],
                required=True,
                default="Media",
                width=200,
            ),
        },
        num_rows="dynamic",
    )

    # Process the DataFrame based on user inputs and operations specified in edited_df
    final_df, edited_stats_df = process_dataframes(
        merged_df, edited_df, edited_stats_df
    )


    #########################################################################################################################################################
    # Display the Final DataFrame and variables
    #########################################################################################################################################################


    # Display the Final DataFrame and variables
    st.markdown("#### Final DataFrame")
    st.dataframe(final_df, hide_index=True)

    # Initialize an empty dictionary to hold categories and their variables
    category_dict = {}

    # Iterate over each row in the edited DataFrame to populate the dictionary
    for i, row in edited_stats_df.iterrows():
        column = row["Column"]
        category = row["Category"]  # The category chosen by the user for this variable

        # Check if the category already exists in the dictionary
        if category not in category_dict:
            # If not, initialize it with the current column as its first element
            category_dict[category] = [column]
        else:
            # If it exists, append the current column to the list of variables under this category
            category_dict[category].append(column)

    # Add Date, Panel_1 and Panel_2 to the category dictionary
    category_dict.update({"Date": ["date"]})
    if "Panel_1" in final_df.columns:
        category_dict["Panel Level 1"] = ["Panel_1"]
    if "Panel_2" in final_df.columns:
        category_dict["Panel Level 2"] = ["Panel_2"]

    # Display the dictionary
    st.markdown("#### Variable Category")
    for category, variables in category_dict.items():
        # Check if there are multiple variables to handle "and" insertion correctly
        if len(variables) > 1:
            # Join all but the last variable with ", ", then add " and " before the last variable
            variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
        else:
            # If there's only one variable, no need for "and"
            variables_str = variables[0]

        # Display the category and its variables in the desired format
        st.markdown(
            f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
            unsafe_allow_html=True,
        )

    # Check that a Response Metrics column has been selected
    st.write("")
    response_metrics_col = category_dict.get("Response Metrics", [])
    if len(response_metrics_col) == 0:
        st.warning("Please select Response Metrics column", icon="⚠️")
        st.stop()
    # elif len(response_metrics_col) > 1:
    #     st.warning("Please select only one Response Metrics column", icon="⚠️")
    #     st.stop()

    # Store the final dataframe and bin dictionary in session state
    st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict

    # Save the DataFrame and dictionary from the session state to the pickle file
    if st.button("Accept and Save", use_container_width=True):
        save_to_pickle(
            "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
        )
        st.toast("💾 Saved Successfully!")