BlendMMM committed on
Commit 7275079 · verified · 1 Parent(s): afc7982

Delete pages/2_Transformations.py

Files changed (1)
pages/2_Transformations.py +0 -522
pages/2_Transformations.py DELETED
@@ -1,522 +0,0 @@
# Importing necessary libraries
import streamlit as st

st.set_page_config(
    page_title="Transformations",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state="collapsed",
)

import pickle
import numpy as np
import pandas as pd
from utilities import set_header, load_local_css
import streamlit_authenticator as stauth
import yaml
from yaml import SafeLoader

load_local_css("styles.css")
set_header()

# Persist session state across page switches
for k, v in st.session_state.items():
    if k not in ["logout", "login", "config"] and not k.startswith(
        "FormSubmitter"
    ):
        st.session_state[k] = v

# Check for authentication status
with open("config.yaml") as file:
    config = yaml.load(file, Loader=SafeLoader)
st.session_state["config"] = config
authenticator = stauth.Authenticate(
    config["credentials"],
    config["cookie"]["name"],
    config["cookie"]["key"],
    config["cookie"]["expiry_days"],
    config["preauthorized"],
)
st.session_state["authenticator"] = authenticator
name, authentication_status, username = authenticator.login("Login", "main")
auth_status = st.session_state.get("authentication_status")

if auth_status:
    authenticator.logout("Logout", "main")
    is_state_initialized = st.session_state.get("initialized", False)

    if not is_state_initialized:
        if "session_name" not in st.session_state:
            st.session_state["session_name"] = None

    # Deserialize and load the objects from the pickle file
    with open("data_import.pkl", "rb") as f:
        data = pickle.load(f)

    # Accessing the loaded objects
    final_df_loaded = data["final_df"]
    bin_dict_loaded = data["bin_dict"]

    # Initialize session state
    if "transformed_columns_dict" not in st.session_state:
        st.session_state["transformed_columns_dict"] = {}  # Default empty dictionary

    if "final_df" not in st.session_state:
        st.session_state["final_df"] = final_df_loaded  # Default as original dataframe

    if "summary_string" not in st.session_state:
        st.session_state["summary_string"] = None  # Default as None

    # Extract original columns for specified categories
    original_columns = {
        category: bin_dict_loaded[category]
        for category in ["Media", "Internal", "Exogenous"]
        if category in bin_dict_loaded
    }

    # Retrieve panel columns
    panel_1 = bin_dict_loaded.get("Panel Level 1")
    panel_2 = bin_dict_loaded.get("Panel Level 2")

    # # For testing on non panel level
    # final_df_loaded = final_df_loaded.drop("Panel_1", axis=1)
    # final_df_loaded = final_df_loaded.groupby("date").mean().reset_index()
    # panel_1 = None

    # Apply transformations on panel level
    st.write("")
    if panel_1:
        panel = panel_1 + panel_2 if panel_2 else panel_1
    else:
        panel = []

    # Function to build transformation widgets
    def transformation_widgets(category, transform_params, date_granularity):
        # Transformation options
        transformation_options = {
            "Media": ["Lag", "Moving Average", "Saturation", "Power", "Adstock"],
            "Internal": ["Lead", "Lag", "Moving Average"],
            "Exogenous": ["Lead", "Lag", "Moving Average"],
        }

        with st.expander(f"{category} Transformations"):
            # Let users select which transformations to apply
            transformations_to_apply = st.multiselect(
                "Select transformations to apply",
                options=transformation_options[category],
                default=[],
                key=f"transformation_{category}",
            )

            # Determine the number of transformations to put in each column
            transformations_per_column = (
                len(transformations_to_apply) // 2 + len(transformations_to_apply) % 2
            )

            # Create two columns
            col1, col2 = st.columns(2)

            # Assign transformations to each column
            transformations_col1 = transformations_to_apply[:transformations_per_column]
            transformations_col2 = transformations_to_apply[transformations_per_column:]

            # Define a helper function to create widgets for each transformation
            def create_transformation_widgets(column, transformations):
                with column:
                    for transformation in transformations:
                        # Conditionally create widgets for selected transformations
                        if transformation == "Lead":
                            st.markdown(f"**Lead ({date_granularity})**")
                            lead = st.slider(
                                "Lead periods",
                                1,
                                10,
                                (1, 2),
                                1,
                                key=f"lead_{category}",
                                label_visibility="collapsed",
                            )
                            start = lead[0]
                            end = lead[1]
                            step = 1
                            transform_params[category]["Lead"] = np.arange(
                                start, end + step, step
                            )

                        if transformation == "Lag":
                            st.markdown(f"**Lag ({date_granularity})**")
                            lag = st.slider(
                                "Lag periods",
                                1,
                                10,
                                (1, 2),
                                1,
                                key=f"lag_{category}",
                                label_visibility="collapsed",
                            )
                            start = lag[0]
                            end = lag[1]
                            step = 1
                            transform_params[category]["Lag"] = np.arange(
                                start, end + step, step
                            )

                        if transformation == "Moving Average":
                            st.markdown(f"**Moving Average ({date_granularity})**")
                            window = st.slider(
                                "Window size for Moving Average",
                                1,
                                10,
                                (1, 2),
                                1,
                                key=f"ma_{category}",
                                label_visibility="collapsed",
                            )
                            start = window[0]
                            end = window[1]
                            step = 1
                            transform_params[category]["Moving Average"] = np.arange(
                                start, end + step, step
                            )

                        if transformation == "Saturation":
                            st.markdown("**Saturation (%)**")
                            saturation_point = st.slider(
                                "Saturation Percentage",
                                0,
                                100,
                                (10, 20),
                                10,
                                key=f"sat_{category}",
                                label_visibility="collapsed",
                            )
                            start = saturation_point[0]
                            end = saturation_point[1]
                            step = 10
                            transform_params[category]["Saturation"] = np.arange(
                                start, end + step, step
                            )

                        if transformation == "Power":
                            st.markdown("**Power**")
                            power = st.slider(
                                "Power",
                                0,
                                10,
                                (2, 4),
                                1,
                                key=f"power_{category}",
                                label_visibility="collapsed",
                            )
                            start = power[0]
                            end = power[1]
                            step = 1
                            transform_params[category]["Power"] = np.arange(
                                start, end + step, step
                            )

                        if transformation == "Adstock":
                            st.markdown("**Adstock**")
                            rate = st.slider(
                                f"Factor ({category})",
                                0.0,
                                1.0,
                                (0.5, 0.7),
                                0.05,
                                key=f"adstock_{category}",
                                label_visibility="collapsed",
                            )
                            start = rate[0]
                            end = rate[1]
                            step = 0.05
                            adstock_range = [
                                round(a, 3) for a in np.arange(start, end + step, step)
                            ]
                            transform_params[category]["Adstock"] = adstock_range

            # Create widgets in each column
            create_transformation_widgets(col1, transformations_col1)
            create_transformation_widgets(col2, transformations_col2)

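    # For illustration (hypothetical selection): choosing "Lag" over the range
    # (1, 2) and "Adstock" over (0.5, 0.6) for Media would leave transform_params
    # as roughly {"Media": {"Lag": array([1, 2]), "Adstock": [0.5, 0.55, 0.6]}, ...}
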
    # Function to apply Lag transformation
    def apply_lag(df, lag):
        return df.shift(lag)

    # Function to apply Lead transformation
    def apply_lead(df, lead):
        return df.shift(-lead)

    # Function to apply Moving Average transformation
    def apply_moving_average(df, window_size):
        return df.rolling(window=window_size).mean()

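    # Quick sanity check of the shift-based helpers (illustrative values only):
    #   apply_lag(pd.Series([1, 2, 3]), 1)            -> [NaN, 1.0, 2.0]
    #   apply_lead(pd.Series([1, 2, 3]), 1)           -> [2.0, 3.0, NaN]
    #   apply_moving_average(pd.Series([1, 2, 3]), 2) -> [NaN, 1.5, 2.5]
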
    # Function to apply Saturation transformation
    def apply_saturation(df, saturation_percent_100):
        # Convert saturation percentage from 100-based to fraction
        saturation_percent = saturation_percent_100 / 100.0

        # Calculate saturation point and steepness
        column_max = df.max()
        column_min = df.min()
        saturation_point = (column_min + column_max) / 2

        numerator = np.log(
            (1 / (saturation_percent if saturation_percent != 1 else 1 - 1e-9)) - 1
        )
        denominator = np.log(saturation_point / max(column_max, 1e-9))

        # Avoid division by zero while preserving the sign of the denominator
        # (clamping with max() alone would flip negative denominators)
        if abs(denominator) < 1e-9:
            denominator = -1e-9
        steepness = numerator / denominator

        # Apply the saturation transformation
        transformed_series = df.apply(
            lambda x: (1 / (1 + (saturation_point / x) ** steepness)) * x
        )

        return transformed_series

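    # As parameterised above, the logistic multiplier
    # 1 / (1 + (saturation_point / x) ** steepness) works out to 0.5 at the
    # midpoint of the column's range and to saturation_percent at the column
    # maximum, so a higher saturation percentage dampens the top of the curve less.
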
    # Function to apply Power transformation
    def apply_power(df, power):
        return df**power

    # Function to apply Adstock transformation
    def apply_adstock(df, factor):
        x = 0
        # Use the walrus operator to update x iteratively with the Adstock formula
        adstock_var = [x := x * factor + v for v in df]
        ans = pd.Series(adstock_var, index=df.index)
        return ans

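    # Worked example of the geometric decay (illustrative values): with a
    # carryover factor of 0.5, a single burst of 100 decays by half each period:
    #   apply_adstock(pd.Series([100, 0, 0, 0]), 0.5) -> [100.0, 50.0, 25.0, 12.5]
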
    # Function to generate transformed column names
    @st.cache_resource(show_spinner=False)
    def generate_transformed_columns(original_columns, transform_params):
        transformed_columns, summary = {}, {}

        for category, columns in original_columns.items():
            for column in columns:
                transformed_columns[column] = []
                # List to hold transformation details for the current column
                summary_details = []

                if category in transform_params:
                    for transformation, values in transform_params[category].items():
                        # Generate transformed column names for each value
                        for value in values:
                            transformed_name = f"{column}@{transformation}_{value}"
                            transformed_columns[column].append(transformed_name)

                        # Format the values list as a string with commas and "and" before the last item
                        if len(values) > 1:
                            formatted_values = (
                                ", ".join(map(str, values[:-1])) + " and " + str(values[-1])
                            )
                        else:
                            formatted_values = str(values[0])

                        # Add transformation details
                        summary_details.append(f"{transformation} ({formatted_values})")

                # Only add to summary if there are transformation details for the column
                if summary_details:
                    formatted_summary = "⮕ ".join(summary_details)
                    # Use <strong> tags to make the column name bold
                    summary[column] = f"<strong>{column}</strong>: {formatted_summary}"

        # Generate a comprehensive summary string for all columns
        summary_items = [
            f"{idx + 1}. {details}" for idx, details in enumerate(summary.values())
        ]

        summary_string = "\n".join(summary_items)

        return transformed_columns, summary_string

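    # Naming convention produced above (for a hypothetical column "tv_spend"):
    # applying Adstock over [0.5, 0.55] yields "tv_spend@Adstock_0.5" and
    # "tv_spend@Adstock_0.55", with "@" separating the source column from the
    # transformation name and its parameter value.
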
    # Function to apply transformations to DataFrame slices based on specified categories and parameters
    @st.cache_resource(show_spinner=False)
    def apply_category_transformations(df, bin_dict, transform_params, panel):
        # Dictionary for function mapping
        transformation_functions = {
            "Lead": apply_lead,
            "Lag": apply_lag,
            "Moving Average": apply_moving_average,
            "Saturation": apply_saturation,
            "Power": apply_power,
            "Adstock": apply_adstock,
        }

        # Initialize category_df as an empty DataFrame
        category_df = pd.DataFrame()

        # Iterate through each category specified in transform_params
        for category in ["Media", "Internal", "Exogenous"]:
            if (
                category not in transform_params
                or category not in bin_dict
                or not transform_params[category]
            ):
                continue  # Skip categories without transformations

            # Slice the DataFrame based on the columns specified in bin_dict for the current category
            df_slice = df[bin_dict[category] + panel]

            # Iterate through each transformation and its parameters for the current category
            for transformation, parameters in transform_params[category].items():
                transformation_function = transformation_functions[transformation]

                # Check if there is panel data to group by
                if len(panel) > 0:
                    # Apply the transformation to each group
                    category_df = pd.concat(
                        [
                            df_slice.groupby(panel)
                            .transform(transformation_function, p)
                            .add_suffix(f"@{transformation}_{p}")
                            for p in parameters
                        ],
                        axis=1,
                    )

                    # Replace all NaN or null values in category_df with 0
                    category_df.fillna(0, inplace=True)

                    # Update df_slice
                    df_slice = pd.concat(
                        [df[panel], category_df],
                        axis=1,
                    )

                else:
                    for p in parameters:
                        # Apply the transformation function to each column
                        temp_df = df_slice.apply(
                            lambda x: transformation_function(x, p), axis=0
                        ).rename(lambda x: f"{x}@{transformation}_{p}", axis="columns")
                        # Concatenate the transformed DataFrame slice to the category DataFrame
                        category_df = pd.concat([category_df, temp_df], axis=1)

                    # Replace all NaN or null values in category_df with 0
                    category_df.fillna(0, inplace=True)

                    # Update df_slice
                    df_slice = pd.concat(
                        [df[panel], category_df],
                        axis=1,
                    )

        # If transformations were applied, concatenate category_df with the original DataFrame
        if not category_df.empty:
            final_df = pd.concat([df, category_df], axis=1)
        else:
            # If no transformations were applied, use the original DataFrame
            final_df = df

        return final_df

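    # Note on chaining: because df_slice is rebuilt from category_df after each
    # transformation, a second selected transformation is applied on top of the
    # columns produced by the first (e.g. Adstock of a lagged column), not on
    # the raw series.
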
    # Function to infer the granularity of the date column in a DataFrame
    @st.cache_resource(show_spinner=False)
    def infer_date_granularity(df):
        # Find the most common difference between consecutive dates
        common_freq = pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]

        # Map the most common difference to a granularity
        if common_freq == 1:
            return "daily"
        elif common_freq == 7:
            return "weekly"
        elif 28 <= common_freq <= 31:
            return "monthly"
        else:
            return "irregular"

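    # Example: a date column sampled every 7 days has a modal difference of 7,
    # so infer_date_granularity returns "weekly".
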
    #########################################################################################################################################################
    # User input for transformations
    #########################################################################################################################################################

    # Infer date granularity
    date_granularity = infer_date_granularity(final_df_loaded)

    # Initialize the main dictionary to store the transformation parameters for each category
    transform_params = {"Media": {}, "Internal": {}, "Exogenous": {}}

    # User input for transformations
    st.markdown("### Select Transformations to Apply")
    for category in ["Media", "Internal", "Exogenous"]:
        # Skip Internal
        if category == "Internal":
            continue

        transformation_widgets(category, transform_params, date_granularity)

    #########################################################################################################################################################
    # Apply transformations
    #########################################################################################################################################################

    # Apply category-based transformations to the DataFrame
    if st.button("Accept and Proceed", use_container_width=True):
        with st.spinner("Applying transformations..."):
            final_df = apply_category_transformations(
                final_df_loaded, bin_dict_loaded, transform_params, panel
            )

            # Generate a dictionary mapping original column names to lists of transformed column names
            transformed_columns_dict, summary_string = generate_transformed_columns(
                original_columns, transform_params
            )

            # Store the transformed DataFrame, column mapping, and summary in session state
            st.session_state["final_df"] = final_df
            st.session_state["transformed_columns_dict"] = transformed_columns_dict
            st.session_state["summary_string"] = summary_string

    #########################################################################################################################################################
    # Display the transformed DataFrame and summary
    #########################################################################################################################################################

    # Display the transformed DataFrame in the Streamlit app
    st.markdown("### Transformed DataFrame")
    st.dataframe(st.session_state["final_df"], hide_index=True)

    # Total rows and columns
    total_rows, total_columns = st.session_state["final_df"].shape
    st.markdown(
        f"<p style='text-align: justify;'>The transformed DataFrame contains <strong>{total_rows}</strong> rows and <strong>{total_columns}</strong> columns.</p>",
        unsafe_allow_html=True,
    )

    # Display the summary of transformations as markdown
    if st.session_state["summary_string"]:
        with st.expander("Summary of Transformations"):
            st.markdown("### Summary of Transformations")
            st.markdown(st.session_state["summary_string"], unsafe_allow_html=True)

    @st.cache_resource(show_spinner=False)
    def save_to_pickle(file_path, final_df):
        # Open the file in write-binary mode and dump the objects
        with open(file_path, "wb") as f:
            pickle.dump({"final_df_transformed": final_df}, f)
        # Data is now saved to file

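    # Reading the saved object back later mirrors the data_import.pkl load above:
    #   with open("final_df_transformed.pkl", "rb") as f:
    #       final_df_transformed = pickle.load(f)["final_df_transformed"]
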
    if st.button("Accept and Save", use_container_width=True):
        save_to_pickle("final_df_transformed.pkl", st.session_state["final_df"])
        st.toast("💾 Saved Successfully!")