BlendMMM committed
Commit 4043b0d · verified · 1 parent: 646c087

Delete Data_Import.py

Files changed (1)
  1. Data_Import.py +0 -851
Data_Import.py DELETED
@@ -1,851 +0,0 @@
# Importing necessary libraries
import streamlit as st

st.set_page_config(
    page_title="Model Build",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state="collapsed",
)

import pickle
import numpy as np
import pandas as pd
from utilities import set_header, load_local_css, load_authenticator

load_local_css("styles.css")
set_header()

authenticator = st.session_state.get("authenticator")
if authenticator is None:
    authenticator = load_authenticator()

name, authentication_status, username = authenticator.login("Login", "main")
auth_status = st.session_state.get("authentication_status")

# Check the authentication status; halt if the user is not authenticated
if auth_status is not True:
    st.stop()


# Function to validate the date column in a dataframe
def validate_date_column(df):
    try:
        # Attempt to convert the 'date' column to datetime
        df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
        return True
    except Exception:
        return False


# Function to determine data interval
def determine_data_interval(common_freq):
    if common_freq == 1:
        return "daily"
    elif common_freq == 7:
        return "weekly"
    elif 28 <= common_freq <= 31:
        return "monthly"
    else:
        return "irregular"


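# Illustrative sketch (not part of the original file): the common_freq argument
# is the modal gap, in days, between consecutive unique dates, e.g.:
#
#     dates = pd.Series(pd.date_range("2024-01-01", periods=5, freq="7D"))
#     common_freq = dates.diff().dt.days.dropna().mode()[0]  # -> 7.0
#     determine_data_interval(common_freq)                   # -> "weekly"

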
# Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
@st.cache_resource(show_spinner=False)
def files_to_dataframes(uploaded_files):
    df_dict = {}
    for uploaded_file in uploaded_files:
        # Extract file name without extension
        file_name = uploaded_file.name.rsplit(".", 1)[0]

        # Check for duplicate file names
        if file_name in df_dict:
            st.warning(
                f"Duplicate File: {file_name}. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Read the file into a DataFrame
        df = pd.read_excel(uploaded_file)

        # Convert all column names to lowercase and strip whitespace
        df.columns = df.columns.str.lower().str.strip()

        # Separate numeric and non-numeric columns
        numeric_cols = list(df.select_dtypes(include=["number"]).columns)
        non_numeric_cols = [
            col
            for col in df.select_dtypes(exclude=["number"]).columns
            if col.lower() != "date"
        ]

        # Check for a valid 'date' column and at least one numeric column
        if not (validate_date_column(df) and len(numeric_cols) > 0):
            st.warning(
                f"File Name: {file_name} ➜ Please upload data with a Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Find the most common gap (in days) between consecutive unique dates
        common_freq = (
            pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
        )
        # Calculate the data interval (daily, weekly, monthly or irregular)
        interval = determine_data_interval(common_freq)
        if interval == "irregular":
            st.warning(
                f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly intervals. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Store the DataFrame and its metadata in the dictionary under the file name
        df_dict[file_name] = {
            "numeric": numeric_cols,
            "non_numeric": non_numeric_cols,
            "interval": interval,
            "df": df,
        }

    return df_dict


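# Shape of the dictionary returned above (illustrative file and column names):
#
#     {
#         "tv_spend": {
#             "numeric": ["spend", "impressions"],
#             "non_numeric": ["region"],
#             "interval": "weekly",
#             "df": <DataFrame>,
#         },
#         ...
#     }

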
# Function to adjust dataframe granularity
def adjust_dataframe_granularity(df, current_granularity, target_granularity):
    # Set index
    df.set_index("date", inplace=True)

    # Define aggregation rules for resampling: sum numeric columns, keep the first value otherwise
    aggregation_rules = {
        col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
        for col in df.columns
    }

    # Initialize resampled_df
    resampled_df = df
    if current_granularity == "daily" and target_granularity == "weekly":
        resampled_df = df.resample("W-MON", closed="left", label="left").agg(
            aggregation_rules
        )

    elif current_granularity == "daily" and target_granularity == "monthly":
        resampled_df = df.resample("MS", closed="left", label="left").agg(
            aggregation_rules
        )

    elif current_granularity == "daily" and target_granularity == "daily":
        resampled_df = df.resample("D").agg(aggregation_rules)

    elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
        # For higher to lower granularity, distribute numeric values equally and replicate non-numeric values across the new period
        expanded_data = []
        for _, row in df.iterrows():
            if current_granularity == "weekly":
                period_range = pd.date_range(start=row.name, periods=7)
            elif current_granularity == "monthly":
                period_range = pd.date_range(
                    start=row.name, periods=row.name.days_in_month
                )

            for date in period_range:
                new_row = {}
                for col in df.columns:
                    if pd.api.types.is_numeric_dtype(df[col]):
                        if current_granularity == "weekly":
                            new_row[col] = row[col] / 7
                        elif current_granularity == "monthly":
                            new_row[col] = row[col] / row.name.days_in_month
                    else:
                        new_row[col] = row[col]
                expanded_data.append((date, new_row))

        resampled_df = pd.DataFrame(
            [data for _, data in expanded_data],
            index=[date for date, _ in expanded_data],
        )

    # Reset index
    resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})

    return resampled_df


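# Illustrative use (not part of the original file): a single weekly row is
# expanded into seven daily rows with numeric values split evenly, e.g.:
#
#     weekly = pd.DataFrame({"date": pd.to_datetime(["2024-01-01"]), "spend": [70.0]})
#     daily = adjust_dataframe_granularity(weekly.copy(), "weekly", "daily")
#     # daily has 7 rows (01 Jan through 07 Jan), each with spend == 10.0

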
# Function to clean and extract unique values of Panel_1 and Panel_2
@st.cache_resource(show_spinner=False)
def clean_and_extract_unique_values(files_dict, selections):
    all_panel1_values = set()
    all_panel2_values = set()

    for file_name, file_data in files_dict.items():
        df = file_data["df"]

        # 'Panel_1' and 'Panel_2' selections
        selected_panel1 = selections[file_name].get("Panel_1")
        selected_panel2 = selections[file_name].get("Panel_2")

        # Clean and standardize the Panel_1 column if it exists and is selected
        if (
            selected_panel1
            and selected_panel1 != "N/A"
            and selected_panel1 in df.columns
        ):
            df[selected_panel1] = (
                df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
            )
            all_panel1_values.update(df[selected_panel1].dropna().unique())

        # Clean and standardize the Panel_2 column if it exists and is selected
        if (
            selected_panel2
            and selected_panel2 != "N/A"
            and selected_panel2 in df.columns
        ):
            df[selected_panel2] = (
                df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
            )
            all_panel2_values.update(df[selected_panel2].dropna().unique())

        # Update the processed DataFrame back in the dictionary
        files_dict[file_name]["df"] = df

    return all_panel1_values, all_panel2_values


# Function to format values for display
@st.cache_resource(show_spinner=False)
def format_values_for_display(values_list):
    # Capitalize the first letter of each word and replace underscores with spaces
    formatted_list = [value.replace("_", " ").title() for value in values_list]
    # Join values with commas and 'and' before the last value
    if len(formatted_list) > 1:
        return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
    elif formatted_list:
        return formatted_list[0]
    return "No values available"


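# Example of the display formatting above (illustrative values):
#
#     format_values_for_display(["north_region", "south_region", "east_region"])
#     # -> "North Region, South Region, and East Region"

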
# Function to normalize all data within files_dict to a daily granularity
@st.cache_resource(show_spinner=False)
def standardize_data_to_daily(files_dict, selections):
    # Normalize all data to a daily granularity using a provided function
    files_dict = apply_granularity_to_all(files_dict, "daily", selections)

    # Update the "interval" attribute for each dataset to indicate the new granularity
    for file_name, file_data in files_dict.items():
        file_data["interval"] = "daily"

    return files_dict


# Function to apply granularity transformation to all DataFrames in files_dict
@st.cache_resource(show_spinner=False)
def apply_granularity_to_all(files_dict, granularity_selection, selections):
    for file_name, file_data in files_dict.items():
        df = file_data["df"].copy()

        # Handle cases where Panel_1 or Panel_2 might be 'N/A'
        selected_panel1 = selections[file_name].get("Panel_1")
        selected_panel2 = selections[file_name].get("Panel_2")

        # Build the set of unique panel combinations, handling 'N/A'
        if selected_panel1 != "N/A" and selected_panel2 != "N/A":
            unique_combinations = df[
                [selected_panel1, selected_panel2]
            ].drop_duplicates()
        elif selected_panel1 != "N/A":
            unique_combinations = df[[selected_panel1]].drop_duplicates()
            selected_panel2 = None  # Ensure Panel_2 is ignored if N/A
        elif selected_panel2 != "N/A":
            unique_combinations = df[[selected_panel2]].drop_duplicates()
            selected_panel1 = None  # Ensure Panel_1 is ignored if N/A
        else:
            # If both are 'N/A', process the entire dataframe as is
            df = adjust_dataframe_granularity(
                df, file_data["interval"], granularity_selection
            )
            files_dict[file_name]["df"] = df
            continue  # Skip to the next file

        # Transform each panel segment separately, then recombine
        transformed_segments = []
        for _, combo in unique_combinations.iterrows():
            if selected_panel1 and selected_panel2:
                segment = df[
                    (df[selected_panel1] == combo[selected_panel1])
                    & (df[selected_panel2] == combo[selected_panel2])
                ]
            elif selected_panel1:
                segment = df[df[selected_panel1] == combo[selected_panel1]]
            elif selected_panel2:
                segment = df[df[selected_panel2] == combo[selected_panel2]]

            # Adjust granularity of the segment
            transformed_segment = adjust_dataframe_granularity(
                segment, file_data["interval"], granularity_selection
            )
            transformed_segments.append(transformed_segment)

        # Combine all transformed segments into a single DataFrame for this file
        transformed_df = pd.concat(transformed_segments, ignore_index=True)
        files_dict[file_name]["df"] = transformed_df

    return files_dict


# Function to create the main dataframe structure
@st.cache_resource(show_spinner=False)
def create_main_dataframe(
    files_dict, all_panel1_values, all_panel2_values, granularity_selection
):
    # Determine the global start and end dates across all DataFrames
    global_start = min(df["df"]["date"].min() for df in files_dict.values())
    global_end = max(df["df"]["date"].max() for df in files_dict.values())

    # Adjust the date_range generation based on the granularity_selection
    if granularity_selection == "weekly":
        # Generate a weekly range, with weeks starting on Monday
        date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
    elif granularity_selection == "monthly":
        # Generate a monthly range, starting from the first day of each month
        date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
    else:  # Default to daily if not weekly or monthly
        date_range = pd.date_range(start=global_start, end=global_end, freq="D")

    # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
    all_panel1s = all_panel1_values
    all_panel2s = all_panel2_values

    # Dynamically build the list of dimensions (Panel_1, Panel_2) to include in the main DataFrame based on availability
    dimensions, merge_keys = [], []
    if all_panel1s:
        dimensions.append(all_panel1s)
        merge_keys.append("Panel_1")
    if all_panel2s:
        dimensions.append(all_panel2s)
        merge_keys.append("Panel_2")

    dimensions.append(date_range)  # Date range is always included
    merge_keys.append("date")  # Date range is always included

    # Create a main DataFrame template with the dimensions
    main_df = pd.MultiIndex.from_product(
        dimensions,
        names=[name for name, _ in zip(merge_keys, dimensions)],
    ).to_frame(index=False)

    return main_df.reset_index(drop=True)


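# Illustrative sketch (not part of the original file): the scaffold produced
# above is the Cartesian product of panel values and dates. For two panels and
# three weekly dates, pd.MultiIndex.from_product yields a 6-row frame:
#
#     dims = [{"north", "south"}, pd.date_range("2024-01-01", periods=3, freq="W-MON")]
#     scaffold = pd.MultiIndex.from_product(dims, names=["Panel_1", "date"]).to_frame(index=False)
#     # 2 panels x 3 dates -> 6 rows, one slot for every (panel, date) pair

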
# Function to prepare and merge DataFrames
@st.cache_resource(show_spinner=False)
def merge_into_main_df(main_df, files_dict, selections):
    for file_name, file_data in files_dict.items():
        df = file_data["df"].copy()

        # Rename the selected Panel_1 and Panel_2 columns if not 'N/A'
        selected_panel1 = selections[file_name].get("Panel_1", "N/A")
        selected_panel2 = selections[file_name].get("Panel_2", "N/A")
        if selected_panel1 != "N/A":
            df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
        if selected_panel2 != "N/A":
            df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)

        # Merge the current DataFrame into main_df based on 'date', and where applicable, 'Panel_1' and 'Panel_2'
        merge_keys = ["date"]
        if "Panel_1" in df.columns:
            merge_keys.append("Panel_1")
        if "Panel_2" in df.columns:
            merge_keys.append("Panel_2")
        main_df = pd.merge(main_df, df, on=merge_keys, how="left")

    # After all merges, sort by 'date' and reset the index for cleanliness
    sort_by = ["date"]
    if "Panel_1" in main_df.columns:
        sort_by.append("Panel_1")
    if "Panel_2" in main_df.columns:
        sort_by.append("Panel_2")
    main_df.sort_values(by=sort_by, inplace=True)
    main_df.reset_index(drop=True, inplace=True)

    return main_df


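# Illustrative sketch (not part of the original file): each file is left-joined
# onto the scaffold, so dates a file does not cover simply stay NaN, e.g.:
#
#     scaffold = pd.DataFrame({"date": pd.date_range("2024-01-01", periods=3)})
#     spend = pd.DataFrame({"date": pd.to_datetime(["2024-01-02"]), "spend": [100]})
#     pd.merge(scaffold, spend, on=["date"], how="left")  # spend is NaN except on 2024-01-02

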
# Function to categorize a column by keywords in its name
def categorize_column(column_name):
    # Define keywords for each category
    internal_keywords = [
        "Price",
        "Discount",
        "product_price",
        "cost",
        "margin",
        "inventory",
        "sales",
        "revenue",
        "turnover",
        "expense",
    ]
    exogenous_keywords = [
        "GDP",
        "Tax",
        "Inflation",
        "interest_rate",
        "employment_rate",
        "exchange_rate",
        "consumer_spending",
        "retail_sales",
        "oil_prices",
        "weather",
    ]

    # Check if the column name matches any of the keywords for the Internal or Exogenous categories
    for keyword in internal_keywords:
        if keyword.lower() in column_name.lower():
            return "Internal"
    for keyword in exogenous_keywords:
        if keyword.lower() in column_name.lower():
            return "Exogenous"

    # Default to Media if no match is found
    return "Media"


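# Example of the keyword matching above (illustrative column names):
#
#     categorize_column("product_price_eur")   # -> "Internal" ("price" matches)
#     categorize_column("gdp_growth_rate")     # -> "Exogenous" ("gdp" matches)
#     categorize_column("tv_impressions")      # -> "Media" (no keyword match)

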
# Function to calculate missing stats and prepare an editable DataFrame
@st.cache_resource(show_spinner=False)
def prepare_missing_stats_df(df):
    missing_stats = []
    for column in df.columns:
        if (
            column == "date" or column == "Panel_2" or column == "Panel_1"
        ):  # Skip the date, Panel_1 and Panel_2 columns
            continue

        missing = df[column].isnull().sum()
        pct_missing = round((missing / len(df)) * 100, 2)

        # Dynamically assign a category based on the column name
        category = categorize_column(column)
        # category = "Media"  # Keep default bin as Media

        missing_stats.append(
            {
                "Column": column,
                "Missing Values": missing,
                "Missing Percentage": pct_missing,
                "Impute Method": "Fill with 0",  # Default value
                "Category": category,
            }
        )
    stats_df = pd.DataFrame(missing_stats)

    return stats_df


# Function to add the API DataFrame details to the files dictionary
@st.cache_resource(show_spinner=False)
def add_api_dataframe_to_dict(main_df, files_dict):
    files_dict["API"] = {
        "numeric": list(main_df.select_dtypes(include=["number"]).columns),
        "non_numeric": [
            col
            for col in main_df.select_dtypes(exclude=["number"]).columns
            if col.lower() != "date"
        ],
        "interval": determine_data_interval(
            pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
        ),
        "df": main_df,
    }

    return files_dict


# Function to read the API data into a DataFrame, parsing specified columns as datetime
@st.cache_resource(show_spinner=False)
def read_API_data():
    return pd.read_excel("upf_data_converted.xlsx", parse_dates=["Date"])


# Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
def set_Panel_1_Panel_2_Selected_false():
    st.session_state["Panel_1_Panel_2_Selected"] = False


# Function to serialize and save the objects into a pickle file
@st.cache_resource(show_spinner=False)
def save_to_pickle(file_path, final_df, bin_dict):
    # Open the file in write-binary mode and dump the objects
    with open(file_path, "wb") as f:
        pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
    # Data is now saved to file


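# Illustrative counterpart (not part of the original file): downstream pages
# can restore the saved objects with the standard pickle API:
#
#     with open("data_import.pkl", "rb") as f:
#         saved = pickle.load(f)
#     final_df, bin_dict = saved["final_df"], saved["bin_dict"]

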
# Initialize 'final_df' in session state
if "final_df" not in st.session_state:
    st.session_state["final_df"] = pd.DataFrame()

# Initialize 'bin_dict' in session state
if "bin_dict" not in st.session_state:
    st.session_state["bin_dict"] = {}

# Initialize 'Panel_1_Panel_2_Selected' in session state
if "Panel_1_Panel_2_Selected" not in st.session_state:
    st.session_state["Panel_1_Panel_2_Selected"] = False

# Page Title
st.write("")  # Top padding
st.title("Data Import")


#########################################################################################################################################################
# Create a dictionary to hold all DataFrames and collect user input to specify "Panel_2" and "Panel_1" columns for each file
#########################################################################################################################################################


# Read the Excel file, parsing the 'Date' column as datetime
main_df = read_API_data()

# Convert all column names to lowercase
main_df.columns = main_df.columns.str.lower().str.strip()

# File uploader
uploaded_files = st.file_uploader(
    "Upload additional data",
    type=["xlsx"],
    accept_multiple_files=True,
    on_change=set_Panel_1_Panel_2_Selected_false,
)

# Custom HTML for upload instructions
recommendation_html = """
<div style="text-align: justify;">
    <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets, including panel, media, internal, and exogenous data, adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
</div>
"""
st.markdown(recommendation_html, unsafe_allow_html=True)

# Choose Desired Granularity
st.markdown("#### Choose Desired Granularity")
# Granularity Selection
granularity_selection = st.selectbox(
    "Choose Date Granularity",
    ["Daily", "Weekly", "Monthly"],
    label_visibility="collapsed",
    on_change=set_Panel_1_Panel_2_Selected_false,
)
granularity_selection = str(granularity_selection).lower()

# Convert files to dataframes
files_dict = files_to_dataframes(uploaded_files)

# Add API Dataframe
if main_df is not None:
    files_dict = add_api_dataframe_to_dict(main_df, files_dict)

# Display a warning message if no files have been uploaded and halt further execution
if not files_dict:
    st.warning(
        "Please upload at least one file to proceed.",
        icon="⚠️",
    )
    st.stop()  # Halts further execution until a file is uploaded


# Select Panel_1 and Panel_2 columns
st.markdown("#### Select Panel columns")
selections = {}
with st.expander("Select Panel columns", expanded=False):
    count = 0  # Initialize counter to manage the visibility of labels and keys
    for file_name, file_data in files_dict.items():
        # Determine visibility of the label based on the count
        if count == 0:
            label_visibility = "visible"
        else:
            label_visibility = "collapsed"

        # Extract non-numeric columns
        non_numeric_cols = file_data["non_numeric"]

        # Prepare Panel_1 and Panel_2 values for the dropdowns, adding "N/A" as an option
        panel1_values = non_numeric_cols + ["N/A"]
        panel2_values = non_numeric_cols + ["N/A"]

        # Skip if only one option is available
        if len(panel1_values) == 1 and len(panel2_values) == 1:
            selected_panel1, selected_panel2 = "N/A", "N/A"
            # Update the selections for Panel_1 and Panel_2 for the current file
            selections[file_name] = {
                "Panel_1": selected_panel1,
                "Panel_2": selected_panel2,
            }
            continue

        # Create layout columns for File Name, Panel_1, and Panel_2 selections
        file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])

        with file_name_col:
            # Display the "File Name" label only for the first file
            if count == 0:
                st.write("File Name")
            else:
                st.write("")
            st.write(file_name)  # Display the file name

        with Panel_1_col:
            # Display a selectbox for Panel_1 values
            selected_panel1 = st.selectbox(
                "Select Panel Level 1",
                panel1_values,
                on_change=set_Panel_1_Panel_2_Selected_false,
                label_visibility=label_visibility,  # Control visibility of the label
                key=f"Panel_1_selectbox{count}",  # Ensure a unique key for each selectbox
            )

        with Panel_2_col:
            # Display a selectbox for Panel_2 values
            selected_panel2 = st.selectbox(
                "Select Panel Level 2",
                panel2_values,
                on_change=set_Panel_1_Panel_2_Selected_false,
                label_visibility=label_visibility,  # Control visibility of the label
                key=f"Panel_2_selectbox{count}",  # Ensure a unique key for each selectbox
            )

        # Halt processing if the same column is selected for both Panel_1 and Panel_2, to avoid data integrity issues
        if selected_panel2 == selected_panel1 and not (
            selected_panel2 == "N/A" and selected_panel1 == "N/A"
        ):
            st.warning(
                f"File: {file_name} → The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
            )
            st.stop()

        # Update the selections for Panel_1 and Panel_2 for the current file
        selections[file_name] = {
            "Panel_1": selected_panel1,
            "Panel_2": selected_panel2,
        }

        count += 1  # Increment the counter after processing each file

# Accept Panel_1 and Panel_2 selection
if st.button("Accept and Process", use_container_width=True):

    # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
    with st.spinner("Processing..."):
        files_dict = standardize_data_to_daily(files_dict, selections)

        # Convert all data to the selected level of granularity
        files_dict = apply_granularity_to_all(
            files_dict, granularity_selection, selections
        )

        st.session_state["files_dict"] = files_dict
        st.session_state["Panel_1_Panel_2_Selected"] = True


#########################################################################################################################################################
# Display unique Panel_1 and Panel_2 values
#########################################################################################################################################################


# Halt further execution until the Panel_1 and Panel_2 columns are selected
if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
    files_dict = st.session_state["files_dict"]
else:
    st.stop()

# Sets to store unique values of Panel_1 and Panel_2
with st.spinner("Fetching Panel values..."):
    all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
        files_dict, selections
    )

# Lists of unique Panel_1 and Panel_2 column values
list_of_all_panel1_values = list(all_panel1_values)
list_of_all_panel2_values = list(all_panel2_values)

# Format Panel_1 and Panel_2 values for display
formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)

# Unique Panel_1 and Panel_2 values
st.markdown("#### Unique Panel values")
# Display Panel_1 and Panel_2 values
with st.expander("Unique Panel values"):
    st.write("")
    st.markdown(
        f"""
        <style>
        .justify-text {{
            text-align: justify;
        }}
        </style>
        <div class="justify-text">
            <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
            <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
        </div>
        """,
        unsafe_allow_html=True,
    )

    # Display total Panel_1 and Panel_2 counts
    st.write("")
    st.markdown(
        f"""
        <div style="text-align: justify;">
            <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
            <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
        </div>
        """,
        unsafe_allow_html=True,
    )
    st.write("")


#########################################################################################################################################################
# Merge all DataFrames
#########################################################################################################################################################


# Merge all selected DataFrames
main_df = create_main_dataframe(
    files_dict, all_panel1_values, all_panel2_values, granularity_selection
)
merged_df = merge_into_main_df(main_df, files_dict, selections)

# # Display the merged DataFrame
# st.markdown("#### Merged DataFrame based on selected Panel_1 and Panel_2")
# st.dataframe(merged_df)


#########################################################################################################################################################
# Categorize Variables and Impute Missing Values
#########################################################################################################################################################


# Create an editable DataFrame in Streamlit
st.markdown("#### Select Variables Category & Impute Missing Values")

# Prepare missing stats DataFrame for editing
missing_stats_df = prepare_missing_stats_df(merged_df)

edited_stats_df = st.data_editor(
    missing_stats_df,
    column_config={
        "Impute Method": st.column_config.SelectboxColumn(
            options=[
                "Drop Column",
                "Fill with Mean",
                "Fill with Median",
                "Fill with 0",
            ],
            required=True,
            default="Fill with 0",
        ),
        "Category": st.column_config.SelectboxColumn(
            options=[
                "Media",
                "Exogenous",
                "Internal",
                "Response Metrics",
            ],
            required=True,
            default="Media",
        ),
    },
    disabled=["Column", "Missing Values", "Missing Percentage"],
    hide_index=True,
    use_container_width=True,
)

# Apply changes based on the edited DataFrame
for i, row in edited_stats_df.iterrows():
    column = row["Column"]
    if row["Impute Method"] == "Drop Column":
        merged_df.drop(columns=[column], inplace=True)

    elif row["Impute Method"] == "Fill with Mean":
        merged_df[column] = merged_df[column].fillna(merged_df[column].mean())

    elif row["Impute Method"] == "Fill with Median":
        merged_df[column] = merged_df[column].fillna(merged_df[column].median())

    elif row["Impute Method"] == "Fill with 0":
        merged_df[column] = merged_df[column].fillna(0)


# Display the final DataFrame and exogenous variables
st.markdown("#### Final DataFrame")
final_df = merged_df
st.dataframe(final_df, hide_index=True)

# Initialize an empty dictionary to hold categories and their variables
category_dict = {}

# Iterate over each row in the edited DataFrame to populate the dictionary
for i, row in edited_stats_df.iterrows():
    column = row["Column"]
    category = row["Category"]  # The category chosen by the user for this variable

    # Check if the category already exists in the dictionary
    if category not in category_dict:
        # If not, initialize it with the current column as its first element
        category_dict[category] = [column]
    else:
        # If it exists, append the current column to the list of variables under this category
        category_dict[category].append(column)

# Add date, Panel_1 and Panel_2 to the category dictionary
category_dict.update({"Date": ["date"]})
if "Panel_1" in final_df.columns:
    category_dict["Panel Level 1"] = ["Panel_1"]
if "Panel_2" in final_df.columns:
    category_dict["Panel Level 2"] = ["Panel_2"]

# Display the dictionary
st.markdown("#### Variable Category")
for category, variables in category_dict.items():
    # Check if there are multiple variables to handle "and" insertion correctly
    if len(variables) > 1:
        # Join all but the last variable with ", ", then add " and " before the last variable
        variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
    else:
        # If there's only one variable, no need for "and"
        variables_str = variables[0]

    # Display the category and its variables in the desired format
    st.markdown(
        f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
        unsafe_allow_html=True,
    )

# Store the final dataframe and bin dictionary in session state
st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict

# Save the DataFrame and dictionary from the session state to the pickle file
st.write("")
if st.button("Accept and Save", use_container_width=True):
    save_to_pickle(
        "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
    )
    st.toast("💾 Saved Successfully!")