Spaces:

BlendMMM
/

Mastercard

Sleeping

File size: 32,112 Bytes

a660599

# Importing necessary libraries
import streamlit as st

# NOTE(review): the page configuration is issued before the remaining imports;
# presumably this is deliberate so it runs before any other Streamlit command
# triggered by importing `utilities` — confirm before reordering.
st.set_page_config(
    page_title="Model Build",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# NOTE(review): `np` is not referenced in the visible part of this file.
import numpy as np
import pandas as pd
from utilities import set_header, load_local_css, load_authenticator
import pickle


# Apply the project stylesheet and render the shared page header
load_local_css("styles.css")
set_header()

# Reuse the authenticator stored in session state, or build one if absent
authenticator = st.session_state.get("authenticator")
if authenticator is None:
    authenticator = load_authenticator()

# The tuple returned by login() is not used below; the gate reads the
# "authentication_status" entry of session state instead.
name, authentication_status, username = authenticator.login("Login", "main")
auth_status = st.session_state.get("authentication_status")

# Check for authentication status: anything other than a value equal to True
# (including None / False) halts the script here.
if auth_status != True:
    st.stop()


# Function to validate date column in dataframe
def validate_date_column(df):
    """Parse ``df["date"]`` as ``DD-MM-YYYY`` and report whether it worked.

    On success the ``date`` column of ``df`` is replaced **in place** with the
    parsed datetime values (callers rely on this side effect).

    Args:
        df: DataFrame expected to contain a ``date`` column.

    Returns:
        True if the column exists and every value parses, otherwise False.
    """
    try:
        df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
    except (KeyError, TypeError, ValueError, OverflowError):
        # KeyError: no 'date' column; the others: values that cannot be parsed.
        # Deliberately narrower than a bare `except`, so unrelated failures
        # (e.g. KeyboardInterrupt) are no longer swallowed.
        return False
    return True


# Function to determine data interval
def determine_data_interval(common_freq):
    """Map the most common gap between dates (in days) to an interval label.

    Returns "daily" for 1, "weekly" for 7, "monthly" for 28-31 inclusive and
    "irregular" for anything else.
    """
    if 28 <= common_freq <= 31:
        return "monthly"
    exact_matches = {1: "daily", 7: "weekly"}
    return exact_matches.get(common_freq, "irregular")


# Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
# NOTE(review): the statement below calls `st.cache_resource(...)` but discards the
# returned decorator (there is no `@`), so the function defined after it is NOT cached.
st.cache_resource(show_spinner=False)


def files_to_dataframes(uploaded_files):
    """Read uploaded Excel files into a dict of per-file metadata and data.

    A file is skipped (with an on-page warning) when its name duplicates an
    already accepted file, when it lacks a parseable ``date`` column or has
    no numeric column, or when its dates are not daily/weekly/monthly.

    Args:
        uploaded_files: Iterable of uploaded file objects exposing ``.name``
            and readable by ``pd.read_excel``. ``None`` is treated as empty.

    Returns:
        dict mapping file name (without extension) to a dict with keys
        ``"numeric"``, ``"non_numeric"``, ``"interval"`` and ``"df"``.
    """
    df_dict = {}
    for uploaded_file in uploaded_files or []:
        # Extract file name without extension
        file_name = uploaded_file.name.rsplit(".", 1)[0]

        # Check for duplicate file names
        if file_name in df_dict:
            st.warning(
                f"Duplicate File: {file_name}. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Read the file into a DataFrame
        df = pd.read_excel(uploaded_file)

        # Normalise column names: lowercase, no surrounding whitespace
        df.columns = df.columns.str.lower().str.strip()

        # Separate numeric and non-numeric columns (the date column is neither)
        numeric_cols = list(df.select_dtypes(include=["number"]).columns)
        non_numeric_cols = [
            col
            for col in df.select_dtypes(exclude=["number"]).columns
            if col.lower() != "date"
        ]

        # validate_date_column also converts df["date"] to datetime in place
        if not (validate_date_column(df) and len(numeric_cols) > 0):
            st.warning(
                f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Most common gap (in days) between consecutive distinct dates.
        # Dates are sorted first so the gaps are meaningful even when rows
        # are not in chronological order.
        day_gaps = (
            pd.Series(df["date"].dropna().unique())
            .sort_values()
            .diff()
            .dt.days.dropna()
        )
        gap_modes = day_gaps.mode()

        # Fewer than two distinct dates leaves no gap to measure; treat that
        # as irregular instead of crashing on an empty mode.
        if gap_modes.empty:
            interval = "irregular"
        else:
            interval = determine_data_interval(gap_modes.iloc[0])

        if interval == "irregular":
            st.warning(
                f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Store the DataFrame together with its column split and interval
        df_dict[file_name] = {
            "numeric": numeric_cols,
            "non_numeric": non_numeric_cols,
            "interval": interval,
            "df": df,
        }

    return df_dict


# Function to adjust dataframe granularity
# def adjust_dataframe_granularity(df, current_granularity, target_granularity):
#     # Set index
#     df.set_index("date", inplace=True)

#     # Define aggregation rules for resampling
#     aggregation_rules = {
#         col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
#         for col in df.columns
#     }

#     resampled_df = df
#     if current_granularity == "daily" and target_granularity == "weekly":
#         resampled_df = df.resample("W-MON").agg(aggregation_rules)

#     elif current_granularity == "daily" and target_granularity == "monthly":
#         resampled_df = df.resample("MS").agg(aggregation_rules)

#     elif current_granularity == "daily" and target_granularity == "daily":
#         resampled_df = df.resample("D").agg(aggregation_rules)

#     elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
#         # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
#         expanded_data = []
#         for _, row in df.iterrows():
#             if current_granularity == "weekly":
#                 period_range = pd.date_range(start=row.name, periods=7)
#             elif current_granularity == "monthly":
#                 period_range = pd.date_range(
#                     start=row.name, periods=row.name.days_in_month
#                 )

#             for date in period_range:
#                 new_row = {}
#                 for col in df.columns:
#                     if pd.api.types.is_numeric_dtype(df[col]):
#                         if current_granularity == "weekly":
#                             new_row[col] = row[col] / 7
#                         elif current_granularity == "monthly":
#                             new_row[col] = row[col] / row.name.days_in_month
#                     else:
#                         new_row[col] = row[col]
#                 expanded_data.append((date, new_row))

#         resampled_df = pd.DataFrame(
#             [data for _, data in expanded_data],
#             index=[date for date, _ in expanded_data],
#         )

#     # Reset index
#     resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})

#     return resampled_df


def adjust_dataframe_granularity(df, current_granularity, target_granularity):
    """Convert a dated DataFrame from one time granularity to another.

    Supported conversions:
      * daily -> weekly  (bins start on Monday, labelled by the bin start)
      * daily -> monthly (bins start on the first day of the month)
      * daily -> daily   (re-sampled per day)
      * weekly/monthly -> daily (numeric values are split evenly over the
        7 days / days of the month; other values are repeated)
    Any other combination returns the data unchanged.

    When aggregating, numeric columns are summed and other columns keep
    their first value.

    Args:
        df: DataFrame with a datetime ``date`` column. It is not modified.
        current_granularity: "daily", "weekly" or "monthly".
        target_granularity: "daily", "weekly" or "monthly".

    Returns:
        A new DataFrame with ``date`` as a regular column.
    """
    # Work on an indexed copy so the caller's frame (often a slice of a
    # larger frame) is never mutated.
    indexed = df.set_index("date")

    # Decide once per column whether it is numeric
    numeric_columns = {
        col for col in indexed.columns if pd.api.types.is_numeric_dtype(indexed[col])
    }
    aggregation_rules = {
        col: "sum" if col in numeric_columns else "first" for col in indexed.columns
    }

    resampled_df = indexed
    if current_granularity == "daily" and target_granularity == "weekly":
        resampled_df = indexed.resample("W-MON", closed="left", label="left").agg(
            aggregation_rules
        )

    elif current_granularity == "daily" and target_granularity == "monthly":
        resampled_df = indexed.resample("MS", closed="left", label="left").agg(
            aggregation_rules
        )

    elif current_granularity == "daily" and target_granularity == "daily":
        resampled_df = indexed.resample("D").agg(aggregation_rules)

    elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
        # Coarser -> daily: spread numeric values evenly across the period
        # and repeat non-numeric values for every day of the period.
        daily_dates, daily_rows = [], []
        for period_start, row in indexed.iterrows():
            if current_granularity == "weekly":
                days_in_period = 7
            else:
                days_in_period = period_start.days_in_month

            daily_values = {
                col: row[col] / days_in_period if col in numeric_columns else row[col]
                for col in indexed.columns
            }
            for day in pd.date_range(start=period_start, periods=days_in_period):
                daily_dates.append(day)
                daily_rows.append(daily_values)

        resampled_df = pd.DataFrame(daily_rows, index=daily_dates)

    # The expanded frame has an unnamed index, which reset_index() exposes as
    # "index"; the other paths keep the index name "date".
    resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})

    return resampled_df


# Function to clean and extract unique values of DMA and Panel
# NOTE(review): the statement below calls `st.cache_resource(...)` but discards the
# returned decorator (there is no `@`), so the function defined after it is NOT cached.
st.cache_resource(show_spinner=False)


def clean_and_extract_unique_values(files_dict, selections):
    """Normalise the selected DMA/Panel columns and collect their values.

    For every file, the column chosen as "DMA" and the column chosen as
    "Panel" (when set, not "N/A" and present in the frame) is lower-cased,
    stripped and has underscores replaced by spaces. The frames stored in
    ``files_dict`` are modified in place.

    Returns:
        Tuple ``(all_dma_values, all_panel_values)`` of sets holding the
        distinct non-null cleaned values across all files.
    """
    collected = {"DMA": set(), "Panel": set()}

    for file_name, file_data in files_dict.items():
        frame = file_data["df"]
        chosen = selections[file_name]

        for role, bucket in collected.items():
            column = chosen.get(role)
            if not column or column == "N/A" or column not in frame.columns:
                continue

            # Standardise the text values of the selected column
            lowered = frame[column].str.lower()
            trimmed = lowered.str.strip()
            frame[column] = trimmed.str.replace("_", " ")
            bucket.update(frame[column].dropna().unique())

        # Write the processed DataFrame back into the dictionary
        files_dict[file_name]["df"] = frame

    return collected["DMA"], collected["Panel"]


# Function to format values for display
# NOTE(review): the statement below calls `st.cache_resource(...)` but discards the
# returned decorator (there is no `@`), so the function defined after it is NOT cached.
st.cache_resource(show_spinner=False)


def format_values_for_display(values_list):
    """Render a list of raw values as a human-readable, title-cased string.

    Underscores become spaces and each word is capitalised. Several values
    are joined with commas, with ", and " before the last one. An empty list
    yields "No values available".
    """
    titled = []
    for raw_value in values_list:
        titled.append(raw_value.replace("_", " ").title())

    if not titled:
        return "No values available"
    if len(titled) == 1:
        return titled[0]

    leading, final = titled[:-1], titled[-1]
    return ", ".join(leading) + ", and " + final


# Function to normalize all data within files_dict to a daily granularity
# NOTE(review): the statement below calls `st.cache(...)` but discards the result
# (there is no `@`), so the function defined after it is NOT cached.
# NOTE(review): `st.cache` looks like the legacy caching API — confirm it still
# exists in the deployed Streamlit version, otherwise this line raises at import.
st.cache(show_spinner=False, allow_output_mutation=True)


def standardize_data_to_daily(files_dict, selections):
    """Bring every dataset in ``files_dict`` to daily granularity.

    Delegates the conversion to ``apply_granularity_to_all`` and then marks
    each dataset's "interval" as "daily".

    Returns:
        The dictionary returned by ``apply_granularity_to_all``.
    """
    daily_files = apply_granularity_to_all(files_dict, "daily", selections)

    # Record the new granularity on every dataset
    for dataset in daily_files.values():
        dataset["interval"] = "daily"

    return daily_files


# Function to apply granularity transformation to all DataFrames in files_dict
# NOTE(review): the statement below calls `st.cache_resource(...)` but discards the
# returned decorator (there is no `@`), so the function defined after it is NOT cached.
st.cache_resource(show_spinner=False)


def apply_granularity_to_all(files_dict, granularity_selection, selections):
    """Convert every DataFrame in ``files_dict`` to the requested granularity.

    When a file has a DMA and/or Panel column selected, the conversion is done
    separately for each distinct DMA/Panel combination and the pieces are
    concatenated; otherwise the whole frame is converted at once. Rows whose
    DMA/Panel value is missing do not belong to any combination and are left
    out of the result.

    Args:
        files_dict: Mapping of file name to a dict holding at least ``"df"``
            and ``"interval"``. The ``"df"`` entries are replaced in place.
        granularity_selection: Target granularity ("daily", "weekly",
            "monthly").
        selections: Mapping of file name to ``{"DMA": ..., "Panel": ...}``;
            a missing key, ``None`` or "N/A" means "not selected".

    Returns:
        The same ``files_dict`` object, updated.
    """
    for file_name, file_data in files_dict.items():
        df = file_data["df"].copy()
        current_interval = file_data["interval"]

        # Same "N/A" default as merge_into_main_df, so a missing key is
        # treated as "not selected" rather than as a column named None.
        file_selection = selections[file_name]
        segment_keys = [
            column
            for column in (
                file_selection.get("DMA", "N/A"),
                file_selection.get("Panel", "N/A"),
            )
            if column is not None and column != "N/A"
        ]

        if not segment_keys:
            # Nothing to segment by: process the entire dataframe as is
            files_dict[file_name]["df"] = adjust_dataframe_granularity(
                df, current_interval, granularity_selection
            )
            continue

        # One pass over the data; sort=False keeps first-appearance order.
        # Each segment is copied so the helper never works on a slice.
        transformed_segments = [
            adjust_dataframe_granularity(
                segment.copy(), current_interval, granularity_selection
            )
            for _, segment in df.groupby(segment_keys, sort=False)
        ]

        if transformed_segments:
            transformed_df = pd.concat(transformed_segments, ignore_index=True)
        else:
            # No usable segment (e.g. every key value is missing): return an
            # empty, correctly shaped frame instead of failing in concat.
            transformed_df = adjust_dataframe_granularity(
                df.iloc[0:0].copy(), current_interval, granularity_selection
            )
        files_dict[file_name]["df"] = transformed_df

    return files_dict


# Function to create main dataframe structure
# NOTE(review): the statement below calls `st.cache_resource(...)` but discards the
# returned decorator (there is no `@`), so the function defined after it is NOT cached.
st.cache_resource(show_spinner=False)


def create_main_dataframe(
    files_dict, all_dma_values, all_panel_values, granularity_selection
):
    """Build the skeleton frame: every Panel x DMA x date combination.

    The date axis spans the earliest to the latest ``date`` found in any
    frame of ``files_dict`` and is stepped weekly (Mondays), monthly (first
    of month) or daily depending on ``granularity_selection``. "Panel" and
    "DMA" columns are included only when values were supplied for them.

    Args:
        files_dict: Mapping of file name to a dict holding a ``"df"`` frame
            with a datetime ``date`` column. Must not be empty.
        all_dma_values: Collection of DMA values (may be empty).
        all_panel_values: Collection of Panel values (may be empty).
        granularity_selection: "weekly", "monthly", or anything else for
            daily.

    Returns:
        DataFrame with columns (in order) "Panel", "DMA" — each only if
        present — and "date", with a fresh 0..n-1 index.
    """
    # Global start and end dates across all DataFrames
    global_start = min(entry["df"]["date"].min() for entry in files_dict.values())
    global_end = max(entry["df"]["date"].max() for entry in files_dict.values())

    # Weekly ranges start on Monday, monthly ranges on the first of the month;
    # anything else falls back to daily.
    frequency = {"weekly": "W-MON", "monthly": "MS"}.get(granularity_selection, "D")
    date_range = pd.date_range(start=global_start, end=global_end, freq=frequency)

    # Sets have no stable iteration order; sort so the generated rows are
    # reproducible from run to run (key=str tolerates mixed value types).
    dimensions, column_names = [], []
    if all_panel_values:
        dimensions.append(sorted(all_panel_values, key=str))
        column_names.append("Panel")
    if all_dma_values:
        dimensions.append(sorted(all_dma_values, key=str))
        column_names.append("DMA")

    # Date range is always included
    dimensions.append(date_range)
    column_names.append("date")

    # Cartesian product of all dimensions; index=False already yields a
    # default RangeIndex.
    return pd.MultiIndex.from_product(dimensions, names=column_names).to_frame(
        index=False
    )


# Function to prepare and merge dataFrames
# NOTE(review): the statement below calls `st.cache_resource(...)` but discards the
# returned decorator (there is no `@`), so the function defined after it is NOT cached.
st.cache_resource(show_spinner=False)


def merge_into_main_df(main_df, files_dict, selections):
    """Left-merge every file's data onto the skeleton frame.

    For each file, the columns selected as DMA / Panel (when not "N/A") are
    renamed to "DMA" / "Panel" on a copy of the file's frame, which is then
    joined to ``main_df`` on "date" plus whichever of "Panel" and "DMA" the
    copy contains. The result is sorted by date, then Panel, then DMA (where
    present) and re-indexed.

    Returns:
        The merged, sorted DataFrame.
    """
    for file_name, file_data in files_dict.items():
        frame = file_data["df"].copy()
        chosen = selections[file_name]

        # Give the selected columns their canonical names
        for role in ("DMA", "Panel"):
            source_column = chosen.get(role, "N/A")
            if source_column != "N/A":
                frame.rename(columns={source_column: role}, inplace=True)

        # Join on date and on every dimension this file provides
        join_on = ["date"] + [key for key in ("Panel", "DMA") if key in frame.columns]
        main_df = main_df.merge(frame, on=join_on, how="left")

    # Final ordering: date first, then the dimensions that exist
    ordering = ["date"] + [key for key in ("Panel", "DMA") if key in main_df.columns]
    main_df.sort_values(by=ordering, inplace=True)
    main_df.reset_index(drop=True, inplace=True)

    return main_df


# Function to categorize column
def categorize_column(column_name):
    """Guess a variable category from keywords in its column name.

    The match is a case-insensitive substring test. Internal keywords are
    checked before exogenous ones; a name matching neither is "Media".
    """
    internal_keywords = (
        "Price",
        "Discount",
        "product_price",
        "cost",
        "margin",
        "inventory",
        "sales",
        "revenue",
        "turnover",
        "expense",
    )
    exogenous_keywords = (
        "GDP",
        "Tax",
        "Inflation",
        "interest_rate",
        "employment_rate",
        "exchange_rate",
        "consumer_spending",
        "retail_sales",
        "oil_prices",
        "weather",
    )

    # Internal takes precedence over Exogenous when both would match
    for label, keywords in (
        ("Internal", internal_keywords),
        ("Exogenous", exogenous_keywords),
    ):
        if any(keyword.lower() in column_name.lower() for keyword in keywords):
            return label

    # Default to Media if no match found
    return "Media"


# Function to calculate missing stats and prepare for editable DataFrame
# NOTE(review): the statement below calls `st.cache_resource(...)` but discards the
# returned decorator (there is no `@`), so the function defined after it is NOT cached.
st.cache_resource(show_spinner=False)


def prepare_missing_stats_df(df):
    """Summarise missing values per variable for the editable table.

    The "date", "DMA" and "Panel" columns are skipped. Every other column
    gets one row with its missing count and percentage and the defaults
    "Fill with 0" (impute method) and "Media" (category).

    Args:
        df: The merged DataFrame to inspect.

    Returns:
        DataFrame with columns "Column", "Missing Values",
        "Missing Percentage", "Impute Method" and "Category". The columns
        are present even when there is nothing to report.
    """
    stat_columns = [
        "Column",
        "Missing Values",
        "Missing Percentage",
        "Impute Method",
        "Category",
    ]
    total_rows = len(df)

    missing_stats = []
    for column in df.columns:
        # Skip Date, DMA and Panel column
        if column in ("date", "DMA", "Panel"):
            continue

        missing = df[column].isnull().sum()
        # An empty frame has nothing missing; avoid a 0/0 division
        pct_missing = round((missing / total_rows) * 100, 2) if total_rows else 0.0

        missing_stats.append(
            {
                "Column": column,
                "Missing Values": missing,
                "Missing Percentage": pct_missing,
                "Impute Method": "Fill with 0",  # Default value
                "Category": "Media",  # Default; the user picks the real category
            }
        )

    # Explicit columns keep the schema stable when no variable is listed
    return pd.DataFrame(missing_stats, columns=stat_columns)


# Function to add API DataFrame details to the files dictionary
# NOTE(review): the statement below calls `st.cache_resource(...)` but discards the
# returned decorator (there is no `@`), so the function defined after it is NOT cached.
st.cache_resource(show_spinner=False)


def add_api_dataframe_to_dict(main_df, files_dict):
    """Register the API DataFrame in ``files_dict`` under the key "API".

    The entry has the same shape as those for uploaded files: numeric column
    names, non-numeric column names (excluding the date column), the detected
    interval and the frame itself.

    Args:
        main_df: API DataFrame with a datetime ``date`` column.
        files_dict: Dictionary to update in place.

    Returns:
        The same ``files_dict`` object with the "API" entry added.
    """
    # Most common gap (in days) between consecutive distinct dates; sorting
    # first keeps the gaps meaningful when rows are not chronological.
    day_gaps = (
        pd.Series(main_df["date"].dropna().unique()).sort_values().diff().dt.days.dropna()
    )
    gap_modes = day_gaps.mode()

    # Fewer than two distinct dates leaves no gap to measure
    if gap_modes.empty:
        interval = "irregular"
    else:
        interval = determine_data_interval(gap_modes.iloc[0])

    files_dict["API"] = {
        "numeric": list(main_df.select_dtypes(include=["number"]).columns),
        "non_numeric": [
            col
            for col in main_df.select_dtypes(exclude=["number"]).columns
            if col.lower() != "date"
        ],
        "interval": interval,
        "df": main_df,
    }

    return files_dict


# Function to read the API export into a DataFrame, parsing the 'Date' column as datetime
# `st.cache_data` is used (rather than `st.cache_resource`) because callers
# rename columns and rewrite values on the returned frame; cache_data hands
# each caller its own copy, so those edits cannot leak into other reruns or
# sessions through a shared cached object.
@st.cache_data(show_spinner=False)
def read_API_data():
    """Load ``upf_data_converted.xlsx`` from the working directory.

    Returns:
        DataFrame read from the workbook with the ``Date`` column parsed as
        datetime.
    """
    return pd.read_excel(r"upf_data_converted.xlsx", parse_dates=["Date"])


# Function to set the 'DMA_Panel_Selected' session state variable to False
def set_DMA_Panel_Selected_false():
    """Widget callback: flag that the DMA/Panel selection must be re-accepted."""
    st.session_state.DMA_Panel_Selected = False


# Seed the session-state entries this page relies on; existing values are kept.
# Each entry maps a key to a factory producing its initial value.
_session_defaults = (
    ("final_df", pd.DataFrame),
    ("bin_dict", dict),
    ("DMA_Panel_Selected", lambda: False),
)
for _state_key, _make_default in _session_defaults:
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()

# Page Title
st.write("")  # Top padding
st.title("Data Import")


#########################################################################################################################################################
# Create a dictionary to hold all DataFrames and collect user input to specify "DMA" and "Panel" columns for each file
#########################################################################################################################################################


# Read the Excel file, parsing 'Date' column as datetime
main_df = read_API_data()

# Convert all column names to lowercase (and strip surrounding whitespace).
# NOTE(review): this assigns to `.columns` on the object returned by
# read_API_data(), i.e. it modifies that object in place.
main_df.columns = main_df.columns.str.lower().str.strip()

# File uploader: accepts several .xlsx files; any change resets the
# 'DMA_Panel_Selected' flag through the callback.
uploaded_files = st.file_uploader(
    "Upload additional data",
    type=["xlsx"],
    accept_multiple_files=True,
    on_change=set_DMA_Panel_Selected_false,
)

# Custom HTML for upload instructions
# NOTE(review): the f-prefix is unnecessary here, the string has no placeholders.
recommendation_html = f"""
<div style="text-align: justify;">
<strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets including DMA, Panel, media, internal, and exogenous data adhere to the following guidelines: Each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code>, be free of missing values.
</div>
"""
st.markdown(recommendation_html, unsafe_allow_html=True)

# Choose Date Granularity
st.markdown("#### Choose Date Granularity")
# Granularity Selection; changing it also resets the 'DMA_Panel_Selected' flag
granularity_selection = st.selectbox(
    "Choose Date Granularity",
    ["Daily", "Weekly", "Monthly"],
    label_visibility="collapsed",
    on_change=set_DMA_Panel_Selected_false,
)
# Lower-cased so it matches the "daily" / "weekly" / "monthly" labels used
# by the granularity helpers
granularity_selection = str(granularity_selection).lower()

# Convert files to dataframes
files_dict = files_to_dataframes(uploaded_files)

# Add API Dataframe under the "API" key
if main_df is not None:
    files_dict = add_api_dataframe_to_dict(main_df, files_dict)

# Display a warning message if no files have been uploaded and halt further execution
# NOTE(review): once the "API" entry has been added above, files_dict is
# non-empty, so this branch only triggers when main_df is None.
if not files_dict:
    st.warning(
        "Please upload at least one file to proceed.",
        icon="⚠️",
    )
    st.stop()  # Halts further execution until file is uploaded


# Select DMA and Panel columns
# For every file, let the user choose which non-numeric column (if any) holds
# the DMA and which holds the Panel; the choices are collected in `selections`
# as {file_name: {"DMA": ..., "Panel": ...}} with "N/A" meaning "none".
st.markdown("#### Select DMA and Panel columns")
selections = {}
with st.expander("Select DMA and Panel columns", expanded=False):
    # Number of files rendered so far; drives label visibility and widget keys
    count = 0
    for file_name, file_data in files_dict.items():
        # Column labels are shown only above the first rendered row
        label_visibility = "visible" if count == 0 else "collapsed"

        # Candidate columns, with "N/A" offered as the last option
        non_numeric_cols = file_data["non_numeric"]
        dma_values = non_numeric_cols + ["N/A"]
        panel_values = non_numeric_cols + ["N/A"]

        # Nothing to choose from: record "N/A" and render no widgets
        if not non_numeric_cols:
            selections[file_name] = {"DMA": "N/A", "Panel": "N/A"}
            continue

        # Create layout columns for File Name, DMA, and Panel selections
        file_name_col, DMA_col, Panel_col = st.columns([2, 4, 4])

        with file_name_col:
            # Display "File Name" label only for the first file
            st.write("File Name" if count == 0 else "")
            st.write(file_name)  # Display the file name

        with DMA_col:
            selected_dma = st.selectbox(
                "Select DMA",
                dma_values,
                on_change=set_DMA_Panel_Selected_false,
                label_visibility=label_visibility,
                key=f"DMA_selectbox{count}",  # Unique key for each selectbox
            )

        with Panel_col:
            selected_panel = st.selectbox(
                "Select Panel",
                panel_values,
                on_change=set_DMA_Panel_Selected_false,
                label_visibility=label_visibility,
                key=f"Panel_selectbox{count}",  # Unique key for each selectbox
            )

        # One column cannot play both roles; stop the run until it is fixed
        if selected_panel == selected_dma and selected_dma != "N/A":
            st.warning(
                f"File: {file_name} → The same column cannot serve as both Panel and DMA. Please adjust your selections.",
            )
            st.stop()

        # Update the selections for DMA and Panel for the current file
        selections[file_name] = {
            "DMA": selected_dma,
            "Panel": selected_panel,
        }

        count += 1  # Increment the counter after rendering each file

    # Accept DMA and Panel selection
    if st.button("Accept and Process", use_container_width=True):

        # `st.spinner` is called with its documented arguments only; the
        # former `cache=True` keyword is not part of the public API and is
        # rejected by current Streamlit releases.
        with st.spinner("Processing..."):
            # First normalise everything to daily granularity; this makes the
            # conversion to any other granularity uniform.
            files_dict = standardize_data_to_daily(files_dict, selections)

            # Then convert the daily data to the granularity the user selected
            files_dict = apply_granularity_to_all(
                files_dict, granularity_selection, selections
            )

        st.session_state["files_dict"] = files_dict
        st.session_state["DMA_Panel_Selected"] = True


#########################################################################################################################################################
# Display unique DMA and Panel values
#########################################################################################################################################################


# Halts further execution until DMA and Panel columns are selected:
# continue only when processed files are stored in session state and the
# 'DMA_Panel_Selected' flag is set.
if "files_dict" in st.session_state and st.session_state["DMA_Panel_Selected"]:
    files_dict = st.session_state["files_dict"]
else:
    st.stop()

# Set to store unique values of DMA and Panel
with st.spinner("Fetching DMA and Panel values..."):
    # Also cleans the selected DMA/Panel columns of the frames in files_dict
    all_dma_values, all_panel_values = clean_and_extract_unique_values(
        files_dict, selections
    )

    # List of DMA and Panel columns unique values
    list_of_all_dma_values = list(all_dma_values)
    list_of_all_panel_values = list(all_panel_values)

    # Format DMA and Panel values for display
    formatted_dma_values = format_values_for_display(list_of_all_dma_values)
    formatted_panel_values = format_values_for_display(list_of_all_panel_values)

# Unique DMA and Panel values
st.markdown("#### Unique DMA and Panel values")
# Display DMA and Panel values
# NOTE(review): the formatted values originate from file contents and are
# interpolated into HTML without escaping while unsafe_allow_html=True.
with st.expander("Unique DMA and Panel values"):
    st.write("")
    st.markdown(
        f"""
    <style>
    .justify-text {{
    text-align: justify;
    }}
    </style>
    <div class="justify-text">
    <strong>Panel Values:</strong> {formatted_panel_values}<br>
    <strong>DMA Values:</strong> {formatted_dma_values}
    </div>
    """,
        unsafe_allow_html=True,
    )

    # Display total DMA and Panel
    st.write("")
    st.markdown(
        f"""
    <div style="text-align: justify;">
        <strong>Number of DMAs detected:</strong> {len(list_of_all_dma_values)}<br>
        <strong>Number of Panels detected:</strong> {len(list_of_all_panel_values)}
    </div>
    """,
        unsafe_allow_html=True,
    )
    st.write("")


#########################################################################################################################################################
# Merge all DataFrames
#########################################################################################################################################################


# Merge all DataFrames selected
# Step 1: build the skeleton of Panel x DMA x date rows for the chosen granularity.
# Note that `main_df` is rebound here; it no longer refers to the API frame.
main_df = create_main_dataframe(
    files_dict, all_dma_values, all_panel_values, granularity_selection
)
# Step 2: left-merge every file's data onto that skeleton
merged_df = merge_into_main_df(main_df, files_dict, selections)

# # Display the merged DataFrame
# st.markdown("#### Merged DataFrame based on selected DMA and Panel")
# st.dataframe(merged_df)


#########################################################################################################################################################
# Categorize Variables and Impute Missing Values
#########################################################################################################################################################


# Create an editable DataFrame in Streamlit
st.markdown("#### Select Variables Category & Impute Missing Values")

# One row per variable with its missing-value statistics and default choices
missing_stats_df = prepare_missing_stats_df(merged_df)

# Dropdown definitions for the two user-editable columns
impute_method_column = st.column_config.SelectboxColumn(
    options=[
        "Drop Column",
        "Fill with Mean",
        "Fill with Median",
        "Fill with 0",
    ],
    required=True,
    default="Fill with 0",
)
category_column = st.column_config.SelectboxColumn(
    options=[
        "Media",
        "Exogenous",
        "Internal",
        "Response_Metric",
    ],
    required=True,
    default="Media",
)

# The statistics columns are read-only; only method and category can change
edited_stats_df = st.data_editor(
    missing_stats_df,
    column_config={
        "Impute Method": impute_method_column,
        "Category": category_column,
    },
    disabled=["Column", "Missing Values", "Missing Percentage"],
    hide_index=True,
    use_container_width=True,
)

# Apply changes based on edited DataFrame: drop or impute each listed column
for _, row in edited_stats_df.iterrows():
    column = row["Column"]
    impute_method = row["Impute Method"]

    if impute_method == "Drop Column":
        merged_df.drop(columns=[column], inplace=True)
        continue

    if impute_method == "Fill with 0":
        fill_value = 0
    elif impute_method in ("Fill with Mean", "Fill with Median"):
        # Mean/median are undefined for text columns; skip with a notice
        # instead of crashing the page.
        if not pd.api.types.is_numeric_dtype(merged_df[column]):
            st.warning(
                f"Column: {column} → '{impute_method}' requires a numeric column. Missing values were left unchanged.",
            )
            continue
        if impute_method == "Fill with Mean":
            fill_value = merged_df[column].mean()
        else:
            fill_value = merged_df[column].median()
    else:
        # Unknown method: leave the column untouched
        continue

    # Assign the filled column back. `merged_df[column].fillna(..., inplace=True)`
    # acts on a temporary Series and does not update merged_df when pandas
    # Copy-on-Write is active.
    merged_df[column] = merged_df[column].fillna(fill_value)

# Display the Final DataFrame and exogenous variables
st.markdown("#### Final DataFrame")
final_df = merged_df
st.dataframe(final_df, hide_index=True)

# Group the variables by the category the user assigned to them:
# {category: [column, ...]} in the order the rows appear in the edited table
category_dict = {}
for _, stats_row in edited_stats_df.iterrows():
    category_dict.setdefault(stats_row["Category"], []).append(stats_row["Column"])

# Add Date, DMA and Panel in category dictionary
category_dict["Date"] = ["date"]
for dimension in ("DMA", "Panel"):
    if dimension in final_df.columns:
        category_dict[dimension] = [dimension]

# Display the dictionary
st.markdown("#### Variable Category")
for category, variables in category_dict.items():
    # Several variables read "a, b and c"; a single one is shown as is
    if len(variables) > 1:
        variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
    else:
        variables_str = variables[0]

    # Display the category and its variables in the desired format
    st.markdown(
        f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
        unsafe_allow_html=True,
    )

# Store final dataframe and bin dictionary into session state
st.session_state["final_df"] = final_df
st.session_state["bin_dict"] = category_dict

# Persist the final DataFrame and the category dictionary to disk on request
if st.button('Save Changes'):
    import os

    # Make sure the target folder exists; open() would fail otherwise
    os.makedirs("Pickle_files", exist_ok=True)

    with open("Pickle_files/main_df", 'wb') as f:
        pickle.dump(st.session_state["final_df"], f)
    with open("Pickle_files/category_dict", 'wb') as c:
        pickle.dump(st.session_state["bin_dict"], c)
    st.success('Changes Saved!')