BlendMMM committed · verified
Commit 6641078 · 1 Parent(s): 56ff7b3

Upload Data_Import.py

Files changed (1): Data_Import.py +825 -318
Data_Import.py CHANGED
@@ -1,6 +1,5 @@
 # Importing necessary libraries
 import streamlit as st
-import pickle
 
 st.set_page_config(
     page_title="Model Build",
@@ -9,376 +8,884 @@ st.set_page_config(
     initial_sidebar_state="collapsed",
 )
 
-from utilities import load_authenticator
 import numpy as np
 import pandas as pd
-from utilities import set_header, load_local_css
 
 load_local_css("styles.css")
 set_header()
 
-
-for k, v in st.session_state.items():
-    if k not in ['logout', 'login', 'config'] and not k.startswith('FormSubmitter'):
-        st.session_state[k] = v
-
-authenticator = st.session_state.get('authenticator')
 if authenticator is None:
     authenticator = load_authenticator()
 
-name, authentication_status, username = authenticator.login('Login', 'main')
-auth_status = st.session_state.get('authentication_status')
-
-if auth_status == True:
-    is_state_initiaized = st.session_state.get('initialized', False)
-    if not is_state_initiaized:
-        a = 1
-
-
-    # Function to expand dataframe to daily
-    @st.cache_resource(show_spinner=False)
-    def expand_to_daily(df, granularity, start_date, end_date):
-        # Create a new DataFrame with a row for each day
-        all_dates = pd.date_range(start=start_date, end=end_date, freq="D")
-        daily_df = pd.DataFrame(all_dates, columns=["Date"])
-
-        if granularity == "daily":
-            # For daily data, simply merge to fill missing dates
-            daily_df = daily_df.merge(df, on="Date", how="left")
-        else:
-            # For weekly or monthly, distribute values to daily rows
-            for column in df.columns:
-                if column != "Date":  # Skip 'Date' column
-                    daily_df[column] = np.nan  # Initialize with NaNs
-
-            # Group by the required frequency and distribute values
-            freq = "W-MON" if granularity == "weekly" else "MS"
-            for _, group in df.groupby(pd.Grouper(key="Date", freq=freq)):
-                num_days = len(
-                    pd.date_range(group["Date"].min(), group["Date"].max(), freq="D")
                 )
-                for column in group.columns:
-                    if column == "Date":  # Skip 'Date' column
-                        continue
-                    value = group[column].sum() / num_days
-                    date_range = pd.date_range(
-                        group["Date"].min(), periods=num_days, freq="D"
-                    )
-                    daily_df.loc[daily_df["Date"].isin(date_range), column] = value
-
-        return daily_df
-
-
-    # Function to validate date column in dataframe
-    def validate_date_column(df):
-        try:
-            # Attempt to convert the 'Date' column to datetime
-            df["Date"] = pd.to_datetime(df["Date"], format="%d-%m-%Y")
-            return True
-        except:
-            return False
-
-
-    # Function to determine data interval
-    def determine_data_interval(common_freq):
-        if common_freq == 1:
-            return "daily"
-        elif common_freq == 7:
-            return "weekly"
-        elif 28 <= common_freq <= 31:
-            return "monthly"
-        else:
-            return "irregular"
-
-
-    # Function to convert and fill dates in dataframe
-    def convert_and_fill_dates(df, start_date, end_date, interval):
-        # Create a date range for the desired period
-        all_dates = pd.date_range(start=start_date, end=end_date, freq="D")
-        new_df = pd.DataFrame(all_dates, columns=["Date"])
-
-        # Preprocess and aggregate data based on the original interval
-        if interval != "daily":
-            # Resample to start of each week/month, then sum values for the same period
-            if interval == "weekly":
-                df = df.resample("W-MON", on="Date").sum().reset_index()
-            elif interval == "monthly":
-                df = df.resample("MS", on="Date").sum().reset_index()
-
-            # Distribute values equally across the days in each week/month
-            expanded_rows = []
-            for _, row in df.iterrows():
-                if interval == "weekly":
-                    period_dates = pd.date_range(row["Date"], periods=7)
-                elif interval == "monthly":
-                    period_end = row["Date"] + pd.offsets.MonthEnd(1)
-                    period_dates = pd.date_range(row["Date"], period_end)
-
-                for date in period_dates:
-                    new_row = row.copy()
-                    new_row["Date"] = date
-                    for col in df.columns:
-                        if col != "Date":  # Skip 'Date' column
-                            new_row[col] = row[col] / len(period_dates)
-                    expanded_rows.append(new_row)
-
-            # Create a DataFrame from expanded rows
-            expanded_df = pd.DataFrame(expanded_rows)
-            new_df = pd.merge(new_df, expanded_df, how="left", on="Date")
-        else:
-            # Daily data, aggregate if there are multiple entries for the same day
-            df = df.groupby("Date").sum().reset_index()
-            new_df = pd.merge(new_df, df, how="left", on="Date")
-
-        # Ensure all dates from start to end are present, filling missing values with NaN
-        new_df["Date"] = pd.to_datetime(new_df["Date"])  # Ensure 'Date' is datetime type
-        new_df = new_df.set_index("Date").reindex(all_dates).reset_index()
-        new_df.rename(columns={"index": "Date"}, inplace=True)
-
-        return new_df
-
-
-    # Function to convert a DataFrame from daily level granularity to either weekly or monthly level
-    def convert_to_higher_granularity(df, required_granularity):
-        if required_granularity == "daily":
-            return df
-
-        # Ensure 'Date' is the index and is in datetime format
-        if not pd.api.types.is_datetime64_any_dtype(df["Date"]):
-            df["Date"] = pd.to_datetime(df["Date"])
-        df.set_index("Date", inplace=True)
-
-        # Resample and aggregate
-        if required_granularity == "weekly":
-            # Resample to weekly, using 'W-MON' to indicate weeks starting on Monday
-            df = df.resample("W-MON").sum()
-        elif required_granularity == "monthly":
-            # Resample to monthly, using 'MS' to indicate month start
-            df = df.resample("MS").sum()
-
-        # Reset index to move 'Date' back to a column
-        df.reset_index(inplace=True)
-
-        return df
-
-
-    # # Read the CSV file, parsing 'Date' column as datetime
-    main_df = pd.read_csv("Media_data_for_model_dma_level.csv", dayfirst=True, parse_dates=["Date"])
-    # st.write(main_df)
-
-    # Get the start date (minimum) and end date (maximum) from the 'Date' column
-    api_start_date = main_df["Date"].min()
-    api_end_date = main_df["Date"].max()
-
-    # Infer the granularity from the most common difference between consecutive dates
-    date_diffs = main_df["Date"].diff().dt.days.dropna()
-    common_diff = date_diffs.mode()[0]
-    api_granularity = determine_data_interval(common_diff)
-
-    # Convert the DataFrame to daily level granularity
-    main_df = expand_to_daily(main_df, api_granularity, api_start_date, api_end_date)
-
-    # Page Title
-    st.title("Data Import")
-
-    # File uploader
-    uploaded_files = st.file_uploader(
-        "Upload additional data", type=["xlsx"], accept_multiple_files=True
-    )
-
-    # Custom HTML for upload instructions
-    recommendation_html = f"""
-    <div style="text-align: justify;">
-        <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets including media, internal, and exogenous data adhere to the following guidelines: Each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code>, be free of missing values, and aggregated to a {api_granularity} level.
-    </div>
-    """
-
-    st.markdown(recommendation_html, unsafe_allow_html=True)
-
-    # Initialize a list to collect all processed DataFrames
-    all_data_dfs = []
-
-    if uploaded_files:
-        for uploaded_file in uploaded_files:
-            # Extract the file name
-            file_name = uploaded_file.name
-
-            # Load the file into a DataFrame
-            data_df = pd.read_excel(
-                uploaded_file,
             )
-
-            # Identify numeric columns in the DataFrame
-            numeric_columns = data_df.select_dtypes(include="number").columns.tolist()
-
-            # Validate the 'Date' column and ensure there's at least one numeric column
-            if validate_date_column(data_df) and len(numeric_columns) > 0:
-                data_df = data_df[["Date"] + numeric_columns]
-
-                # Ensure the 'Date' column is in datetime format and sorted
-                data_df["Date"] = pd.to_datetime(data_df["Date"], dayfirst=True)
-                data_df.sort_values("Date", inplace=True)
-
-                # Calculate the most common day difference between dates to determine frequency
-                common_freq = data_df["Date"].diff().dt.days.dropna().mode()[0]
-
-                # Calculate the data interval (daily, weekly, monthly or irregular)
-                interval = determine_data_interval(common_freq)
-
-                if interval == "irregular":
-                    # Warn the user if the 'Date' column doesn't meet the format requirements
-                    st.warning(
-                        f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval."
-                    )
-                    continue
-
-                # Convert data to specified interval and redistribute to daily
-                data_df = convert_and_fill_dates(
-                    data_df, api_start_date, api_end_date, interval
-                )
-
-                # Add the processed DataFrame to the list
-                all_data_dfs.append(data_df)
-
             else:
-                # Warn the user if the 'Date' column doesn't meet the format requirements
-                st.warning(
-                    f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column."
-                )
-
-    # Sequentially merge each of the other DataFrames with the main DataFrame on 'Date'
-    for df in all_data_dfs:
-        main_df = pd.merge(main_df, df, on="Date", how="left")
-
-
-    # Function to calculate missing stats and prepare for editable DataFrame
-    def prepare_missing_stats_df(df):
-        missing_stats = []
-        for column in df.columns:
-            if (
-                column == "Date" or column == "Total Approved Accounts - Revenue"
-            ):  # Skip Date and Revenue column
-                continue
-
-            missing = df[column].isnull().sum()
-            pct_missing = round((missing / len(df)) * 100, 2)
-            missing_stats.append(
-                {
-                    "Column": column,
-                    "Missing Values": missing,
-                    "Missing Percentage": pct_missing,
-                    "Impute Method": "Fill with 0",  # Default value
-                    "Category": "Media",  # Default value
-                }
             )
-        stats_df = pd.DataFrame(missing_stats)
-        return stats_df
-
-
-    # Prepare missing stats DataFrame for editing
-    missing_stats_df = prepare_missing_stats_df(main_df)
-
-    # Create an editable DataFrame in Streamlit
-    st.markdown("#### Select Variables Category & Impute Missing Values")
-
-    edited_stats_df = st.data_editor(
-        missing_stats_df,
-        column_config={
-            "Impute Method": st.column_config.SelectboxColumn(
-                options=[
-                    "Drop Column",
-                    "Fill with Mean",
-                    "Fill with Median",
-                    "Fill with 0",
-                ],
-                required=True,
-                default="Fill with 0",
-            ),
-            "Category": st.column_config.SelectboxColumn(
-                options=[
-                    "Date",
-                    "Media",
-                    "Exogenous",
-                    "Internal",
-                    "DMA/Panel",
-                    "Response_Metric"
-                ],
-                required=True,
-                default="Media",
-            ),
-        },
-        disabled=["Column", "Missing Values", "Missing Percentage"],
-        hide_index=True,
-        use_container_width=True,
     )
 
-    # Apply changes based on edited DataFrame
-    for i, row in edited_stats_df.iterrows():
-        column = row["Column"]
-        if row["Impute Method"] == "Drop Column":
-            main_df.drop(columns=[column], inplace=True)
-
-        elif row["Impute Method"] == "Fill with Mean":
-            main_df[column].fillna(main_df[column].mean(), inplace=True)
-
-        elif row["Impute Method"] == "Fill with Median":
-            main_df[column].fillna(main_df[column].median(), inplace=True)
-
-        elif row["Impute Method"] == "Fill with 0":
-            main_df[column].fillna(0, inplace=True)
-
-    # Convert the Final DataFrame to required granularity
-    main_df = convert_to_higher_granularity(main_df, api_granularity)
-
-    # Display the Final DataFrame and exogenous variables
-    st.markdown("#### Final DataFrame:")
-    st.dataframe(main_df)
-
-    # Initialize an empty dictionary to hold categories and their variables
-    category_dict = {}
-
-    # Iterate over each row in the edited DataFrame to populate the dictionary
-    for i, row in edited_stats_df.iterrows():
-        column = row["Column"]
-        category = row["Category"]  # The category chosen by the user for this variable
-
-        # Check if the category already exists in the dictionary
-        if category not in category_dict:
-            # If not, initialize it with the current column as its first element
-            category_dict[category] = [column]
-        else:
-            # If it exists, append the current column to the list of variables under this category
-            category_dict[category].append(column)
-
-    # Display the dictionary
-    st.markdown("#### Variable Category:")
-    for category, variables in category_dict.items():
-        # Check if there are multiple variables to handle "and" insertion correctly
-        if len(variables) > 1:
-            # Join all but the last variable with ", ", then add " and " before the last variable
-            variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
-        else:
-            # If there's only one variable, no need for "and"
-            variables_str = variables[0]
-
-        # Display the category and its variables in the desired format
-        st.markdown(f"**{category}:** {variables_str}\n\n", unsafe_allow_html=True)
-
-    # storing maindf and categories in session_state
-    # st.write(main_df)
-
-    # st.session_state['Cleaned_data']=main_df
-    # st.session_state['category_dict']=category_dict
-    if st.button('Save Changes'):
-
-        with open("Pickle_files/main_df", 'wb') as f:
-            pickle.dump(main_df, f)
-        with open("Pickle_files/category_dict", 'wb') as c:
-            pickle.dump(category_dict, c)
-        st.success('Changes Saved!')
 
 # Importing necessary libraries
 import streamlit as st
 
 st.set_page_config(
     page_title="Model Build",
     initial_sidebar_state="collapsed",
 )
 
 import numpy as np
 import pandas as pd
+from utilities import set_header, load_local_css, load_authenticator
+import pickle
+
 
 load_local_css("styles.css")
 set_header()
 
+authenticator = st.session_state.get("authenticator")
 if authenticator is None:
     authenticator = load_authenticator()
 
+name, authentication_status, username = authenticator.login("Login", "main")
+auth_status = st.session_state.get("authentication_status")
+
+# Check for authentication status
+if auth_status != True:
+    st.stop()
+
+
+# Function to validate date column in dataframe
+def validate_date_column(df):
+    try:
+        # Attempt to convert the 'Date' column to datetime
+        df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
+        return True
+    except:
+        return False
+
+
+# Function to determine data interval
+def determine_data_interval(common_freq):
+    if common_freq == 1:
+        return "daily"
+    elif common_freq == 7:
+        return "weekly"
+    elif 28 <= common_freq <= 31:
+        return "monthly"
+    else:
+        return "irregular"
+
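
For intuition, a minimal sketch of how the modal day gap feeds determine_data_interval; the dates here are synthetic and purely illustrative:

import pandas as pd

# Weekly data: consecutive dates sit 7 days apart
dates = pd.Series(pd.date_range("2024-01-01", periods=10, freq="7D"))
common_freq = dates.diff().dt.days.dropna().mode()[0]  # 7.0
# determine_data_interval(common_freq) -> "weekly"
# (1 -> "daily", 28-31 -> "monthly", anything else -> "irregular")
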
+# Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
+@st.cache_resource(show_spinner=False)
+def files_to_dataframes(uploaded_files):
+    df_dict = {}
+    for uploaded_file in uploaded_files:
+        # Extract file name without extension
+        file_name = uploaded_file.name.rsplit(".", 1)[0]
+
+        # Check for duplicate file names
+        if file_name in df_dict:
+            st.warning(
+                f"Duplicate File: {file_name}. This file will be skipped.",
+                icon="⚠️",
+            )
+            continue
+
+        # Read the file into a DataFrame
+        df = pd.read_excel(uploaded_file)
+
+        # Convert all column names to lowercase
+        df.columns = df.columns.str.lower().str.strip()
+
+        # Separate numeric and non-numeric columns
+        numeric_cols = list(df.select_dtypes(include=["number"]).columns)
+        non_numeric_cols = [
+            col
+            for col in df.select_dtypes(exclude=["number"]).columns
+            if col.lower() != "date"
+        ]
+
+        # Check for 'Date' column
+        if not (validate_date_column(df) and len(numeric_cols) > 0):
+            st.warning(
+                f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
+                icon="⚠️",
+            )
+            continue
+
+        # Check for interval
+        common_freq = (
+            pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
+        )
+        # Calculate the data interval (daily, weekly, monthly or irregular)
+        interval = determine_data_interval(common_freq)
+        if interval == "irregular":
+            st.warning(
+                f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
+                icon="⚠️",
+            )
+            continue
+
+        # Store both DataFrames in the dictionary under their respective keys
+        df_dict[file_name] = {
+            "numeric": numeric_cols,
+            "non_numeric": non_numeric_cols,
+            "interval": interval,
+            "df": df,
+        }
+
+    return df_dict
+
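
A sketch of the per-file entry this builds, with hypothetical column names standing in for a real upload:

import pandas as pd

# Mirrors what files_to_dataframes stores under each file-name key
example_entry = {
    "numeric": ["spend"],    # numeric (media/exogenous) columns
    "non_numeric": ["dma"],  # candidate DMA/Panel columns
    "interval": "weekly",    # inferred from the modal date gap
    "df": pd.DataFrame(
        {
            "date": pd.date_range("2024-01-01", periods=3, freq="7D"),
            "spend": [100.0, 80.0, 95.0],
            "dma": ["ny", "ny", "la"],
        }
    ),
}
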
+
+# Function to adjust dataframe granularity
+def adjust_dataframe_granularity(df, current_granularity, target_granularity):
+    # Set index
+    df.set_index("date", inplace=True)
+
+    # Define aggregation rules for resampling
+    aggregation_rules = {
+        col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
+        for col in df.columns
+    }
+
+    # Initialize resampled_df
+    resampled_df = df
+    if current_granularity == "daily" and target_granularity == "weekly":
+        resampled_df = df.resample("W-MON", closed="left", label="left").agg(
+            aggregation_rules
+        )
+
+    elif current_granularity == "daily" and target_granularity == "monthly":
+        resampled_df = df.resample("MS", closed="left", label="left").agg(
+            aggregation_rules
+        )
+
+    elif current_granularity == "daily" and target_granularity == "daily":
+        resampled_df = df.resample("D").agg(aggregation_rules)
+
+    elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
+        # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
+        expanded_data = []
+        for _, row in df.iterrows():
+            if current_granularity == "weekly":
+                period_range = pd.date_range(start=row.name, periods=7)
+            elif current_granularity == "monthly":
+                period_range = pd.date_range(
+                    start=row.name, periods=row.name.days_in_month
                 )
+
+            for date in period_range:
+                new_row = {}
+                for col in df.columns:
+                    if pd.api.types.is_numeric_dtype(df[col]):
+                        if current_granularity == "weekly":
+                            new_row[col] = row[col] / 7
+                        elif current_granularity == "monthly":
+                            new_row[col] = row[col] / row.name.days_in_month
+                    else:
+                        new_row[col] = row[col]
+                expanded_data.append((date, new_row))
+
+        resampled_df = pd.DataFrame(
+            [data for _, data in expanded_data],
+            index=[date for date, _ in expanded_data],
+        )
+
+    # Reset index
+    resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})
+
+    return resampled_df
+
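
A quick sketch of the weekly-to-daily branch, assuming the function above is in scope and pandas is imported (note it mutates its input via set_index, hence the copy):

weekly = pd.DataFrame({"date": [pd.Timestamp("2024-01-01")], "spend": [700.0]})
daily = adjust_dataframe_granularity(weekly.copy(), "weekly", "daily")
# daily now has 7 rows, 2024-01-01 through 2024-01-07, each with spend 100.0
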
+# Function to clean and extract unique values of DMA and Panel
+@st.cache_resource(show_spinner=False)
+def clean_and_extract_unique_values(files_dict, selections):
+    all_dma_values = set()
+    all_panel_values = set()
+
+    for file_name, file_data in files_dict.items():
+        df = file_data["df"]
+
+        # 'DMA' and 'Panel' selections
+        selected_dma = selections[file_name].get("DMA")
+        selected_panel = selections[file_name].get("Panel")
+
+        # Clean and standardize DMA column if it exists and is selected
+        if selected_dma and selected_dma != "N/A" and selected_dma in df.columns:
+            df[selected_dma] = (
+                df[selected_dma].str.lower().str.strip().str.replace("_", " ")
+            )
+            all_dma_values.update(df[selected_dma].dropna().unique())
+
+        # Clean and standardize Panel column if it exists and is selected
+        if selected_panel and selected_panel != "N/A" and selected_panel in df.columns:
+            df[selected_panel] = (
+                df[selected_panel].str.lower().str.strip().str.replace("_", " ")
+            )
+            all_panel_values.update(df[selected_panel].dropna().unique())
+
+        # Update the processed DataFrame back in the dictionary
+        files_dict[file_name]["df"] = df
+
+    return all_dma_values, all_panel_values
+
+# Function to format values for display
+@st.cache_resource(show_spinner=False)
+def format_values_for_display(values_list):
+    # Capitalize the first letter of each word and replace underscores with spaces
+    formatted_list = [value.replace("_", " ").title() for value in values_list]
+    # Join values with commas and 'and' before the last value
+    if len(formatted_list) > 1:
+        return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
+    elif formatted_list:
+        return formatted_list[0]
+    return "No values available"
+
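
For example, assuming the function above is in scope:

print(format_values_for_display(["new_york", "los angeles", "chicago"]))
# -> New York, Los Angeles, and Chicago
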
+# Function to normalize all data within files_dict to a daily granularity
+@st.cache(show_spinner=False, allow_output_mutation=True)
+def standardize_data_to_daily(files_dict, selections):
+    # Normalize all data to a daily granularity using a provided function
+    files_dict = apply_granularity_to_all(files_dict, "daily", selections)
+
+    # Update the "interval" attribute for each dataset to indicate the new granularity
+    for files_name, files_data in files_dict.items():
+        files_data["interval"] = "daily"
+
+    return files_dict
+
+
+# Function to apply granularity transformation to all DataFrames in files_dict
+@st.cache_resource(show_spinner=False)
+def apply_granularity_to_all(files_dict, granularity_selection, selections):
+    for file_name, file_data in files_dict.items():
+        df = file_data["df"].copy()
+
+        # Handling when DMA or Panel might be 'N/A'
+        selected_dma = selections[file_name].get("DMA")
+        selected_panel = selections[file_name].get("Panel")
+
+        # Correcting the segment selection logic & handling 'N/A'
+        if selected_dma != "N/A" and selected_panel != "N/A":
+            unique_combinations = df[[selected_dma, selected_panel]].drop_duplicates()
+        elif selected_dma != "N/A":
+            unique_combinations = df[[selected_dma]].drop_duplicates()
+            selected_panel = None  # Ensure Panel is ignored if N/A
+        elif selected_panel != "N/A":
+            unique_combinations = df[[selected_panel]].drop_duplicates()
+            selected_dma = None  # Ensure DMA is ignored if N/A
+        else:
+            # If both are 'N/A', process the entire dataframe as is
+            df = adjust_dataframe_granularity(
+                df, file_data["interval"], granularity_selection
             )
+            files_dict[file_name]["df"] = df
+            continue  # Skip to the next file
+
+        transformed_segments = []
+        for _, combo in unique_combinations.iterrows():
+            if selected_dma and selected_panel:
+                segment = df[
+                    (df[selected_dma] == combo[selected_dma])
+                    & (df[selected_panel] == combo[selected_panel])
+                ]
+            elif selected_dma:
+                segment = df[df[selected_dma] == combo[selected_dma]]
+            elif selected_panel:
+                segment = df[df[selected_panel] == combo[selected_panel]]
+
+            # Adjust granularity of the segment
+            transformed_segment = adjust_dataframe_granularity(
+                segment, file_data["interval"], granularity_selection
+            )
+            transformed_segments.append(transformed_segment)
+
+        # Combine all transformed segments into a single DataFrame for this file
+        transformed_df = pd.concat(transformed_segments, ignore_index=True)
+        files_dict[file_name]["df"] = transformed_df
+
+    return files_dict
+
+
+# Function to create main dataframe structure
+@st.cache_resource(show_spinner=False)
+def create_main_dataframe(
+    files_dict, all_dma_values, all_panel_values, granularity_selection
+):
+    # Determine the global start and end dates across all DataFrames
+    global_start = min(df["df"]["date"].min() for df in files_dict.values())
+    global_end = max(df["df"]["date"].max() for df in files_dict.values())
+
+    # Adjust the date_range generation based on the granularity_selection
+    if granularity_selection == "weekly":
+        # Generate a weekly range, with weeks starting on Monday
+        date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
+    elif granularity_selection == "monthly":
+        # Generate a monthly range, starting from the first day of each month
+        date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
+    else:  # Default to daily if not weekly or monthly
+        date_range = pd.date_range(start=global_start, end=global_end, freq="D")
+
+    # Collect all unique DMA and Panel values, excluding 'N/A'
+    all_dmas = all_dma_values
+    all_panels = all_panel_values
+
+    # Dynamically build the list of dimensions (Panel, DMA) to include in the main DataFrame based on availability
+    dimensions, merge_keys = [], []
+    if all_panels:
+        dimensions.append(all_panels)
+        merge_keys.append("Panel")
+    if all_dmas:
+        dimensions.append(all_dmas)
+        merge_keys.append("DMA")
+
+    dimensions.append(date_range)  # Date range is always included
+    merge_keys.append("date")  # Date range is always included
+
+    # Create a main DataFrame template with the dimensions
+    main_df = pd.MultiIndex.from_product(
+        dimensions,
+        names=[name for name, _ in zip(merge_keys, dimensions)],
+    ).to_frame(index=False)
+
+    return main_df.reset_index(drop=True)
+
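
A minimal sketch of the scaffold this produces; the panel and DMA values below are made up:

import pandas as pd

dimensions = [
    {"panel a"},                                       # all_panel_values
    {"ny", "la"},                                      # all_dma_values
    pd.date_range("2024-01-01", periods=2, freq="D"),  # date_range
]
grid = pd.MultiIndex.from_product(
    dimensions, names=["Panel", "DMA", "date"]
).to_frame(index=False)
# 1 panel x 2 DMAs x 2 dates -> 4 rows, one per combination,
# onto which each file is later left-joined
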
+# Function to prepare and merge DataFrames
+@st.cache_resource(show_spinner=False)
+def merge_into_main_df(main_df, files_dict, selections):
+    for file_name, file_data in files_dict.items():
+        df = file_data["df"].copy()
+
+        # Rename selected DMA and Panel columns if not 'N/A'
+        selected_dma = selections[file_name].get("DMA", "N/A")
+        selected_panel = selections[file_name].get("Panel", "N/A")
+        if selected_dma != "N/A":
+            df.rename(columns={selected_dma: "DMA"}, inplace=True)
+        if selected_panel != "N/A":
+            df.rename(columns={selected_panel: "Panel"}, inplace=True)
+
+        # Merge current DataFrame into main_df based on 'date', and where applicable, 'Panel' and 'DMA'
+        merge_keys = ["date"]
+        if "Panel" in df.columns:
+            merge_keys.append("Panel")
+        if "DMA" in df.columns:
+            merge_keys.append("DMA")
+        main_df = pd.merge(main_df, df, on=merge_keys, how="left")
+
+    # After all merges, sort by 'date' and reset index for cleanliness
+    sort_by = ["date"]
+    if "Panel" in main_df.columns:
+        sort_by.append("Panel")
+    if "DMA" in main_df.columns:
+        sort_by.append("DMA")
+    main_df.sort_values(by=sort_by, inplace=True)
+    main_df.reset_index(drop=True, inplace=True)
+
+    return main_df
+
+
+# Function to categorize column
+def categorize_column(column_name):
+    # Define keywords for each category
+    internal_keywords = [
+        "Price",
+        "Discount",
+        "product_price",
+        "cost",
+        "margin",
+        "inventory",
+        "sales",
+        "revenue",
+        "turnover",
+        "expense",
+    ]
+    exogenous_keywords = [
+        "GDP",
+        "Tax",
+        "Inflation",
+        "interest_rate",
+        "employment_rate",
+        "exchange_rate",
+        "consumer_spending",
+        "retail_sales",
+        "oil_prices",
+        "weather",
+    ]
+
+    # Check if the column name matches any of the keywords for Internal or Exogenous categories
+    for keyword in internal_keywords:
+        if keyword.lower() in column_name.lower():
+            return "Internal"
+    for keyword in exogenous_keywords:
+        if keyword.lower() in column_name.lower():
+            return "Exogenous"
+
+    # Default to Media if no match found
+    return "Media"
+
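
For example (the column names here are hypothetical):

print(categorize_column("product_price_index"))   # Internal  ("price" keyword)
print(categorize_column("gdp_growth"))            # Exogenous ("gdp" keyword)
print(categorize_column("facebook_impressions"))  # Media     (no keyword match)
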
+# Function to calculate missing stats and prepare for editable DataFrame
+@st.cache_resource(show_spinner=False)
+def prepare_missing_stats_df(df):
+    missing_stats = []
+    for column in df.columns:
+        if (
+            column == "date" or column == "DMA" or column == "Panel"
+        ):  # Skip Date, DMA and Panel column
+            continue
+
+        missing = df[column].isnull().sum()
+        pct_missing = round((missing / len(df)) * 100, 2)
+
+        # Dynamically assign category based on column name
+        # category = categorize_column(column)
+        category = "Media"
+
+        missing_stats.append(
+            {
+                "Column": column,
+                "Missing Values": missing,
+                "Missing Percentage": pct_missing,
+                "Impute Method": "Fill with 0",  # Default value
+                "Category": category,
+            }
+        )
+    stats_df = pd.DataFrame(missing_stats)
+
+    return stats_df
+
+
+# Function to add API DataFrame details to the files dictionary
+@st.cache_resource(show_spinner=False)
+def add_api_dataframe_to_dict(main_df, files_dict):
+    files_dict["API"] = {
+        "numeric": list(main_df.select_dtypes(include=["number"]).columns),
+        "non_numeric": [
+            col
+            for col in main_df.select_dtypes(exclude=["number"]).columns
+            if col.lower() != "date"
+        ],
+        "interval": determine_data_interval(
+            pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
+        ),
+        "df": main_df,
+    }
+
+    return files_dict
+
+
+# Function to read API data into a DataFrame, parsing specified columns as datetime
+@st.cache_resource(show_spinner=False)
+def read_API_data():
+    return pd.read_excel(r"upf_data_converted.xlsx", parse_dates=["Date"])
+
+
+# Function to set the 'DMA_Panel_Selected' session state variable to False
+def set_DMA_Panel_Selected_false():
+    st.session_state["DMA_Panel_Selected"] = False
+
+
+# Initialize 'final_df' in session state
+if "final_df" not in st.session_state:
+    st.session_state["final_df"] = pd.DataFrame()
+
+# Initialize 'bin_dict' in session state
+if "bin_dict" not in st.session_state:
+    st.session_state["bin_dict"] = {}
+
+# Initialize 'DMA_Panel_Selected' in session state
+if "DMA_Panel_Selected" not in st.session_state:
+    st.session_state["DMA_Panel_Selected"] = False
+
+# Page Title
+st.write("")  # Top padding
+st.title("Data Import")
+
+#########################################################################################################################################################
+# Create a dictionary to hold all DataFrames and collect user input to specify "DMA" and "Panel" columns for each file
+#########################################################################################################################################################
+
+# Read the Excel file, parsing 'Date' column as datetime
+main_df = read_API_data()
+
+# Convert all column names to lowercase
+main_df.columns = main_df.columns.str.lower().str.strip()
+
+# File uploader
+uploaded_files = st.file_uploader(
+    "Upload additional data",
+    type=["xlsx"],
+    accept_multiple_files=True,
+    on_change=set_DMA_Panel_Selected_false,
+)
+
+# Custom HTML for upload instructions
+recommendation_html = f"""
+<div style="text-align: justify;">
+    <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets including DMA, Panel, media, internal, and exogenous data adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
+</div>
+"""
+st.markdown(recommendation_html, unsafe_allow_html=True)
+
+# Choose Date Granularity
+st.markdown("#### Choose Date Granularity")
+# Granularity Selection
+granularity_selection = st.selectbox(
+    "Choose Date Granularity",
+    ["Daily", "Weekly", "Monthly"],
+    label_visibility="collapsed",
+    on_change=set_DMA_Panel_Selected_false,
+)
+granularity_selection = str(granularity_selection).lower()
+
+# Convert files to dataframes
+files_dict = files_to_dataframes(uploaded_files)
+
+# Add API Dataframe
+if main_df is not None:
+    files_dict = add_api_dataframe_to_dict(main_df, files_dict)
+
+# Display a warning message if no files have been uploaded and halt further execution
+if not files_dict:
+    st.warning(
+        "Please upload at least one file to proceed.",
+        icon="⚠️",
+    )
+    st.stop()  # Halts further execution until file is uploaded
+
+
+# Select DMA and Panel columns
+st.markdown("#### Select DMA and Panel columns")
+selections = {}
+with st.expander("Select DMA and Panel columns", expanded=False):
+    count = 0  # Initialize counter to manage the visibility of labels and keys
+    for file_name, file_data in files_dict.items():
+        # Determine visibility of the label based on the count
+        if count == 0:
+            label_visibility = "visible"
+        else:
+            label_visibility = "collapsed"
+
+        # Extract non-numeric columns
+        non_numeric_cols = file_data["non_numeric"]
+
+        # Prepare DMA and Panel values for dropdown, adding "N/A" as an option
+        dma_values = non_numeric_cols + ["N/A"]
+        panel_values = non_numeric_cols + ["N/A"]
+
+        # Skip if only one option is available
+        if len(dma_values) == 1 and len(panel_values) == 1:
+            selected_dma, selected_panel = "N/A", "N/A"
+            # Update the selections for DMA and Panel for the current file
+            selections[file_name] = {
+                "DMA": selected_dma,
+                "Panel": selected_panel,
+            }
+            continue
+
+        # Create layout columns for File Name, DMA, and Panel selections
+        file_name_col, DMA_col, Panel_col = st.columns([2, 4, 4])
+
+        with file_name_col:
+            # Display "File Name" label only for the first file
+            if count == 0:
+                st.write("File Name")
             else:
+                st.write("")
+            st.write(file_name)  # Display the file name
+
+        with DMA_col:
+            # Display a selectbox for DMA values
+            selected_dma = st.selectbox(
+                "Select DMA",
+                dma_values,
+                on_change=set_DMA_Panel_Selected_false,
+                label_visibility=label_visibility,  # Control visibility of the label
+                key=f"DMA_selectbox{count}",  # Ensure unique key for each selectbox
+            )
+
+        with Panel_col:
+            # Display a selectbox for Panel values
+            selected_panel = st.selectbox(
+                "Select Panel",
+                panel_values,
+                on_change=set_DMA_Panel_Selected_false,
+                label_visibility=label_visibility,  # Control visibility of the label
+                key=f"Panel_selectbox{count}",  # Ensure unique key for each selectbox
+            )
+
+        # Skip processing if the same column is selected for both Panel and DMA due to potential data integrity issues
+        if selected_panel == selected_dma and not (
+            selected_panel == "N/A" and selected_dma == "N/A"
+        ):
+            st.warning(
+                f"File: {file_name} → The same column cannot serve as both Panel and DMA. Please adjust your selections.",
+            )
+            selected_dma, selected_panel = "N/A", "N/A"
+            st.stop()
+
+        # Update the selections for DMA and Panel for the current file
+        selections[file_name] = {
+            "DMA": selected_dma,
+            "Panel": selected_panel,
+        }
+
+        count += 1  # Increment the counter after processing each file
+
+# Accept DMA and Panel selection
+if st.button("Accept and Process", use_container_width=True):
+
+    # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
+    with st.spinner("Processing..."):
+        files_dict = standardize_data_to_daily(files_dict, selections)
+
+        # Convert all data to the selected level of granularity
+        files_dict = apply_granularity_to_all(
+            files_dict, granularity_selection, selections
         )
+
+        st.session_state["files_dict"] = files_dict
+        st.session_state["DMA_Panel_Selected"] = True
+
+
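
The order here matters: every file is first flattened to daily, then rebuilt at the chosen granularity, so mixed weekly and monthly uploads land on one shared calendar. A sketch of the two passes using the helper above, assuming pandas is imported and the values are illustrative:

weekly = pd.DataFrame({"date": [pd.Timestamp("2024-01-01")], "spend": [700.0]})
daily = adjust_dataframe_granularity(weekly, "weekly", "daily")     # 7 rows of 100.0
monthly = adjust_dataframe_granularity(daily, "daily", "monthly")   # one 2024-01-01 row, spend 700.0
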
+#########################################################################################################################################################
+# Display unique DMA and Panel values
+#########################################################################################################################################################
+
+
+# Halts further execution until DMA and Panel columns are selected
+if "files_dict" in st.session_state and st.session_state["DMA_Panel_Selected"]:
+    files_dict = st.session_state["files_dict"]
+else:
+    st.stop()
+
+# Set to store unique values of DMA and Panel
+with st.spinner("Fetching DMA and Panel values..."):
+    all_dma_values, all_panel_values = clean_and_extract_unique_values(
+        files_dict, selections
+    )
+
+# List of DMA and Panel columns unique values
+list_of_all_dma_values = list(all_dma_values)
+list_of_all_panel_values = list(all_panel_values)
+
+# Format DMA and Panel values for display
+formatted_dma_values = format_values_for_display(list_of_all_dma_values)
+formatted_panel_values = format_values_for_display(list_of_all_panel_values)
+
+# Unique DMA and Panel values
+st.markdown("#### Unique DMA and Panel values")
+# Display DMA and Panel values
+with st.expander("Unique DMA and Panel values"):
+    st.write("")
+    st.markdown(
+        f"""
+        <style>
+        .justify-text {{
+            text-align: justify;
+        }}
+        </style>
+        <div class="justify-text">
+            <strong>Panel Values:</strong> {formatted_panel_values}<br>
+            <strong>DMA Values:</strong> {formatted_dma_values}
+        </div>
+        """,
+        unsafe_allow_html=True,
     )
 
+    # Display total DMA and Panel
+    st.write("")
+    st.markdown(
+        f"""
+        <div style="text-align: justify;">
+            <strong>Number of DMAs detected:</strong> {len(list_of_all_dma_values)}<br>
+            <strong>Number of Panels detected:</strong> {len(list_of_all_panel_values)}
+        </div>
+        """,
+        unsafe_allow_html=True,
+    )
+st.write("")
+
+#########################################################################################################################################################
+# Merge all DataFrames
+#########################################################################################################################################################
+
+# Merge all DataFrames selected
+main_df = create_main_dataframe(
+    files_dict, all_dma_values, all_panel_values, granularity_selection
+)
+merged_df = merge_into_main_df(main_df, files_dict, selections)
+
+# # Display the merged DataFrame
+# st.markdown("#### Merged DataFrame based on selected DMA and Panel")
+# st.dataframe(merged_df)
+
+
+#########################################################################################################################################################
+# Categorize Variables and Impute Missing Values
+#########################################################################################################################################################
+
+
+# Create an editable DataFrame in Streamlit
+st.markdown("#### Select Variables Category & Impute Missing Values")
+
+# Prepare missing stats DataFrame for editing
+missing_stats_df = prepare_missing_stats_df(merged_df)
+
+edited_stats_df = st.data_editor(
+    missing_stats_df,
+    column_config={
+        "Impute Method": st.column_config.SelectboxColumn(
+            options=[
+                "Drop Column",
+                "Fill with Mean",
+                "Fill with Median",
+                "Fill with 0",
+            ],
+            required=True,
+            default="Fill with 0",
+        ),
+        "Category": st.column_config.SelectboxColumn(
+            options=[
+                "Media",
+                "Exogenous",
+                "Internal",
+                "Response_Metric"
+            ],
+            required=True,
+            default="Media",
+        ),
+    },
+    disabled=["Column", "Missing Values", "Missing Percentage"],
+    hide_index=True,
+    use_container_width=True,
+)
+
+# Apply changes based on edited DataFrame
+for i, row in edited_stats_df.iterrows():
+    column = row["Column"]
+    if row["Impute Method"] == "Drop Column":
+        merged_df.drop(columns=[column], inplace=True)
+
+    elif row["Impute Method"] == "Fill with Mean":
+        merged_df[column].fillna(merged_df[column].mean(), inplace=True)
+
+    elif row["Impute Method"] == "Fill with Median":
+        merged_df[column].fillna(merged_df[column].median(), inplace=True)
+
+    elif row["Impute Method"] == "Fill with 0":
+        merged_df[column].fillna(0, inplace=True)
+
+# Display the Final DataFrame and exogenous variables
+st.markdown("#### Final DataFrame")
+final_df = merged_df
+st.dataframe(final_df, hide_index=True)
+
+# Initialize an empty dictionary to hold categories and their variables
+category_dict = {}
+
+# Iterate over each row in the edited DataFrame to populate the dictionary
+for i, row in edited_stats_df.iterrows():
+    column = row["Column"]
+    category = row["Category"]  # The category chosen by the user for this variable
+
+    # Check if the category already exists in the dictionary
+    if category not in category_dict:
+        # If not, initialize it with the current column as its first element
+        category_dict[category] = [column]
+    else:
+        # If it exists, append the current column to the list of variables under this category
+        category_dict[category].append(column)
+
+# Add Date, DMA and Panel in category dictionary
+category_dict.update({"Date": ["date"]})
+if "DMA" in final_df.columns:
+    category_dict["DMA"] = ["DMA"]
+
+if "Panel" in final_df.columns:
+    category_dict["Panel"] = ["Panel"]
+
+# Display the dictionary
+st.markdown("#### Variable Category")
+for category, variables in category_dict.items():
+    # Check if there are multiple variables to handle "and" insertion correctly
+    if len(variables) > 1:
+        # Join all but the last variable with ", ", then add " and " before the last variable
+        variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
+    else:
+        # If there's only one variable, no need for "and"
+        variables_str = variables[0]
+
+    # Display the category and its variables in the desired format
+    st.markdown(
+        f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
+        unsafe_allow_html=True,
+    )
+
+# Store final dataframe and bin dictionary into session state
+st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict
+
+if st.button('Save Changes'):
+
+    with open("Pickle_files/main_df", 'wb') as f:
+        pickle.dump(st.session_state["final_df"], f)
+    with open("Pickle_files/category_dict", 'wb') as c:
+        pickle.dump(st.session_state["bin_dict"], c)
+    st.success('Changes Saved!')
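
A quick sketch of reading these pickles back on a later page, assuming the same relative paths:

import pickle

with open("Pickle_files/main_df", "rb") as f:
    final_df = pickle.load(f)
with open("Pickle_files/category_dict", "rb") as c:
    bin_dict = pickle.load(c)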