# Import required libraries
import os
import pickle

import streamlit as st

st.set_page_config(
    page_title="Model Build",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state="collapsed",
)

from utilities import load_authenticator, set_header, load_local_css
import numpy as np
import pandas as pd

load_local_css("styles.css")
set_header()


# Re-assign each key to itself so widget state persists across Streamlit page
# switches; auth-related keys and form-submit flags are left alone
for k, v in st.session_state.items():
    if k not in ['logout', 'login', 'config'] and not k.startswith('FormSubmitter'):
        st.session_state[k] = v

authenticator = st.session_state.get('authenticator')
if authenticator is None:
    authenticator = load_authenticator()

name, authentication_status, username = authenticator.login('Login', 'main')
auth_status = st.session_state.get('authentication_status')

if auth_status:
    # Mark the session as initialized on the first authenticated run
    if not st.session_state.get('initialized', False):
        st.session_state['initialized'] = True

    # Function to expand dataframe to daily
    # st.cache_data (rather than st.cache_resource) is the appropriate cache
    # for a function that returns a DataFrame
    @st.cache_data(show_spinner=False)
    def expand_to_daily(df, granularity, start_date, end_date):
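        """Expand a daily, weekly, or monthly DataFrame to one row per day.

        Weekly/monthly values are summed per period and spread evenly across
        the days observed within that period; days with no source data
        remain NaN.
        """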
        # Create a new DataFrame with a row for each day
        all_dates = pd.date_range(start=start_date, end=end_date, freq="D")
        daily_df = pd.DataFrame(all_dates, columns=["Date"])

        if granularity == "daily":
            # For daily data, simply merge to fill missing dates
            daily_df = daily_df.merge(df, on="Date", how="left")
        else:
            # For weekly or monthly, distribute values to daily rows
            for column in df.columns:
                if column != "Date":  # Skip 'Date' column
                    daily_df[column] = np.nan  # Initialize with NaNs

            # Group by the required frequency and distribute values
            freq = "W-MON" if granularity == "weekly" else "MS"
            for _, group in df.groupby(pd.Grouper(key="Date", freq=freq)):
                num_days = len(
                    pd.date_range(group["Date"].min(), group["Date"].max(), freq="D")
                )
                for column in group.columns:
                    if column == "Date":  # Skip 'Date' column
                        continue
                    value = group[column].sum() / num_days
                    date_range = pd.date_range(
                        group["Date"].min(), periods=num_days, freq="D"
                    )
                    daily_df.loc[daily_df["Date"].isin(date_range), column] = value

        return daily_df


    # Function to validate date column in dataframe
    def validate_date_column(df):
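        """Return True if the 'Date' column parses as DD-MM-YYYY, else False."""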
        try:
            # Attempt to convert the 'Date' column to datetime
            df["Date"] = pd.to_datetime(df["Date"], format="%d-%m-%Y")
            return True
        except Exception:
            return False


    # Function to determine data interval
    def determine_data_interval(common_freq):
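        """Map the most common day-gap between rows to a granularity label."""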
        if common_freq == 1:
            return "daily"
        elif common_freq == 7:
            return "weekly"
        elif 28 <= common_freq <= 31:
            return "monthly"
        else:
            return "irregular"


    # Function to convert and fill dates in dataframe
    def convert_and_fill_dates(df, start_date, end_date, interval):
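        """Aggregate df at its native interval, redistribute values evenly to
        daily rows, and reindex to cover start_date..end_date (gaps -> NaN).
        """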
        # Create a date range for the desired period
        all_dates = pd.date_range(start=start_date, end=end_date, freq="D")
        new_df = pd.DataFrame(all_dates, columns=["Date"])

        # Preprocess and aggregate data based on the original interval
        if interval != "daily":
            # Resample to start of each week/month, then sum values for the same period
            if interval == "weekly":
                df = df.resample("W-MON", on="Date").sum().reset_index()
            elif interval == "monthly":
                df = df.resample("MS", on="Date").sum().reset_index()

            # Distribute values equally across the days in each week/month
            expanded_rows = []
            for _, row in df.iterrows():
                if interval == "weekly":
                    period_dates = pd.date_range(row["Date"], periods=7)
                elif interval == "monthly":
                    period_end = row["Date"] + pd.offsets.MonthEnd(1)
                    period_dates = pd.date_range(row["Date"], period_end)

                for date in period_dates:
                    new_row = row.copy()
                    new_row["Date"] = date
                    for col in df.columns:
                        if col != "Date":  # Skip 'Date' column
                            new_row[col] = row[col] / len(period_dates)
                    expanded_rows.append(new_row)

            # Create a DataFrame from expanded rows
            expanded_df = pd.DataFrame(expanded_rows)
            new_df = pd.merge(new_df, expanded_df, how="left", on="Date")
        else:
            # Daily data, aggregate if there are multiple entries for the same day
            df = df.groupby("Date").sum().reset_index()
            new_df = pd.merge(new_df, df, how="left", on="Date")

        # Ensure all dates from start to end are present, filling missing values with NaN
        new_df["Date"] = pd.to_datetime(new_df["Date"])  # Ensure 'Date' is datetime type
        new_df = new_df.set_index("Date").reindex(all_dates).reset_index()
        new_df.rename(columns={"index": "Date"}, inplace=True)

        return new_df


    # Function to convert a DataFrame from daily level granularity to either weekly or monthly level
    def convert_to_higher_granularity(df, required_granularity):
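        """Resample a daily DataFrame up to weekly ('W-MON') or monthly ('MS') sums."""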
        if required_granularity == "daily":
            return df

        # Ensure 'Date' is the index and is in datetime format
        if not pd.api.types.is_datetime64_any_dtype(df["Date"]):
            df["Date"] = pd.to_datetime(df["Date"])
        df.set_index("Date", inplace=True)

        # Resample and aggregate
        if required_granularity == "weekly":
            # Resample to weekly, using 'W-MON' to indicate weeks starting on Monday
            df = df.resample("W-MON").sum()
        elif required_granularity == "monthly":
            # Resample to monthly, using 'MS' to indicate month start
            df = df.resample("MS").sum()

        # Reset index to move 'Date' back to a column
        df.reset_index(inplace=True)

        return df


    # Read the CSV file, parsing the 'Date' column as datetime
    main_df = pd.read_csv(
        "Media_data_for_model_dma_level.csv", dayfirst=True, parse_dates=["Date"]
    )

    # Get the start date (minimum) and end date (maximum) from the 'Date' column
    api_start_date = main_df["Date"].min()
    api_end_date = main_df["Date"].max()

    # Infer the granularity from the most common difference between consecutive dates
    date_diffs = main_df["Date"].diff().dt.days.dropna()
    common_diff = date_diffs.mode()[0]
    api_granularity = determine_data_interval(common_diff)

    # Convert the DataFrame to daily level granularity
    main_df = expand_to_daily(main_df, api_granularity, api_start_date, api_end_date)

    # Page Title
    st.title("Data Import")

    # File uploader
    uploaded_files = st.file_uploader(
        "Upload additional data", type=["xlsx"], accept_multiple_files=True
    )

    # Custom HTML for upload instructions
    recommendation_html = f"""
    <div style="text-align: justify;">
    <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets including media, internal, and exogenous data adhere to the following guidelines: Each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code>, be free of missing values, and aggregated to a {api_granularity} level.
    </div>
    """

    st.markdown(recommendation_html, unsafe_allow_html=True)

    # Initialize a list to collect all processed DataFrames
    all_data_dfs = []

    if uploaded_files:
        for uploaded_file in uploaded_files:
            # Extract the file name
            file_name = uploaded_file.name

            # Load the file into a DataFrame
            data_df = pd.read_excel(uploaded_file)

            # Identify numeric columns in the DataFrame
            numeric_columns = data_df.select_dtypes(include="number").columns.tolist()

            # Validate the 'Date' column and ensure there's at least one numeric column
            if validate_date_column(data_df) and len(numeric_columns) > 0:
                data_df = data_df[["Date"] + numeric_columns]

                # Ensure the 'Date' column is in datetime format and sorted
                data_df["Date"] = pd.to_datetime(data_df["Date"], dayfirst=True)
                data_df.sort_values("Date", inplace=True)

                # Calculate the most common day difference between dates to determine frequency
                common_freq = data_df["Date"].diff().dt.days.dropna().mode()[0]

                # Calculate the data interval (daily, weekly, monthly or irregular)
                interval = determine_data_interval(common_freq)

                if interval == "irregular":
                    # Warn the user if the 'Date' column doesn't meet the format requirements
                    st.warning(
                        f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval."
                    )
                    continue

                # Convert data to specified interval and redistribute to daily
                data_df = convert_and_fill_dates(
                    data_df, api_start_date, api_end_date, interval
                )

                # Add the processed DataFrame to the list
                all_data_dfs.append(data_df)

            else:
                # Warn the user if the 'Date' column doesn't meet the format requirements
                st.warning(
                    f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column."
                )

    # Sequentially merge each of the other DataFrames with the main DataFrame on 'Date'
    for df in all_data_dfs:
        main_df = pd.merge(main_df, df, on="Date", how="left")


    # Function to calculate missing stats and prepare for editable DataFrame
    def prepare_missing_stats_df(df):
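        """Build per-column missing-value stats with default impute/category choices."""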
        missing_stats = []
        for column in df.columns:
            if (
                column == "Date" or column == "Total Approved Accounts - Revenue"
            ):  # Skip Date and Revenue column
                continue

            missing = df[column].isnull().sum()
            pct_missing = round((missing / len(df)) * 100, 2)
            missing_stats.append(
                {
                    "Column": column,
                    "Missing Values": missing,
                    "Missing Percentage": pct_missing,
                    "Impute Method": "Fill with 0",  # Default value
                    "Category": "Media",  # Default value
                }
            )
        stats_df = pd.DataFrame(missing_stats)
        return stats_df


    # Prepare missing stats DataFrame for editing
    missing_stats_df = prepare_missing_stats_df(main_df)

    # Create an editable DataFrame in Streamlit
    st.markdown("#### Select Variables Category & Impute Missing Values")

    edited_stats_df = st.data_editor(
        missing_stats_df,
        column_config={
            "Impute Method": st.column_config.SelectboxColumn(
                options=[
                    "Drop Column",
                    "Fill with Mean",
                    "Fill with Median",
                    "Fill with 0",
                ],
                required=True,
                default="Fill with 0",
            ),
            "Category": st.column_config.SelectboxColumn(
                options=[
                    "Date",
                    "Media",
                    "Exogenous",
                    "Internal",
                    "DMA/Panel",
                    "Response_Metric"
                ],
                required=True,
                default="Media",
            ),
        },
        disabled=["Column", "Missing Values", "Missing Percentage"],
        hide_index=True,
        use_container_width=True,
    )


    # Apply the user's imputation choices from the edited DataFrame
    for i, row in edited_stats_df.iterrows():
        column = row["Column"]
        if row["Impute Method"] == "Drop Column":
            main_df.drop(columns=[column], inplace=True)
        elif row["Impute Method"] == "Fill with Mean":
            main_df[column] = main_df[column].fillna(main_df[column].mean())
        elif row["Impute Method"] == "Fill with Median":
            main_df[column] = main_df[column].fillna(main_df[column].median())
        elif row["Impute Method"] == "Fill with 0":
            main_df[column] = main_df[column].fillna(0)


    # Convert the Final DataFrame to required granularity
    main_df = convert_to_higher_granularity(main_df, api_granularity)

    # Display the Final DataFrame and exogenous variables
    st.markdown("#### Final DataFrame:")
    st.dataframe(main_df)
    
    

    # Map each user-selected category to the list of variables assigned to it
    category_dict = {}
    for _, row in edited_stats_df.iterrows():
        category_dict.setdefault(row["Category"], []).append(row["Column"])

    # Display the dictionary
    st.markdown("#### Variable Category:")
    for category, variables in category_dict.items():
        # Check if there are multiple variables to handle "and" insertion correctly
        if len(variables) > 1:
            # Join all but the last variable with ", ", then add " and " before the last variable
            variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
        else:
            # If there's only one variable, no need for "and"
            variables_str = variables[0]

        # Display the category and its variables in the desired format
        st.markdown(f"**{category}:** {variables_str}\n\n", unsafe_allow_html=True)

    # Persist the cleaned DataFrame and category mapping when the user saves
    if st.button('Save Changes'):
        # Create the output directory if it does not already exist
        os.makedirs("Pickle_files", exist_ok=True)
        with open("Pickle_files/main_df", 'wb') as f:
            pickle.dump(main_df, f)
        with open("Pickle_files/category_dict", 'wb') as c:
            pickle.dump(category_dict, c)
        st.success('Changes Saved!')