|
import streamlit as st |
|
import pandas as pd |
|
from Eda_functions import * |
|
import numpy as np |
|
import pickle |
|
import streamlit as st |
|
from utilities import set_header, load_local_css,update_db,project_selection |
|
import os |
|
import tempfile |
|
import sqlite3 |
|
from utilities import update_db |
|
import re |
|
|
|
# Page chrome: wide layout with the sidebar collapsed, plus shared CSS/header.
_PAGE_SETTINGS = {
    "page_title": "Data Validation",
    "page_icon": ":shark:",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**_PAGE_SETTINGS)

load_local_css("styles.css")
set_header()
|
|
|
# Make sure the keys this page reads always exist in the session.
for _key in ("username", "project_name"):
    if _key not in st.session_state:
        st.session_state[_key] = None

# No project loaded yet -> send the user through project selection.
if "project_dct" not in st.session_state:
    project_selection()

# Without a project path nothing below can run; halt the script.
if "project_path" not in st.session_state:
    st.stop()
|
|
|
# Guard clause: the rest of the page requires a logged-in user
# ("username" is guaranteed to exist by the initialisation above).
if st.session_state["username"] is None:
    st.stop()

data_path = os.path.join(st.session_state["project_path"], "data_import.pkl")

try:
    # Pickled output of the Data Import page; validation cannot run without it.
    with open(data_path, "rb") as f:
        data = pickle.load(f)
except Exception:
    # Missing/unreadable pickle means the import step was skipped.
    # (Original used an f-string with no placeholders and an unused `as e`.)
    st.error("Please import data from the Data Import Page")
    st.stop()

# NOTE(review): os.path.join keeps the DB path portable; the original
# hard-coded a Windows-only separator (r"DB\User.db").
conn = sqlite3.connect(os.path.join("DB", "User.db"), check_same_thread=False)
c = conn.cursor()

st.session_state["cleaned_data"] = data["final_df"]
st.session_state["category_dict"] = data["bin_dict"]
|
|
|
# Welcome banner: user on the left, current project on the right.
welcome_col, project_col = st.columns([2, 1])

with welcome_col:
    st.markdown(f"**Welcome {st.session_state['username']}**")

with project_col:
    st.markdown(
        f"**Current Project: {st.session_state['project_name']}**"
    )

st.title("Data Validation and Insights")
|
|
|
|
|
# Collect the column list filed under "Response Metrics" (yields a list
# containing at most one inner list; unwrapped further below).
target_variables = [
    columns
    for category, columns in st.session_state["category_dict"].items()
    if category == "Response Metrics"
]
|
|
|
|
|
def format_display(inp):
    """Render a raw column name as a human-readable label.

    Title-cases the name, swaps underscores for spaces, and trims
    surrounding whitespace, e.g. "media_spend" -> "Media Spend".
    """
    label = inp.title()
    label = label.replace("_", " ")
    return label.strip()
|
|
|
|
|
# Unwrap the single inner list produced above into a flat list of names.
target_variables = list(*target_variables)

# Alias for the per-page persisted settings (same dict object, so writes stick).
validation_settings = st.session_state["project_dct"]["data_validation"]

target_column = st.selectbox(
    "Select the Target Feature/Dependent Variable (will be used in all charts as reference)",
    target_variables,
    index=validation_settings["target_column"],
    format_func=format_display,
)

# Persist the choice as an index so it can seed the widget next session.
validation_settings["target_column"] = target_variables.index(target_column)

st.session_state["target_column"] = target_column
|
|
|
|
|
|
|
|
|
# Ensure a panel column exists: data without panels gets a single dummy
# panel "1" and the panel picker below is disabled (nothing to choose).
if "Panel_1" not in st.session_state["cleaned_data"].columns:
    st.session_state["cleaned_data"]["Panel_1"] = ["1"] * len(
        st.session_state["cleaned_data"]
    )
    # (Original also bound the column to an unused local `panels`; removed.)
    disable = True
else:
    disable = False
|
|
|
|
|
# Panel filter; disabled when only the dummy panel exists.
panel_options = st.session_state["cleaned_data"]["Panel_1"].unique()
selected_panels = st.multiselect(
    "Please choose the panels you wish to analyze.If no panels are selected, insights will be derived from the overall data.",
    panel_options,
    default=st.session_state["project_dct"]["data_validation"]["selected_panels"],
    disabled=disable,
)

# Persist the selection for the next session.
st.session_state["project_dct"]["data_validation"]["selected_panels"] = selected_panels
|
|
|
# Per-column aggregation plan for the date-level rollup: media columns are
# summed, everything else averaged. Only columns actually present in the
# cleaned data are kept; "date" and "Panel_1" are grouping keys, not metrics.
aggregation_dict = {}
for category, columns in st.session_state["category_dict"].items():
    how = "sum" if category == "Media" else "mean"
    for column in columns:
        if column in ("date", "Panel_1"):
            continue
        if column in st.session_state["cleaned_data"].columns:
            aggregation_dict[column] = how
|
|
|
|
|
# Fixed user-facing typo: "Reponse" -> "Response".
with st.expander("**Response Metric Analysis**"):
    # Aggregate to one row per date, optionally restricted to the chosen
    # panels. (Original duplicated the groupby/reset_index in both branches.)
    if len(selected_panels) > 0:
        panel_df = st.session_state["cleaned_data"][
            st.session_state["cleaned_data"]["Panel_1"].isin(selected_panels)
        ]
    else:
        panel_df = st.session_state["cleaned_data"]

    st.session_state["Cleaned_data_panel"] = (
        panel_df.groupby(by="date").agg(aggregation_dict).reset_index()
    )

    fig = line_plot_target(
        st.session_state["Cleaned_data_panel"],
        target=target_column,
        title=f"{target_column} Over Time",
    )
    st.plotly_chart(fig, use_container_width=True)
|
|
|
def _category_columns(category):
    """Return the column list stored under *category* (empty when absent).

    Mirrors the original `list(*[...])` unwrapping: dict keys are unique,
    so at most one inner list is splatted into list().
    """
    matches = [
        columns
        for key, columns in st.session_state["category_dict"].items()
        if key == category
    ]
    return list(*matches)


media_channel = _category_columns("Media")
spends_features = _category_columns("Spends")
exo_var = _category_columns("Exogenous")
internal_var = _category_columns("Internal")

# Everything that is neither media nor spends, analysed in its own expander.
Non_media_variables = exo_var + internal_var
|
|
|
st.markdown("### Annual Data Summary")

# Year-level summary of media, spends and the target metric.
summary_columns = media_channel + [target_column] + spends_features
summary_df = summary(
    st.session_state["Cleaned_data_panel"],
    summary_columns,
    spends=None,
    Target=True,
)

st.dataframe(
    summary_df.sort_index(axis=1),
    use_container_width=True,
)
|
|
|
if st.checkbox("Show raw data"):
    # NOTE(review): the original executed `st.cache_resource(show_spinner=False)`
    # as a bare statement — that builds a decorator and discards it, a no-op.
    # It is removed rather than applied as a decorator: caching a zero-arg
    # reader of mutable session state would serve stale data after the panel
    # selection changes.
    def raw_df_gen():
        """Return Cleaned_data_panel sorted by date with display-formatted numbers."""
        dates = pd.to_datetime(st.session_state["Cleaned_data_panel"]["date"])

        raw_df = pd.concat(
            [
                dates,
                st.session_state["Cleaned_data_panel"]
                .select_dtypes(np.number)
                .applymap(format_numbers),
            ],
            axis=1,
        )

        sorted_raw_df = raw_df.sort_values(by="date", ascending=True)

        # Render dates as mm/dd/yyyy strings for display.
        sorted_raw_df["date"] = sorted_raw_df["date"].dt.strftime("%m/%d/%Y")

        return sorted_raw_df

    st.dataframe(raw_df_gen())
|
|
|
# Layout slot kept for parity with the original page structure.
col1 = st.columns(1)

# Default for the media feature picker below.
if "selected_feature" not in st.session_state:
    st.session_state["selected_feature"] = None
|
|
|
|
|
with st.expander("Media Variables Analysis"):

    # Pick the media / spends column to inspect against the target metric.
    st.session_state["selected_feature"] = st.selectbox(
        "Select media", media_channel + spends_features, format_func=format_display
    )

    spends_col = st.columns(2)

    # Auto-match a spends column whose name stem (text before "_cost"/"_spend")
    # occurs in the selected feature's name.
    spends_feature = [
        col
        for col in spends_features
        if re.split(r"_cost|_spend", col.lower())[0]
        in st.session_state["selected_feature"]
    ]

    with spends_col[0]:
        if len(spends_feature) == 0:
            st.warning("No 'spends' variable available for the selected metric in the data. Please ensure the columns are properly named.or select them in the provided selecttion box")
        else:
            st.write(f'Selected "{spends_feature[0]}" as the corresponding spends variable automatically. If this is incorrect, please click the checkbox to change the variable.')

    with spends_col[1]:
        # Manual override; forced when no automatic match was found so that
        # spends_feature[0] below is always defined.
        if len(spends_feature) == 0 or st.checkbox('Select "Spends" variable for CPM and CPC calculation'):
            spends_feature = [st.selectbox('Spends variable', spends_features)]

    if "validation" not in st.session_state:
        # Seed the working validation list from the persisted project settings.
        st.session_state["validation"] = st.session_state["project_dct"][
            "data_validation"
        ]["validated_variables"]

    val_variables = [col for col in media_channel if col != "date"]

    if not set(
        st.session_state["project_dct"]["data_validation"]["validated_variables"]
    ).issubset(set(val_variables)):
        # Persisted validation references columns no longer in the data: reset.
        # NOTE(review): original if/else pairing reconstructed — plots are
        # skipped on the run where a stale validation list is detected.
        st.session_state["validation"] = []
    else:
        fig_row1 = line_plot(
            st.session_state["Cleaned_data_panel"],
            x_col="date",
            y1_cols=[st.session_state["selected_feature"]],
            y2_cols=[target_column],
            title=f'Analysis of {st.session_state["selected_feature"]} and {[target_column][0]} Over Time',
        )
        st.plotly_chart(fig_row1, use_container_width=True)
        st.markdown("### Summary")
        st.dataframe(
            summary(
                st.session_state["cleaned_data"],
                [st.session_state["selected_feature"]],
                spends=spends_feature[0],
            ),
            use_container_width=True,
        )

    cols2 = st.columns(2)

    # Disable the validation controls once every media variable is validated.
    # (Renamed `help` -> `help_text`: the original shadowed the builtin.)
    if len(set(st.session_state["validation"]).intersection(val_variables)) == len(
        val_variables
    ):
        disable = True
        help_text = "All media variables are validated"
    else:
        disable = False
        help_text = ""

    with cols2[0]:
        if st.button("Validate", disabled=disable, help=help_text):
            # Append only if missing so repeated clicks/reruns do not grow
            # the list with duplicates (the original appended unconditionally).
            if st.session_state["selected_feature"] not in st.session_state["validation"]:
                st.session_state["validation"].append(
                    st.session_state["selected_feature"]
                )
    with cols2[1]:
        if st.checkbox("Validate all", disabled=disable, help=help_text):
            # Add only the missing variables — the original extend() duplicated
            # the whole list on every rerun while the checkbox stayed ticked.
            st.session_state["validation"].extend(
                col
                for col in val_variables
                if col not in st.session_state["validation"]
            )
            st.success("All media variables are validated ✅")

    # Until everything is validated, offer a checkbox table for bulk editing.
    if len(set(st.session_state["validation"]).intersection(val_variables)) != len(
        val_variables
    ):
        validation_data = pd.DataFrame(
            {
                "Validate": [
                    col in st.session_state["validation"] for col in val_variables
                ],
                "Variables": val_variables,
            }
        )

        sorted_validation_df = validation_data.sort_values(
            by="Variables", ascending=True, na_position="first"
        )
        cols3 = st.columns([1, 30])
        with cols3[1]:
            validation_df = st.data_editor(
                sorted_validation_df,
                column_config={
                    "Validate": st.column_config.CheckboxColumn(
                        default=False,
                        width=100,
                    ),
                    "Variables": st.column_config.TextColumn(width=1000),
                },
                hide_index=True,
            )

        selected_rows = validation_df[validation_df["Validate"] == True][
            "Variables"
        ]

        # De-duplicated merge of the editor's ticks into the working list.
        st.session_state["validation"].extend(
            row
            for row in selected_rows
            if row not in st.session_state["validation"]
        )

    # Persist the current validation state for the Save Changes button.
    st.session_state["project_dct"]["data_validation"][
        "validated_variables"
    ] = st.session_state["validation"]

    not_validated_variables = [
        col
        for col in val_variables
        if col not in st.session_state["validation"]
    ]

    if not_validated_variables:
        not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}'
        st.warning(not_validated_message)
|
|
|
|
|
with st.expander("Non Media Variables Analysis"):
    if len(Non_media_variables) == 0:
        st.warning('Non media variables not present')
    else:
        selected_columns_row4 = st.selectbox(
            "Select Channel",
            Non_media_variables,
            format_func=format_display,
            index=st.session_state["project_dct"]["data_validation"]["Non_media_variables"],
        )

        # Persist the pick (as an index) so the widget is pre-seeded next time.
        st.session_state["project_dct"]["data_validation"]["Non_media_variables"] = (
            Non_media_variables.index(selected_columns_row4)
        )

        fig_row4 = line_plot(
            st.session_state["Cleaned_data_panel"],
            x_col="date",
            y1_cols=[selected_columns_row4],
            y2_cols=[target_column],
            title=f"Analysis of {selected_columns_row4} and {target_column} Over Time",
        )
        st.plotly_chart(fig_row4, use_container_width=True)

        selected_non_media = selected_columns_row4
        # .copy(): the original mutated a column slice of Cleaned_data_panel,
        # which risks a pandas SettingWithCopyWarning / silent no-op write.
        sum_df = st.session_state["Cleaned_data_panel"][
            ["date", selected_non_media, target_column]
        ].copy()
        sum_df["Year"] = pd.to_datetime(
            st.session_state["Cleaned_data_panel"]["date"]
        ).dt.year

        # (Removed a leftover debug `print(sum_df)` from the original.)
        sum_df = sum_df.drop("date", axis=1).groupby("Year").agg("sum")
        sum_df.loc["Grand Total"] = sum_df.sum()
        sum_df = sum_df.applymap(format_numbers)
        sum_df.fillna("-", inplace=True)
        sum_df = sum_df.replace({"0.0": "-", "nan": "-"})
        st.markdown("### Summary")
        st.dataframe(sum_df, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with st.expander("Correlation Analysis"):
    # Numeric columns are the candidates; the target itself is excluded from
    # the picker because it is always used as the reference variable.
    options = list(
        st.session_state["Cleaned_data_panel"].select_dtypes(np.number).columns
    )
    choices = [var for var in options if var != target_column]

    # Original used `default=options[3]`, which raised IndexError with fewer
    # than four numeric columns and was invalid whenever options[3] was the
    # target (a multiselect default must be one of the offered choices).
    default_choice = choices[3] if len(choices) > 3 else None

    selected_options = st.multiselect(
        "Select Variables For correlation plot",
        choices,
        default=default_choice,
    )

    st.pyplot(
        correlation_plot(
            st.session_state["Cleaned_data_panel"],
            selected_options,
            target_column,
        )
    )
|
|
|
if st.button("Save Changes", use_container_width=True):
    # Record the page visit/state in the user DB.
    update_db("2_Data_Validation.py")

    # Persist all per-page settings gathered above.
    dct_file = os.path.join(st.session_state["project_path"], "project_dct.pkl")
    with open(dct_file, "wb") as f:
        pickle.dump(st.session_state["project_dct"], f)

    st.success("Changes saved")
|
|