# JP Hwang — "First working version" (commit f1a15ae)
# ========== (c) JP Hwang 30/9/2022 ==========
import logging
import pandas as pd
from pathlib import Path
import streamlit as st
import app
from sklearn.decomposition import PCA, FastICA, LatentDirichletAllocation
from sklearn.manifold import TSNE
import plotly.express as px
# ===== SET UP LOGGER =====
logger = logging.getLogger(__name__)

# Configure the root logger so every module's output reaches the console
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
)
root_logger.addHandler(stream_handler)
# ===== END LOGGER SETUP =====

# Widen pandas console output for easier debugging of DataFrames
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 320)

# Marker sizes; sizes[0] is the default applied to every plotted point
sizes = [1, 20, 30]
def preproc_data(datadir=None, default_size=1):
    """Load the colors dataset and derive the columns used for plotting.

    Args:
        datadir: Directory containing ``colors.csv``; defaults to ``data/``.
        default_size: Marker size assigned to every row (matches ``sizes[0]``).

    Returns:
        DataFrame with the raw CSV columns plus ``rgb`` (CSS color string),
        ``category`` (last underscore-separated token of ``simple_name``)
        and ``size``.
    """
    if datadir is None:
        datadir = Path('data')
    df = pd.read_csv(datadir / 'colors.csv',
                     names=['simple_name', 'name', 'hex', 'r', 'g', 'b'])
    # Preprocessing: vectorized string build — much faster than row-wise .apply
    df['rgb'] = ('rgb(' + df['r'].astype(str) + ', '
                 + df['g'].astype(str) + ', ' + df['b'].astype(str) + ')')
    # Get top 'basic' color names, e.g. 'dark_red' -> 'red'
    df['category'] = df['simple_name'].str.split('_').str[-1]
    # Set default size attribute
    df['size'] = default_size
    return df
def build_chart(df_in):
    """Render the color dataset as a 3-D RGB scatter plot.

    Args:
        df_in: DataFrame with 'r', 'g', 'b', 'simple_name', 'rgb', 'size'
            and 'name' columns (as produced by preproc_data).

    Returns:
        A plotly Figure with one marker per color, colored as itself.
    """
    scatter_kwargs = dict(
        template='plotly_white',
        color=df_in['simple_name'],
        color_discrete_sequence=df_in['rgb'],
        size='size',
        hover_data=['name'],
    )
    fig = px.scatter_3d(df_in, x='r', y='g', z='b', **scatter_kwargs)
    # A legend would just repeat every color name — hide it and trim margins
    fig.update_layout(showlegend=False, margin=dict(l=5, r=5, t=20, b=5))
    return fig
def _project_to_2d(algo, rgb_arr):
    """Reduce an (n, 3) RGB array to an (n, 2) embedding with `algo`.

    Args:
        algo: One of PCA, FastICA, LatentDirichletAllocation or TSNE
            (the class itself, not an instance).
        rgb_arr: (n, 3) array of R, G, B values.

    Returns:
        (n, 2) array of 2-D coordinates, one row per input color.
    """
    reducer = algo(n_components=2)
    if algo is TSNE:
        # TSNE exposes no components_; fit_transform yields the embedding
        return reducer.fit_transform(rgb_arr)
    # For the matrix-factorization models, fit on the transposed (3, n)
    # matrix so that components_ (shape (2, n)), transposed back, gives
    # one 2-D point per color.
    reducer.fit(rgb_arr.transpose())
    return reducer.components_.transpose()


def main():
    """Streamlit entry point: explain dimensionality reduction with colors.

    Lets the user pick a reduction algorithm, projects the RGB dataset to
    2-D, and shows the 2-D result alongside the original 3-D scatter.
    """
    st.title("Dimensionality Reduction: Explained")
    st.subheader("Using colors to explain how dimensionality reduction techniques work in principle.")
    st.markdown("""
    [Previously](https://huggingface.co/spaces/jphwang/colorful_vectors), you saw how vectors can represent information as sets of numbers.

    To recap, colors can be represented (for example) as three numbers like `(255, 0, 255)`,
    or `(80, 200, 120)`, where each number means something.

    In this case they were (R)ed, (G)reen and (B)lue values, but lots of other systems exist like
    [HCL or HSV](https://en.wikipedia.org/wiki/HSL_and_HSV), [CMYK](https://en.wikipedia.org/wiki/CMYK_color_model)
    or [RGBA](https://en.wikipedia.org/wiki/RGBA_color_model).

    Now, you might have noticed that some of these representations include more than three numbers. And given that
    we live in a three-dimensional space, we can't easily represent 4+ dimensional vectors. So, what're we to do?

    One answer is to use what are called dimensionality reduction techniques, which will produce an output like the below.
    """)

    df = preproc_data(datadir=Path('data'))

    # Map each algorithm's class name to the class for the radio selector
    algo_options = [PCA, FastICA, LatentDirichletAllocation, TSNE]
    algos_dict = {cls.__name__: cls for cls in algo_options}
    algo_sel = st.radio(label='Select your algorithm', options=algos_dict, index=0)
    algo = algos_dict[algo_sel]

    rgb_arr = df[['r', 'g', 'b']].values
    vals = _project_to_2d(algo, rgb_arr)
    df['dimension_a'] = vals[:, 0]
    df['dimension_b'] = vals[:, 1]

    red_fig = px.scatter(df, x='dimension_a', y='dimension_b',
                         title=f'RGB values represented in 2-D using {algo.__name__}',
                         template='plotly_white',
                         color=df['simple_name'],
                         color_discrete_sequence=df['rgb'],
                         width=500, height=500,
                         size='size',
                         size_max=10,
                         hover_data=['name'])
    red_fig.update_layout(
        showlegend=False,
        paper_bgcolor='white',
        plot_bgcolor='white',
        margin=dict(l=5, r=5, t=40, b=5)
    )

    st.markdown('-----')
    st.plotly_chart(red_fig)
    st.markdown('-----')
    st.markdown("""
    For reference and as a reminder, here is the original data in 3-D again:
    """)
    fig = build_chart(df)
    st.plotly_chart(fig, use_container_width=True)
    st.markdown('-----')
    st.markdown("""
    See how the 3-dimensional color information was squeezed into two?

    We can visually confirm that the algorithms have done something *sensible*, because similar colors
    still appear together.

    Having said that, obviously *some* information will get lost in converting information that's contained in
    three dimensions to only two.

    So these models make particular choices, which result in the outputs being different.
    Some models like PCA aim to lose the least amount of overall information, while others like t-SNE
    have specific goals, such as preserving local distances at the cost of compromised global distances.
    """)
    return True
# Run the app when executed as a script (Streamlit imports and runs this file)
if __name__ == '__main__':
    main()