# Spaces status banner captured with this listing (not part of the program):
# Runtime error
# ========== (c) JP Hwang 30/9/2022 ==========
import logging | |
import pandas as pd | |
from pathlib import Path | |
import streamlit as st | |
import app | |
from sklearn.decomposition import PCA, FastICA, LatentDirichletAllocation | |
from sklearn.manifold import TSNE | |
import plotly.express as px | |
# ===== SET UP LOGGER =====
logger = logging.getLogger(__name__)
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# The root logger outlives a single execution of this module's top level
# (e.g. a script rerun in the same process, or importlib.reload). Adding a new
# StreamHandler each time would print every record once per execution, so the
# handler is tagged with a name and reused when it is already attached.
_STREAM_HANDLER_NAME = 'dimreduce_app_stream'
sh = next(
    (h for h in root_logger.handlers if h.get_name() == _STREAM_HANDLER_NAME),
    None,
)
if sh is None:
    sh = logging.StreamHandler()
    sh.set_name(_STREAM_HANDLER_NAME)
    root_logger.addHandler(sh)
sh.setFormatter(formatter)
# ===== END LOGGER SETUP =====

# Widen pandas' console output so printed DataFrames are not truncated early.
desired_width = 320
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', desired_width)

# Marker sizes for the charts; preproc_data() uses sizes[0] as the default.
sizes = [1, 20, 30]
def preproc_data(datadir=None):
    """Load ``colors.csv`` and add the columns the charts rely on.

    The CSV is read without a header row; its columns are named
    ``simple_name``, ``name``, ``hex``, ``r``, ``g`` and ``b``.

    Args:
        datadir: Directory containing ``colors.csv``. May be a ``Path`` or a
            string path; defaults to ``Path('data')`` when ``None``.

    Returns:
        A DataFrame with three extra columns: ``rgb`` (a string of the form
        ``'rgb(r, g, b)'``), ``category`` (the text after the last ``'_'`` in
        ``simple_name``) and ``size`` (the first entry of the module-level
        ``sizes`` list).
    """
    # Coerce to Path so that a plain string argument also supports the `/`
    # join below instead of raising TypeError.
    datadir = Path('data') if datadir is None else Path(datadir)
    df = pd.read_csv(
        datadir / 'colors.csv',
        names=['simple_name', 'name', 'hex', 'r', 'g', 'b'],
    )

    # Preprocessing: build the colour string column-wise rather than with a
    # Python-level function call per row.
    df['rgb'] = (
        'rgb(' + df['r'].astype(str)
        + ', ' + df['g'].astype(str)
        + ', ' + df['b'].astype(str) + ')'
    )

    # Get top 'basic' color names: the last underscore-separated token.
    df = df.assign(category=df['simple_name'].apply(lambda x: x.split('_')[-1]))

    # Set default size attribute
    df['size'] = sizes[0]
    return df
def build_chart(df_in):
    """Return a 3-D scatter figure of ``df_in`` with r/g/b as the axes.

    Points are coloured by ``simple_name`` using the ``rgb`` column as the
    colour sequence, sized by the ``size`` column, and show ``name`` on hover.
    The legend is hidden and the margins are kept tight.
    """
    scatter_options = {
        'x': 'r',
        'y': 'g',
        'z': 'b',
        'template': 'plotly_white',
        'color': df_in['simple_name'],
        'color_discrete_sequence': df_in['rgb'],
        'size': 'size',
        'hover_data': ['name'],
    }
    chart = px.scatter_3d(df_in, **scatter_options)

    tight_margin = {'l': 5, 'r': 5, 't': 20, 'b': 5}
    chart.update_layout(showlegend=False, margin=tight_margin)
    return chart
def _reduce_to_2d(algo, rgb_arr):
    """Return two coordinates per row of ``rgb_arr`` using estimator class ``algo``."""
    reducer = algo(n_components=2)
    if algo is TSNE:
        # TSNE is driven through fit_transform on the rows directly.
        return reducer.fit_transform(rgb_arr)
    # NOTE(review): the other estimators are fitted on the transposed matrix
    # and their ``components_`` (transposed back) are used as the per-row
    # coordinates, rather than calling fit_transform on the rows. Kept exactly
    # as the original behaves - confirm this is the intended projection.
    reducer.fit(rgb_arr.transpose())
    return reducer.components_.transpose()


def _build_reduced_chart(df, algo_name):
    """Return the 2-D scatter figure of ``dimension_a`` against ``dimension_b``."""
    red_fig = px.scatter(
        df, x='dimension_a', y='dimension_b',
        title=f'RGB values represented in 2-D using {algo_name}',
        template='plotly_white',
        color=df['simple_name'],
        color_discrete_sequence=df['rgb'],
        width=500, height=500,
        size='size',
        size_max=10,
        hover_data=['name'],
    )
    red_fig.update_layout(
        showlegend=False,
        paper_bgcolor='white',
        plot_bgcolor='white',
        margin=dict(l=5, r=5, t=40, b=5),
    )
    return red_fig


def main():
    """Render the Streamlit page: intro text, 2-D reduction chart, 3-D reference chart.

    Loads the colour data from ``data/``, lets the user choose one of PCA,
    FastICA, LatentDirichletAllocation or TSNE with a radio button, stores the
    two resulting coordinates in the ``dimension_a`` / ``dimension_b`` columns
    and plots them, followed by the original r/g/b data in 3-D.

    Returns:
        ``True`` once the page has been rendered.
    """
    st.title("Dimensionality Reduction: Explained")
    st.subheader("Using colors to explain how dimensionality reduction techniques work in principle.")
    st.markdown("""
[Previously](https://huggingface.co/spaces/jphwang/colorful_vectors), you saw how vectors can represent information as sets of numbers.
To recap, colors can be represented (for example) as three numbers like `(255, 0, 255)`,
or `(80, 200, 120)`, where each number means something.
In this case they were (R)ed, (G)reen and (B)lue values, but lots of other systems exist like
[HCL or HSV](https://en.wikipedia.org/wiki/HSL_and_HSV), [CMYK](https://en.wikipedia.org/wiki/CMYK_color_model)
or [RGBA](https://en.wikipedia.org/wiki/RGBA_color_model).
Now, you might have noticed that some of these representations include more than three numbers. And given that
we live in a three-dimensional space, we can't easily represent 4+ dimensional vectors. So, what're we to do?
One answer is to use what are called dimensionality reduction techniques, which will produce an output like the below.
""")

    df = preproc_data(datadir=Path('data'))

    # Map each estimator's class name (shown in the radio) to the class itself.
    algo_options = [PCA, FastICA, LatentDirichletAllocation, TSNE]
    algos_dict = {algo_cls.__name__: algo_cls for algo_cls in algo_options}
    algo_sel = st.radio(label='Select your algorithm', options=list(algos_dict), index=0)
    algo = algos_dict[algo_sel]

    rgb_arr = df[['r', 'g', 'b']].values
    vals = _reduce_to_2d(algo, rgb_arr)
    df['dimension_a'] = vals[:, 0]
    df['dimension_b'] = vals[:, 1]

    red_fig = _build_reduced_chart(df, algo.__name__)
    st.markdown('-----')
    st.plotly_chart(red_fig)
    st.markdown('-----')
    st.markdown("""
For reference and as a reminder, here is the original data in 3-D again:
""")
    fig = build_chart(df)
    st.plotly_chart(fig, use_container_width=True)
    st.markdown('-----')
    st.markdown("""
See how the 3-dimensional color information was squeezed into two?
We can visually confirm that the algorithms have done something *sensible*, because similar colors
still appear together.
Having said that, obviously *some* information will get lost in converting information that's contained in
three dimensions to only two.
So these models make particular choices, which result in the outputs being different.
Some models like PCA aim to lose the least amount of overall information, while others like t-SNE
have specific goals, such as preserving local distances at the cost of compromised global distances.
""")
    return True
# Render the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()