# ========== (c) JP Hwang 30/9/2022 ==========
"""Streamlit app: explains dimensionality reduction by projecting RGB colors to 2-D."""

import logging
from pathlib import Path

import pandas as pd
import plotly.express as px
import streamlit as st
from sklearn.decomposition import PCA, FastICA, LatentDirichletAllocation
from sklearn.manifold import TSNE

import app  # NOTE(review): appears unused here — possibly imported for side effects; confirm before removing

# ===== SET UP LOGGER =====
logger = logging.getLogger(__name__)
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
sh = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
sh.setFormatter(formatter)
root_logger.addHandler(sh)
# ===== END LOGGER SETUP =====

# Wide-console display settings for local debugging of DataFrames
desired_width = 320
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', desired_width)

# Candidate marker sizes; sizes[0] is the default applied to every point
sizes = [1, 20, 30]


def preproc_data(datadir=None):
    """Load the colors dataset and add derived columns used by the charts.

    :param datadir: directory containing ``colors.csv``; defaults to ``data/``.
    :return: DataFrame with columns ``simple_name``, ``name``, ``hex``, ``r``,
        ``g``, ``b`` plus derived ``rgb`` (Plotly color string), ``category``
        (last token of the simple name) and ``size`` (default marker size).
    """
    if datadir is None:
        datadir = Path('data')
    df = pd.read_csv(datadir / 'colors.csv',
                     names=['simple_name', 'name', 'hex', 'r', 'g', 'b'])

    # Plotly-compatible color string for each row, e.g. 'rgb(255, 0, 255)'
    df['rgb'] = df.apply(lambda x: f'rgb({x.r}, {x.g}, {x.b})', axis=1)

    # 'Basic' color category is the last underscore-separated token
    # (e.g. 'dark_red' -> 'red')
    df = df.assign(category=df.simple_name.apply(lambda x: x.split('_')[-1]))

    # Uniform default marker size for all points
    df['size'] = sizes[0]
    return df


def build_chart(df_in):
    """Build the 3-D scatter of colors in RGB space.

    :param df_in: DataFrame produced by :func:`preproc_data`.
    :return: Plotly 3-D scatter figure, one point per color, colored as itself.
    """
    fig = px.scatter_3d(
        df_in, x='r', y='g', z='b',
        template='plotly_white',
        color=df_in['simple_name'],
        color_discrete_sequence=df_in['rgb'],  # each point drawn in its own color
        size='size',
        hover_data=['name'],
    )
    fig.update_layout(
        showlegend=False,
        margin=dict(l=5, r=5, t=20, b=5),
    )
    return fig


def main():
    """Render the Streamlit page: intro text, algorithm picker, 2-D and 3-D charts."""
    st.title("Dimensionality Reduction: Explained")
    st.subheader("Using colors to explain how dimensionality reduction techniques work in principle.")

    st.markdown("""
    [Previously](https://huggingface.co/spaces/jphwang/colorful_vectors), you saw how vectors can represent information as sets of numbers.

    To recap, colors can be represented (for example) as three numbers like `(255, 0, 255)`, or `(80, 200, 120)`, where each number means something. 
    In this case they were (R)ed, (G)reen and (B)lue values, but lots of other systems exist like [HCL or HSV](https://en.wikipedia.org/wiki/HSL_and_HSV), [CMYK](https://en.wikipedia.org/wiki/CMYK_color_model) or [RGBA](https://en.wikipedia.org/wiki/RGBA_color_model).

    Now, you might have noticed that some of these representations include more than three numbers. And given that we live in a three-dimensional space, we can't easily represent 4+ dimensional vectors.

    So, what're we to do?

    One answer is to use what are called dimensionality reduction techniques, which will produce an output like the below.
    """)

    df = preproc_data(datadir=Path('data'))

    # Map algorithm display names to the sklearn classes themselves
    algo_options = [PCA, FastICA, LatentDirichletAllocation, TSNE]
    algo_names = [i.__name__ for i in algo_options]
    algos_dict = dict(zip(algo_names, algo_options))

    # st.radio iterates the dict's keys as the displayed options
    algo_sel = st.radio(label='Select your algorithm', options=algos_dict, index=0)

    rgb_arr = df[['r', 'g', 'b']].values
    algo = algos_dict[algo_sel]

    if algo != TSNE:
        # NOTE(review): fits on the transposed (3, n) array and reads
        # components_ back transposed, rather than the usual
        # fit_transform(rgb_arr). Preserved as-is since the rendered output
        # depends on it — confirm this is intentional.
        reducer = algo(n_components=2)
        reducer.fit(rgb_arr.transpose())
        vals = reducer.components_.transpose()
    else:
        # t-SNE has no components_ attribute; use the standard embedding path
        reducer = algo(n_components=2)
        vals = reducer.fit_transform(rgb_arr)

    df['dimension_a'] = vals[:, 0]
    df['dimension_b'] = vals[:, 1]

    red_fig = px.scatter(
        df, x='dimension_a', y='dimension_b',
        title=f'RGB values represented in 2-D using {algo.__name__}',
        template='plotly_white',
        color=df['simple_name'],
        color_discrete_sequence=df['rgb'],
        width=500, height=500,
        size='size', size_max=10,
        hover_data=['name'],
    )
    red_fig.update_layout(
        showlegend=False,
        paper_bgcolor='white',
        plot_bgcolor='white',
        margin=dict(l=5, r=5, t=40, b=5),
    )

    st.markdown('-----')
    st.plotly_chart(red_fig)
    st.markdown('-----')

    st.markdown("""
    For reference and as a reminder, here is the original data in 3-D again:
    """)
    fig = build_chart(df)
    st.plotly_chart(fig, use_container_width=True)
    st.markdown('-----')

    st.markdown("""
    See how the 3-dimensional color information was squeezed into two? 
    We can visually confirm that the algorithms have done something *sensible*, because similar colors still appear together.

    Having said that, obviously *some* information will get lost in converting information that's contained in three dimensions to only two.

    So these models make particular choices, which result in the outputs being different. Some models like PCA aim to lose the least amount of overall information, while others like t-SNE have specific goals, such as preserving local distances at the cost of compromised global distances.
    """)

    return True


if __name__ == '__main__':
    main()