Spaces:

jphwang
/

reduce_dimensions

Runtime error

File size: 5,135 Bytes

f1a15ae

# ========== (c) JP Hwang 30/9/2022  ==========

import logging
import pandas as pd
from pathlib import Path
import streamlit as st
import app
from sklearn.decomposition import PCA, FastICA, LatentDirichletAllocation
from sklearn.manifold import TSNE
import plotly.express as px

# ===== SET UP LOGGER =====
# Module-level logger for this file, plus a stream handler on the root logger so
# that records at INFO and above from any module are echoed with a timestamp.
logger = logging.getLogger(__name__)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
sh = logging.StreamHandler()
sh.setFormatter(formatter)

root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
root_logger.addHandler(sh)
# ===== END LOGGER SETUP =====

# Let pandas print wide frames (up to 20 columns, 320 characters) without wrapping.
desired_width = 320
pd.options.display.width = desired_width
pd.options.display.max_columns = 20

# Marker sizes available to the charts; preproc_data uses the first as the default.
sizes = [1, 20, 30]


def preproc_data(datadir=None):
    """Load the color table and add the derived columns the charts rely on.

    Reads ``colors.csv`` from *datadir*, labelling its columns ``simple_name``,
    ``name``, ``hex``, ``r``, ``g`` and ``b``, then adds:

    * ``rgb`` - a CSS-style ``rgb(r, g, b)`` string built from the channel columns;
    * ``category`` - the part of ``simple_name`` after its last underscore;
    * ``size`` - the default marker size, ``sizes[0]``.

    Args:
        datadir: Directory that contains ``colors.csv``, given either as a ``str``
            or as a path object. Defaults to ``data`` relative to the current
            working directory.

    Returns:
        pandas.DataFrame: The loaded table including the three derived columns.
    """
    # Coerce to Path so that plain-string directories work with the `/` operator.
    datadir = Path('data') if datadir is None else Path(datadir)

    df = pd.read_csv(datadir / 'colors.csv', names=['simple_name', 'name', 'hex', 'r', 'g', 'b'])

    # Preprocessing: build the color strings straight from the channel columns
    # instead of a row-by-row DataFrame.apply.
    df['rgb'] = [f'rgb({r}, {g}, {b})' for r, g, b in zip(df['r'], df['g'], df['b'])]

    # Get top 'basic' color names (the token after the last underscore)
    df['category'] = df['simple_name'].apply(lambda simple_name: simple_name.split('_')[-1])

    # Set default size attribute
    df['size'] = sizes[0]
    return df


def build_chart(df_in):
    """Build the 3-D scatter plot of the colors in RGB space.

    Every row of *df_in* becomes one marker placed at its ``(r, g, b)`` position
    and drawn in the color given by its ``rgb`` string.

    Args:
        df_in: DataFrame providing the columns ``r``, ``g``, ``b``,
            ``simple_name``, ``rgb``, ``size`` and ``name``.

    Returns:
        The Plotly Express figure, with the legend hidden and tight margins.
    """
    # Key the colors by name instead of passing them as a positional sequence:
    # a sequence is handed out to the *distinct* names in order, so it drifts out
    # of step with the rows as soon as a simple_name occurs more than once.
    color_by_name = dict(zip(df_in['simple_name'], df_in['rgb']))

    fig = px.scatter_3d(
        df_in,
        x='r', y='g', z='b',
        template='plotly_white',
        color='simple_name',
        color_discrete_map=color_by_name,
        size='size',
        hover_data=['name'],
    )
    fig.update_layout(
        showlegend=False,
        margin=dict(l=5, r=5, t=20, b=5),
    )
    return fig


def main():
    """Render the Streamlit page that explains dimensionality reduction with colors.

    Loads the color table, lets the user pick a reduction algorithm, reduces the
    ``(r, g, b)`` values of every color to two dimensions, and shows the 2-D
    result followed by the original 3-D scatter plot for comparison.

    Returns:
        bool: Always ``True``.
    """
    st.title("Dimensionality Reduction: Explained")
    st.subheader("Using colors to explain how dimensionality reduction techniques work in principle.")
    st.markdown("""
    [Previously](https://huggingface.co/spaces/jphwang/colorful_vectors), you saw how vectors can represent information as sets of numbers. 
    To recap, colors can be represented (for example) as three numbers like `(255, 0, 255)`,
    or `(80, 200, 120)`, where each number means something. 

    In this case they were (R)ed, (G)reen and (B)lue values, but lots of other systems exist like 
    [HCL or HSV](https://en.wikipedia.org/wiki/HSL_and_HSV), [CMYK](https://en.wikipedia.org/wiki/CMYK_color_model)
    or [RGBA](https://en.wikipedia.org/wiki/RGBA_color_model). 

    Now, you might have noticed that some of these representations include more than three numbers. And given that
    we live in a three-dimensional space, we can't easily represent 4+ dimensional vectors. So, what're we to do?

    One answer is to use what are called dimensionality reduction techniques, which will produce an output like the below.
    """)
    df = preproc_data(datadir=Path('data'))

    # Offer each estimator under its class name.
    algos_dict = {algo_cls.__name__: algo_cls
                  for algo_cls in (PCA, FastICA, LatentDirichletAllocation, TSNE)}
    algo_sel = st.radio(label='Select your algorithm', options=list(algos_dict), index=0)
    algo = algos_dict[algo_sel]

    # One row per color, one column per channel: shape (n_colors, 3).
    rgb_arr = df[['r', 'g', 'b']].values

    # Every estimator offered here implements fit_transform, which returns one
    # 2-D point per input row. The fitted `components_` describe the model's axes
    # rather than the embedded samples, so they must not be plotted as coordinates.
    vals = algo(n_components=2).fit_transform(rgb_arr)
    df['dimension_a'] = vals[:, 0]
    df['dimension_b'] = vals[:, 1]

    # Key the marker colors by name; a positional color sequence is assigned to the
    # distinct names in order and misaligns if a simple_name occurs more than once.
    color_by_name = dict(zip(df['simple_name'], df['rgb']))

    red_fig = px.scatter(
        df,
        x='dimension_a', y='dimension_b',
        title=f'RGB values represented in 2-D using {algo.__name__}',
        template='plotly_white',
        color='simple_name',
        color_discrete_map=color_by_name,
        width=500, height=500,
        size='size',
        size_max=10,
        hover_data=['name'],
    )
    red_fig.update_layout(
        showlegend=False,
        paper_bgcolor='white',
        plot_bgcolor='white',
        margin=dict(l=5, r=5, t=40, b=5),
    )

    st.markdown('-----')
    st.plotly_chart(red_fig)

    st.markdown('-----')
    st.markdown("""
    For reference and as a reminder, here is the original data in 3-D again:
    """)

    fig = build_chart(df)
    st.plotly_chart(fig, use_container_width=True)

    st.markdown('-----')
    st.markdown("""
    See how the 3-dimensional color information was squeezed into two? 
    We can visually confirm that the algorithms have done something *sensible*, because similar colors
    still appear together.  

    Having said that, obviously *some* information will get lost in converting information that's contained in 
    three dimensions to only two. 

    So these models make particular choices, which result in the outputs being different. 
    Some models like PCA aim to lose the least amount of overall information, while others like t-SNE 
    have specific goals, such as preserving local distances at the cost of compromised global distances. 
    """)

    return True


# Build the page only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()