# JP Hwang — "First working version" (commit f1a15ae)
# ========== (c) JP Hwang 30/9/2022 ==========
import logging
import pandas as pd
from pathlib import Path
import streamlit as st
import app
from sklearn.decomposition import PCA, FastICA, LatentDirichletAllocation
from sklearn.manifold import TSNE
import plotly.express as px
# ===== SET UP LOGGER =====
logger = logging.getLogger(__name__)

# Configure the root logger so every module's output reaches the console
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
)
root_logger.addHandler(stream_handler)
# ===== END LOGGER SETUP =====

# Widen pandas console output for easier debugging of DataFrames
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 320)

# Marker sizes; sizes[0] is the default applied to every plotted point
sizes = [1, 20, 30]
def preproc_data(datadir=None, default_size=1):
    """Load the colors dataset and derive the columns used for plotting.

    Args:
        datadir: Directory containing ``colors.csv``; defaults to ``data/``.
        default_size: Marker size assigned to every row (matches ``sizes[0]``).

    Returns:
        DataFrame with the raw CSV columns plus ``rgb`` (CSS color string),
        ``category`` (last underscore-separated token of ``simple_name``)
        and ``size``.
    """
    if datadir is None:
        datadir = Path('data')
    df = pd.read_csv(datadir / 'colors.csv',
                     names=['simple_name', 'name', 'hex', 'r', 'g', 'b'])
    # Preprocessing: vectorized string build — much faster than row-wise .apply
    df['rgb'] = ('rgb(' + df['r'].astype(str) + ', '
                 + df['g'].astype(str) + ', ' + df['b'].astype(str) + ')')
    # Get top 'basic' color names, e.g. 'dark_red' -> 'red'
    df['category'] = df['simple_name'].str.split('_').str[-1]
    # Set default size attribute
    df['size'] = default_size
    return df
def build_chart(df_in):
    """Render the color dataset as a 3-D RGB scatter plot.

    Args:
        df_in: DataFrame with 'r', 'g', 'b', 'simple_name', 'rgb', 'size'
            and 'name' columns (as produced by preproc_data).

    Returns:
        A plotly Figure with one marker per color, colored as itself.
    """
    scatter_kwargs = dict(
        template='plotly_white',
        color=df_in['simple_name'],
        color_discrete_sequence=df_in['rgb'],
        size='size',
        hover_data=['name'],
    )
    fig = px.scatter_3d(df_in, x='r', y='g', z='b', **scatter_kwargs)
    # A legend would just repeat every color name — hide it and trim margins
    fig.update_layout(showlegend=False, margin=dict(l=5, r=5, t=20, b=5))
    return fig
def _project_to_2d(algo, rgb_arr):
    """Reduce an (n, 3) RGB array to an (n, 2) embedding with `algo`.

    Args:
        algo: One of PCA, FastICA, LatentDirichletAllocation or TSNE
            (the class itself, not an instance).
        rgb_arr: (n, 3) array of R, G, B values.

    Returns:
        (n, 2) array of 2-D coordinates, one row per input color.
    """
    reducer = algo(n_components=2)
    if algo is TSNE:
        # TSNE exposes no components_; fit_transform yields the embedding
        return reducer.fit_transform(rgb_arr)
    # For the matrix-factorization models, fit on the transposed (3, n)
    # matrix so that components_ (shape (2, n)), transposed back, gives
    # one 2-D point per color.
    reducer.fit(rgb_arr.transpose())
    return reducer.components_.transpose()


def main():
    """Streamlit entry point: explain dimensionality reduction with colors.

    Lets the user pick a reduction algorithm, projects the RGB dataset to
    2-D, and shows the 2-D result alongside the original 3-D scatter.
    """
    st.title("Dimensionality Reduction: Explained")
    st.subheader("Using colors to explain how dimensionality reduction techniques work in principle.")
    st.markdown("""
    [Previously](https://huggingface.co/spaces/jphwang/colorful_vectors), you saw how vectors can represent information as sets of numbers.

    To recap, colors can be represented (for example) as three numbers like `(255, 0, 255)`,
    or `(80, 200, 120)`, where each number means something.

    In this case they were (R)ed, (G)reen and (B)lue values, but lots of other systems exist like
    [HCL or HSV](https://en.wikipedia.org/wiki/HSL_and_HSV), [CMYK](https://en.wikipedia.org/wiki/CMYK_color_model)
    or [RGBA](https://en.wikipedia.org/wiki/RGBA_color_model).

    Now, you might have noticed that some of these representations include more than three numbers. And given that
    we live in a three-dimensional space, we can't easily represent 4+ dimensional vectors. So, what're we to do?

    One answer is to use what are called dimensionality reduction techniques, which will produce an output like the below.
    """)

    df = preproc_data(datadir=Path('data'))

    # Map each algorithm's class name to the class for the radio selector
    algo_options = [PCA, FastICA, LatentDirichletAllocation, TSNE]
    algos_dict = {cls.__name__: cls for cls in algo_options}
    algo_sel = st.radio(label='Select your algorithm', options=algos_dict, index=0)
    algo = algos_dict[algo_sel]

    rgb_arr = df[['r', 'g', 'b']].values
    vals = _project_to_2d(algo, rgb_arr)
    df['dimension_a'] = vals[:, 0]
    df['dimension_b'] = vals[:, 1]

    red_fig = px.scatter(df, x='dimension_a', y='dimension_b',
                         title=f'RGB values represented in 2-D using {algo.__name__}',
                         template='plotly_white',
                         color=df['simple_name'],
                         color_discrete_sequence=df['rgb'],
                         width=500, height=500,
                         size='size',
                         size_max=10,
                         hover_data=['name'])
    red_fig.update_layout(
        showlegend=False,
        paper_bgcolor='white',
        plot_bgcolor='white',
        margin=dict(l=5, r=5, t=40, b=5)
    )

    st.markdown('-----')
    st.plotly_chart(red_fig)
    st.markdown('-----')
    st.markdown("""
    For reference and as a reminder, here is the original data in 3-D again:
    """)
    fig = build_chart(df)
    st.plotly_chart(fig, use_container_width=True)
    st.markdown('-----')
    st.markdown("""
    See how the 3-dimensional color information was squeezed into two?

    We can visually confirm that the algorithms have done something *sensible*, because similar colors
    still appear together.

    Having said that, obviously *some* information will get lost in converting information that's contained in
    three dimensions to only two.

    So these models make particular choices, which result in the outputs being different.
    Some models like PCA aim to lose the least amount of overall information, while others like t-SNE
    have specific goals, such as preserving local distances at the cost of compromised global distances.
    """)
    return True
# Run the app when executed as a script (Streamlit imports and runs this file)
if __name__ == '__main__':
    main()