Embeddings / app.py
de-Rodrigo's picture
Refactor Code
f872421
raw
history blame
6.9 kB
import streamlit as st
import pandas as pd
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.palettes import Reds9, Blues9
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# HTML template used as the hover tooltip of the Bokeh figure (it is passed
# as ``tooltips=TOOLTIPS`` when the figure is created). The ``@img`` and
# ``@label`` placeholders refer to the ``img`` and ``label`` columns of the
# data sources plotted on that figure.
TOOLTIPS = """
<div>
<div>
<img src="@img{safe}" style="width:128px; height:auto; float: left; margin: 0px 15px 15px 0px;" alt="@img" border="2"></img>
</div>
<div>
<span style="font-size: 17px; font-weight: bold;">@label</span>
</div>
</div>
"""
def config_style():
    """Inject the page CSS and render the title, subtitle and intro text.

    Every snippet is emitted through ``st.markdown`` with
    ``unsafe_allow_html=True``, in this order: stylesheet, main title,
    subtitle, introductory paragraph.
    """
    stylesheet = """
<style>
.main-title { font-size: 50px; color: #4CAF50; text-align: center; }
.sub-title { font-size: 30px; color: #555; }
.custom-text { font-size: 18px; line-height: 1.5; }
</style>
"""
    intro_paragraph = """
<p class="custom-text">
Se cargan ambas versiones de los embeddings y se aplica una reducci贸n dimensional sobre el conjunto combinado.
Los puntos de la versi贸n real se muestran como <strong>c铆rculos</strong> (tonos de rojo)
y los de la es_digital_seq como <strong>cuadrados</strong> (tonos de azul).
</p>
"""
    html_snippets = (
        stylesheet,
        '<h1 class="main-title">Merit Embeddings 馃帓馃搩馃弳</h1>',
        '<h2 class="sub-title">Donut 馃</h2>',
        intro_paragraph,
    )
    for snippet in html_snippets:
        st.markdown(snippet, unsafe_allow_html=True)
def load_embeddings():
    """Read both embedding CSV files and return them keyed by version.

    Returns:
        A dict with the keys ``"real"`` and ``"es-digital-seq"``; each value
        is the DataFrame read from the corresponding CSV under ``data/``.
    """
    csv_by_version = {
        "real": "data/donut_de_Rodrigo_merit_secret_all_embeddings.csv",
        "es-digital-seq": "data/donut_de_Rodrigo_merit_es-digital-seq_embeddings.csv",
    }
    # Dict order is preserved, so the "real" file is still read first.
    return {version: pd.read_csv(path) for version, path in csv_by_version.items()}
def reducer_selector(df_combined, embedding_cols):
    """Let the user pick a reduction method and project the embeddings to 2-D.

    Args:
        df_combined: DataFrame holding the rows of every version.
        embedding_cols: names of the columns of ``df_combined`` that contain
            the embedding dimensions.

    Returns:
        The output of ``fit_transform`` of the chosen reducer (PCA or t-SNE,
        both configured with ``n_components=2``) on the selected columns.
    """
    reduction_method = st.selectbox("Seleccione m茅todo de reducci贸n:", options=["PCA", "t-SNE"])
    all_embeddings = df_combined[embedding_cols].values

    if reduction_method == "PCA":
        reducer = PCA(n_components=2)
    else:
        # scikit-learn requires perplexity < n_samples, so the default of 30
        # would raise on small datasets. Cap it just below the sample count;
        # datasets with more than 30 rows keep the original value.
        n_samples = all_embeddings.shape[0]
        perplexity = min(30, max(n_samples - 1, 1))
        reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity, learning_rate=200)

    return reducer.fit_transform(all_embeddings)
def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping):
    """Add one scatter glyph per selected label of ``df`` to ``fig``.

    Args:
        fig: Bokeh figure that receives the glyphs.
        df: DataFrame providing the ``x``, ``y``, ``label`` and ``img``
            columns that are copied into each glyph's data source.
        selected_labels: labels to draw; a label with no rows in ``df`` is
            skipped.
        marker: ``"circle"`` (size 10, legend suffix "Real") or ``"square"``
            (size 4, legend suffix "Sint茅tico"). Any other value draws
            nothing, as before.
        color_mapping: mapping from label to the color used for both the
            fill and the outline of that label's glyphs.
    """
    if marker == "circle":
        size, legend_suffix = 10, "Real"
    elif marker == "square":
        size, legend_suffix = 4, "Sint茅tico"
    else:
        # Unknown marker: nothing would be drawn, so skip the work entirely.
        return

    for label in selected_labels:
        subset = df[df['label'] == label]
        if subset.empty:
            continue
        source = ColumnDataSource(data=dict(
            x=subset['x'],
            y=subset['y'],
            label=subset['label'],
            img=subset['img'],
        ))
        color = color_mapping[label]
        # fig.circle(size=...) and fig.square are deprecated since Bokeh 3.4;
        # fig.scatter(marker=...) is the supported equivalent and is also
        # available in older Bokeh releases.
        fig.scatter('x', 'y', marker=marker, size=size, source=source,
                    fill_color=color, line_color=color,
                    legend_label=f"{label} ({legend_suffix})")
def _cycled_color_map(labels, palette):
    """Map each label, in sorted order, to a palette color, wrapping around
    to the start of the palette when there are more labels than colors."""
    palette_size = len(palette)
    return {
        label: palette[position % palette_size]
        for position, label in enumerate(sorted(labels))
    }


def get_color_maps(selected_subsets: dict):
    """Build the label -> color mapping of each version.

    Labels of the ``"real"`` version take their colors from ``Reds9`` and
    labels of the ``"es-digital-seq"`` version from ``Blues9``. Colors are
    assigned following the sorted order of the selected labels and repeat
    once the palette is exhausted.

    Args:
        selected_subsets: dict with the keys ``"real"`` and
            ``"es-digital-seq"``, each holding the selected labels.

    Returns:
        A dict with the same two keys, each holding a label -> color dict.
    """
    return {
        "real": _cycled_color_map(selected_subsets["real"], Reds9),
        "es-digital-seq": _cycled_color_map(selected_subsets["es-digital-seq"], Blues9),
    }
def split_versions(df_combined, reduced):
    """Attach the 2-D coordinates and split the rows back by version.

    Args:
        df_combined: DataFrame with a ``version`` column (values ``"real"``
            and ``"es_digital_seq"``) and a ``label`` column. It is not
            modified.
        reduced: 2-D array with one row per row of ``df_combined``; column 0
            becomes ``x`` and column 1 becomes ``y``.

    Returns:
        A ``(dfs_reduced, unique_subsets)`` tuple of dicts keyed by
        ``"real"`` and ``"es-digital-seq"``: the per-version DataFrames
        (including ``x`` and ``y``) and the sorted unique labels of each.
    """
    # assign() returns a new DataFrame, so the caller's frame keeps its
    # original columns instead of silently gaining x/y.
    with_coords = df_combined.assign(x=reduced[:, 0], y=reduced[:, 1])

    # Result key -> value stored in the "version" column. The key uses
    # hyphens while the column value uses underscores.
    version_values = {
        "real": "real",
        "es-digital-seq": "es_digital_seq",
    }
    dfs_reduced = {
        key: with_coords[with_coords["version"] == value].copy()
        for key, value in version_values.items()
    }

    # Collect the unique subsets (labels) of each version.
    unique_subsets = {
        key: sorted(frame['label'].unique().tolist())
        for key, frame in dfs_reduced.items()
    }
    return dfs_reduced, unique_subsets
def subset_selectors(unique_subsets: dict):
    """Render one multiselect per version and return the chosen labels.

    Each widget offers every label of its version and starts with all of
    them selected.

    Args:
        unique_subsets: dict with the keys ``"real"`` and
            ``"es-digital-seq"``, each holding the available labels.

    Returns:
        A dict with the same two keys, each holding the selected labels.
    """
    prompts = (
        ("real", "Seleccione subsets para visualizar (Real):"),
        ("es-digital-seq", "Seleccione subsets para visualizar (Sint茅tico):"),
    )
    chosen = {}
    for version, prompt in prompts:
        available = unique_subsets[version]
        chosen[version] = st.multiselect(prompt, options=available, default=available)
    return chosen
def create_figure(dfs_reduced, selected_subsets: dict, color_maps: dict):
    """Build the 600x600 Bokeh figure containing both versions.

    The ``"real"`` version is drawn with the ``"circle"`` marker and the
    ``"es-digital-seq"`` version with the ``"square"`` marker. The legend is
    placed at the top right and clicking an entry hides its glyphs.

    Args:
        dfs_reduced: per-version DataFrames to plot.
        selected_subsets: per-version labels to draw.
        color_maps: per-version label -> color mappings.

    Returns:
        The configured Bokeh figure.
    """
    fig = figure(width=600, height=600, tooltips=TOOLTIPS, title="")

    marker_by_version = (("real", "circle"), ("es-digital-seq", "square"))
    for version, marker in marker_by_version:
        add_dataset_to_fig(
            fig,
            dfs_reduced[version],
            selected_subsets[version],
            marker=marker,
            color_mapping=color_maps[version],
        )

    fig.legend.location = "top_right"
    fig.legend.click_policy = "hide"
    return fig
def main():
    """Run the app: load the data, reduce it, and plot the chosen subsets."""
    config_style()

    embeddings_dfs = load_embeddings()
    # Tag every row with its version so the frames can be split again after
    # the joint dimensionality reduction.
    for key, version_tag in (("real", "real"), ("es-digital-seq", "es_digital_seq")):
        embeddings_dfs[key]["version"] = version_tag

    real_df = embeddings_dfs["real"]
    embedding_cols = [name for name in real_df.columns if name.startswith("dim_")]

    # Both versions are reduced together so they share one 2-D space.
    df_combined = pd.concat([real_df, embeddings_dfs["es-digital-seq"]], ignore_index=True)
    reduced = reducer_selector(df_combined, embedding_cols)

    # Split the combined frame back into its versions.
    dfs_reduced, unique_subsets = split_versions(df_combined, reduced)

    selected_subsets = subset_selectors(unique_subsets)
    color_maps = get_color_maps(selected_subsets)
    chart = create_figure(dfs_reduced, selected_subsets, color_maps)
    st.bokeh_chart(chart)
# Run the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()