Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
from bokeh.plotting import figure | |
from bokeh.models import ColumnDataSource | |
from bokeh.palettes import Reds9, Blues9 | |
from sklearn.decomposition import PCA | |
from sklearn.manifold import TSNE | |
# HTML template for the Bokeh hover tooltip; create_figure passes it as the
# ``tooltips`` argument of ``figure``. The "@img" and "@label" placeholders
# refer to the ``img`` and ``label`` columns that add_dataset_to_fig puts in
# each ColumnDataSource.
TOOLTIPS = """
<div>
<div>
<img src="@img{safe}" style="width:128px; height:auto; float: left; margin: 0px 15px 15px 0px;" alt="@img" border="2"></img>
</div>
<div>
<span style="font-size: 17px; font-weight: bold;">@label</span>
</div>
</div>
"""
def config_style():
    """Inject the page CSS and render the title, subtitle and intro text.

    All content is raw HTML/CSS, so every call goes through ``st.markdown``
    with ``unsafe_allow_html=True``. The user-facing strings are stored as
    proper UTF-8 (accented Spanish characters and emoji) instead of the
    mis-decoded byte sequences that were previously displayed.
    """
    # CSS classes referenced by the heading and paragraph markup below.
    st.markdown("""
<style>
.main-title { font-size: 50px; color: #4CAF50; text-align: center; }
.sub-title { font-size: 30px; color: #555; }
.custom-text { font-size: 18px; line-height: 1.5; }
</style>
""", unsafe_allow_html=True)
    st.markdown('<h1 class="main-title">Merit Embeddings 🎒📃🏆</h1>', unsafe_allow_html=True)
    # NOTE(review): the original emoji here was truncated (only its first two
    # UTF-8 bytes survived), so it cannot be recovered exactly. The hugging
    # face emoji is the most likely candidate -- confirm against the
    # deployed app.
    st.markdown('<h2 class="sub-title">Donut 🤗</h2>', unsafe_allow_html=True)
    st.markdown(
        """
<p class="custom-text">
Se cargan ambas versiones de los embeddings y se aplica una reducción dimensional sobre el conjunto combinado.
Los puntos de la versión real se muestran como <strong>círculos</strong> (tonos de rojo)
y los de la es_digital_seq como <strong>cuadrados</strong> (tonos de azul).
</p>
""", unsafe_allow_html=True)
def load_embeddings():
    """Read both embedding CSV files from the ``data`` directory.

    Returns:
        A dict with the keys ``"real"`` and ``"es-digital-seq"``, each mapped
        to the DataFrame that ``pd.read_csv`` produced for that version.
    """
    csv_by_version = {
        "real": "data/donut_de_Rodrigo_merit_secret_all_embeddings.csv",
        "es-digital-seq": "data/donut_de_Rodrigo_merit_es-digital-seq_embeddings.csv",
    }
    # Dict order is insertion order, so "real" is still read first.
    return {version: pd.read_csv(path) for version, path in csv_by_version.items()}
def reducer_selector(df_combined, embedding_cols):
    """Let the user pick a reduction method and project the embeddings to 2-D.

    Args:
        df_combined: DataFrame holding the rows of every version.
        embedding_cols: Names of the columns that contain the embedding values.

    Returns:
        The output of ``fit_transform`` for the chosen reducer, fitted on
        ``df_combined[embedding_cols]`` with ``n_components=2``.
    """
    reduction_method = st.selectbox("Seleccione método de reducción:", options=["PCA", "t-SNE"])
    all_embeddings = df_combined[embedding_cols].values
    if reduction_method == "PCA":
        reducer = PCA(n_components=2)
    else:
        # scikit-learn's TSNE raises when perplexity >= n_samples, so keep the
        # usual value of 30 but clamp it for small datasets.
        n_samples = all_embeddings.shape[0]
        perplexity = min(30, max(1, n_samples - 1))
        reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity, learning_rate=200)
    return reducer.fit_transform(all_embeddings)
def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping):
    """Draw one glyph renderer per selected label of ``df`` onto ``fig``.

    Args:
        fig: Bokeh figure that receives the glyphs.
        df: DataFrame with ``x``, ``y``, ``label`` and ``img`` columns.
        selected_labels: Labels to draw; labels with no rows in ``df`` are
            skipped.
        marker: ``"circle"`` (size 10, legend suffix "Real") or ``"square"``
            (size 4, legend suffix "Sintético"). Any other value draws
            nothing.
        color_mapping: Maps each label to the colour used for both fill and
            outline.

    Raises:
        KeyError: If a label that has rows in ``df`` is missing from
            ``color_mapping``.
    """
    # marker name -> (glyph size, version name shown in the legend)
    marker_styles = {
        "circle": (10, "Real"),
        "square": (4, "Sintético"),
    }
    if marker not in marker_styles:
        # Unsupported markers have never drawn anything; bail out before
        # doing any per-label work.
        return
    size, version_name = marker_styles[marker]

    for label in selected_labels:
        subset = df[df['label'] == label]
        if subset.empty:
            continue
        source = ColumnDataSource(data=dict(
            x=subset['x'],
            y=subset['y'],
            label=subset['label'],
            img=subset['img'],
        ))
        color = color_mapping[label]
        # ``scatter`` with an explicit marker replaces ``fig.circle(size=...)``
        # and ``fig.square``, which newer Bokeh releases deprecate.
        fig.scatter('x', 'y', marker=marker, size=size, source=source,
                    fill_color=color, line_color=color,
                    legend_label=f"{label} ({version_name})")
def get_color_maps(selected_subsets: dict):
    """Assign a palette colour to every selected label of each version.

    Labels are sorted and then paired with palette entries in order; when
    there are more labels than palette entries the palette repeats from its
    start. ``"real"`` labels use ``Reds9`` and ``"es-digital-seq"`` labels
    use ``Blues9``.

    Args:
        selected_subsets: Dict with ``"real"`` and ``"es-digital-seq"`` keys,
            each holding the list of selected labels.

    Returns:
        Dict with the same two keys, each mapping label -> colour.
    """
    def assign_colors(labels, palette):
        # The modulo index wraps around once the palette is exhausted.
        return {
            label: palette[position % len(palette)]
            for position, label in enumerate(sorted(labels))
        }

    return {
        "real": assign_colors(selected_subsets["real"], Reds9),
        "es-digital-seq": assign_colors(selected_subsets["es-digital-seq"], Blues9),
    }
def split_versions(df_combined, reduced):
    """Attach the 2-D coordinates and split the combined frame by version.

    Note that ``df_combined`` is modified in place: it gains ``x`` and ``y``
    columns taken from the first and second column of ``reduced``.

    Args:
        df_combined: DataFrame with ``version`` and ``label`` columns.
        reduced: 2-D array with one row per row of ``df_combined``.

    Returns:
        A ``(dfs_reduced, unique_subsets)`` tuple. Both are dicts keyed by
        ``"real"`` and ``"es-digital-seq"``: the first holds an independent
        copy of each version's rows, the second the sorted unique labels of
        that version.
    """
    df_combined['x'] = reduced[:, 0]
    df_combined['y'] = reduced[:, 1]

    # Result key -> value stored in the "version" column for that dataset.
    version_tags = {"real": "real", "es-digital-seq": "es_digital_seq"}

    dfs_reduced = {}
    unique_subsets = {}
    for key, tag in version_tags.items():
        frame = df_combined[df_combined["version"] == tag].copy()
        dfs_reduced[key] = frame
        # Unique labels (subsets) present in this version, sorted.
        unique_subsets[key] = sorted(frame['label'].unique().tolist())
    return dfs_reduced, unique_subsets
def subset_selectors(unique_subsets: dict):
    """Render one multiselect per version and return the chosen subsets.

    Each widget offers every subset of its version and starts with all of
    them selected.

    Args:
        unique_subsets: Dict with ``"real"`` and ``"es-digital-seq"`` keys,
            each holding the list of available subset labels.

    Returns:
        Dict with the same two keys, each holding the list returned by the
        corresponding ``st.multiselect``.
    """
    selected_subsets_real = st.multiselect(
        "Seleccione subsets para visualizar (Real):",
        options=unique_subsets["real"],
        default=unique_subsets["real"],
    )
    # The label is stored as proper UTF-8 ("Sintético") so the accent renders
    # correctly in the widget.
    selected_subsets_es_digital_seq = st.multiselect(
        "Seleccione subsets para visualizar (Sintético):",
        options=unique_subsets["es-digital-seq"],
        default=unique_subsets["es-digital-seq"],
    )
    return {
        "real": selected_subsets_real,
        "es-digital-seq": selected_subsets_es_digital_seq,
    }
def create_figure(dfs_reduced, selected_subsets: dict, color_maps: dict):
    """Build the 600x600 Bokeh figure containing both embedding versions.

    The ``"real"`` version is drawn with circle markers and the
    ``"es-digital-seq"`` version with square markers, in that order. The
    legend is placed at the top right and clicking an entry hides its glyphs.

    Args:
        dfs_reduced: Per-version DataFrames with the reduced coordinates.
        selected_subsets: Per-version lists of labels to draw.
        color_maps: Per-version label -> colour mappings.

    Returns:
        The populated Bokeh figure.
    """
    fig = figure(width=600, height=600, tooltips=TOOLTIPS, title="")

    # (version key, marker) pairs, in drawing order.
    layers = (("real", "circle"), ("es-digital-seq", "square"))
    for version, marker in layers:
        add_dataset_to_fig(
            fig,
            dfs_reduced[version],
            selected_subsets[version],
            marker=marker,
            color_mapping=color_maps[version],
        )

    fig.legend.location = "top_right"
    fig.legend.click_policy = "hide"
    return fig
def main():
    """Run the app: load, reduce, let the user filter, and plot.

    Steps, in order: render the page header, load both embedding frames and
    tag each with a ``version`` column, reduce the concatenated ``dim_*``
    columns to 2-D, split the result back per version, read the subset
    selection widgets, and display the Bokeh chart.
    """
    config_style()

    embeddings_dfs = load_embeddings()
    real_df = embeddings_dfs["real"]
    synthetic_df = embeddings_dfs["es-digital-seq"]
    # Tag every row with its origin so the frames can be separated again
    # after the joint reduction.
    real_df["version"] = "real"
    synthetic_df["version"] = "es_digital_seq"

    # Embedding columns are taken from the "real" frame only.
    embedding_cols = [name for name in real_df.columns if name.startswith("dim_")]

    # Reduce both versions together so they share one 2-D space.
    df_combined = pd.concat([real_df, synthetic_df], ignore_index=True)
    reduced = reducer_selector(df_combined, embedding_cols)

    dfs_reduced, unique_subsets = split_versions(df_combined, reduced)
    selected_subsets = subset_selectors(unique_subsets)
    color_maps = get_color_maps(selected_subsets)

    chart = create_figure(dfs_reduced, selected_subsets, color_maps)
    st.bokeh_chart(chart)


if __name__ == "__main__":
    main()