# Embeddings / app.py — de-Rodrigo's HuggingFace Space, commit 757102e ("Donut Ready"), 19.1 kB
import streamlit as st
import pandas as pd
import numpy as np
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, DataTable, TableColumn, CustomJS, Select, Button
from bokeh.layouts import column
from bokeh.palettes import Reds9, Blues9, Oranges9, Purples9, Greys9, BuGn9, Greens9
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import io
import ot
# HTML template for Bokeh hover tooltips: shows the hovered point's thumbnail
# (the "img" column of the ColumnDataSource, rendered unescaped via {safe})
# next to its cluster label.
TOOLTIPS = """
<div>
<div>
<img src="@img{safe}" style="width:128px; height:auto; float: left; margin: 0px 15px 15px 0px;" alt="@img" border="2"></img>
</div>
<div>
<span style="font-size: 17px; font-weight: bold;">@label</span>
</div>
</div>
"""
def config_style():
    """Inject the app's CSS overrides and render the main page title."""
    css = """
<style>
.main-title { font-size: 50px; color: #4CAF50; text-align: center; }
.sub-title { font-size: 30px; color: #555; }
.custom-text { font-size: 18px; line-height: 1.5; }
.bk-legend {
max-height: 200px;
overflow-y: auto;
}
</style>
"""
    st.markdown(css, unsafe_allow_html=True)
    st.markdown('<h1 class="main-title">Merit Embeddings 馃帓馃搩馃弳</h1>', unsafe_allow_html=True)
# Loads the data and assigns versions uniformly
def load_embeddings(model):
    """Load embedding CSVs for *model* and tag each row with version/source.

    Returns a dict {"real": DataFrame, "synthetic": DataFrame}, or None (after
    showing a Streamlit error) when the model name is unknown.
    """
    if model == "Donut":
        # Order matters: it fixes the row order of the concatenated synthetic
        # frame (seq, line, paragraph, rotation, zoom, render — as before).
        synthetic_sources = [
            "es-digital-seq",
            "es-digital-line-degradation-seq",
            "es-digital-paragraph-degradation-seq",
            "es-digital-rotation-degradation-seq",
            "es-digital-zoom-degradation-seq",
            "es-render-seq",
        ]
        df_real = pd.read_csv("data/donut_de_Rodrigo_merit_secret_all_embeddings.csv")
        df_real["version"] = "real"
        synth_frames = []
        for source in synthetic_sources:
            # Every synthetic CSV follows the same naming pattern.
            df = pd.read_csv(f"data/donut_de_Rodrigo_merit_{source}_embeddings.csv")
            df["version"] = "synthetic"
            df["source"] = source
            synth_frames.append(df)
        return {"real": df_real, "synthetic": pd.concat(synth_frames, ignore_index=True)}
    elif model == "Idefics2":
        df_real = pd.read_csv("data/idefics2_de_Rodrigo_merit_secret_britanico_embeddings.csv")
        df_seq = pd.read_csv("data/idefics2_de_Rodrigo_merit_es-digital-seq_embeddings.csv")
        df_real["version"] = "real"
        df_seq["version"] = "synthetic"
        df_seq["source"] = "es-digital-seq"
        return {"real": df_real, "synthetic": df_seq}
    else:
        st.error("Modelo no reconocido")
        return None
# Dimensionality-reduction selection
def reducer_selector(df_combined, embedding_cols):
    """Let the user pick PCA or t-SNE and return the 2-D projection
    of the embedding columns of *df_combined*."""
    method = st.selectbox("Select Dimensionality Reduction Method:", options=["PCA", "t-SNE"])
    embeddings = df_combined[embedding_cols].values
    reducer = (
        PCA(n_components=2)
        if method == "PCA"
        else TSNE(n_components=2, random_state=42, perplexity=30, learning_rate=200)
    )
    return reducer.fit_transform(embeddings)
# Adds the real data (one renderer per label)
def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping, group_label):
    """Plot one scatter renderer per label; legend entries stay granular
    as "<label> (<group_label>)". Returns {legend key: renderer}."""
    renderers = {}
    for label in selected_labels:
        cluster = df[df['label'] == label]
        if cluster.empty:
            continue
        src = ColumnDataSource(data=dict(
            x=cluster['x'],
            y=cluster['y'],
            label=cluster['label'],
            img=cluster.get('img', ""),
        ))
        shade = color_mapping[label]
        shared = dict(source=src, fill_color=shade, line_color=shade,
                      legend_label=f"{label} ({group_label})")
        if marker == "circle":
            glyph = fig.circle('x', 'y', size=10, **shared)
        elif marker == "square":
            glyph = fig.square('x', 'y', size=10, **shared)
        elif marker == "triangle":
            glyph = fig.triangle('x', 'y', size=12, **shared)
        renderers[f"{label} ({group_label})"] = glyph
    return renderers
# Plots synthetic data granularly (per label) while grouping the legend by source
def add_synthetic_dataset_to_fig(fig, df, labels, marker, color_mapping, group_label):
    """Plot one renderer per label using the per-label granular color, but a
    single legend entry (*group_label*, the source name) so Bokeh groups them.

    Returns {f"{label} ({group_label})": renderer}.
    """
    # Marker names intentionally match Bokeh figure glyph method names, so a
    # single getattr dispatch replaces the former 8-branch if/elif chain.
    marker_sizes = {
        "square": 10,
        "diamond": 10,
        "triangle": 12,
        "inverted_triangle": 12,
        "cross": 12,
        "x": 12,
        "asterisk": 12,
    }
    if marker in marker_sizes:
        glyph_method = getattr(fig, marker)
        size = marker_sizes[marker]
    else:
        # Same fallback as before: unknown markers render as circles.
        glyph_method = fig.circle
        size = 10
    renderers = {}
    for label in labels:
        subset = df[df['label'] == label]
        if subset.empty:
            continue
        source_obj = ColumnDataSource(data=dict(
            x=subset['x'],
            y=subset['y'],
            label=subset['label'],
            img=subset.get('img', ""),
        ))
        color = color_mapping[label]
        r = glyph_method('x', 'y', size=size, source=source_obj,
                         fill_color=color, line_color=color,
                         legend_label=group_label)
        renderers[f"{label} ({group_label})"] = r
    return renderers
def get_color_maps(unique_subsets):
    """Build per-label color maps: Reds for real labels, and one source-specific
    palette per synthetic source (granular per label within each source).

    Returns {"real": {label: color}, "synthetic": {source: {label: color}}}.
    """
    def _cycle(palette, n):
        # Repeat the 9-color palette when more than 9 labels are needed.
        return palette[:n] if n <= 9 else (palette * ((n // 9) + 1))[:n]

    # Palette per synthetic source; unknown sources fall back to Blues9.
    source_palettes = {
        "es-digital-seq": Blues9,
        "es-digital-line-degradation-seq": Purples9,
        "es-digital-paragraph-degradation-seq": BuGn9,
        "es-digital-rotation-degradation-seq": Greys9,
        "es-digital-zoom-degradation-seq": Oranges9,
        "es-render-seq": Greens9,
    }
    color_map = {}
    real_labels = sorted(unique_subsets["real"])
    red_palette = _cycle(Reds9, len(real_labels))
    color_map["real"] = {label: red_palette[i] for i, label in enumerate(real_labels)}
    color_map["synthetic"] = {}
    for source, labels in unique_subsets["synthetic"].items():
        palette = _cycle(source_palettes.get(source, Blues9), len(labels))
        color_map["synthetic"][source] = {label: palette[i] for i, label in enumerate(sorted(labels))}
    return color_map
def split_versions(df_combined, reduced):
    """Attach the 2-D projection as x/y columns and split by version.

    Returns (df_dict, unique_subsets) where df_dict holds the "real" and
    "synthetic" frames, and unique_subsets holds the sorted real labels plus
    a {source: sorted labels} mapping for the synthetic data.
    """
    df_combined['x'] = reduced[:, 0]
    df_combined['y'] = reduced[:, 1]
    df_real = df_combined[df_combined["version"] == "real"].copy()
    df_synth = df_combined[df_combined["version"] == "synthetic"].copy()
    # Real labels: a flat sorted list; synthetic labels: grouped by source.
    unique_real = sorted(df_real['label'].unique().tolist())
    unique_synth = {
        src: sorted(df_synth.loc[df_synth["source"] == src, 'label'].unique().tolist())
        for src in df_synth["source"].unique()
    }
    return (
        {"real": df_real, "synthetic": df_synth},
        {"real": unique_real, "synthetic": unique_synth},
    )
def create_figure(dfs, unique_subsets, color_maps, model_name):
    """Build the scatter figure: real clusters (circles, granular legend) plus
    synthetic clusters (per-source markers, legend grouped by source).

    Returns (figure, real renderers dict, synthetic renderers dict)."""
    fig = figure(width=600, height=600, tools="wheel_zoom,pan,reset,save",
                 active_scroll="wheel_zoom", tooltips=TOOLTIPS, title="")
    # Real data keeps one legend entry per label.
    real_renderers = add_dataset_to_fig(
        fig, dfs["real"], unique_subsets["real"],
        marker="circle", color_mapping=color_maps["real"], group_label="Real")
    # Marker shape assigned to each synthetic source.
    source_markers = {
        "es-digital-paragraph-degradation-seq": "x",
        "es-digital-line-degradation-seq": "cross",
        "es-digital-seq": "triangle",
        "es-digital-rotation-degradation-seq": "diamond",
        "es-digital-zoom-degradation-seq": "asterisk",
        "es-render-seq": "inverted_triangle",
    }
    # Synthetic data: plotted per label, legend grouped by source.
    synthetic_renderers = {}
    synth_df = dfs["synthetic"]
    for source, source_labels in unique_subsets["synthetic"].items():
        per_source = synth_df[synth_df["source"] == source]
        synthetic_renderers.update(add_synthetic_dataset_to_fig(
            fig, per_source, source_labels,
            marker=source_markers.get(source, "square"),  # default "square"
            color_mapping=color_maps["synthetic"][source],
            group_label=source))
    fig.legend.location = "top_right"
    fig.legend.click_policy = "hide"
    fig.legend.visible = st.checkbox("Show Legend", value=False, key=f"legend_{model_name}")
    return fig, real_renderers, synthetic_renderers
# Computes the center of each cluster (per group)
def calculate_cluster_centers(df, labels):
    """Return {label: (mean x, mean y)}, skipping labels absent from *df*."""
    centers = {}
    for label in labels:
        cluster = df[df['label'] == label]
        if cluster.empty:
            continue
        centers[label] = (cluster['x'].mean(), cluster['y'].mean())
    return centers
# Computes the Wasserstein distance of each synthetic subset to every real cluster
# (per synthetic cluster and globally per source)
def compute_wasserstein_distances_synthetic_individual(synthetic_df: pd.DataFrame, df_real: pd.DataFrame, real_labels: list) -> pd.DataFrame:
    """Exact Wasserstein-1 (EMD) distances from synthetic clusters to real ones.

    Rows of the returned frame are "<label> (<source>)" for each synthetic
    cluster plus "Global (<source>)" for each source pooled over its labels;
    columns are the real labels.
    """
    def _distances_to_real(points):
        # EMD between *points* and each real cluster, with uniform weights
        # on both empirical distributions and Euclidean ground cost.
        weights = np.ones(points.shape[0]) / points.shape[0]
        row = {}
        for real_label in real_labels:
            real_points = df_real[df_real['label'] == real_label][['x', 'y']].values
            real_weights = np.ones(real_points.shape[0]) / real_points.shape[0]
            cost = ot.dist(points, real_points, metric='euclidean')
            row[real_label] = ot.emd2(weights, real_weights, cost)
        return row

    distances = {}
    # One row per (source, label) synthetic cluster.
    for (source, label), group in synthetic_df.groupby(['source', 'label']):
        distances[f"{label} ({source})"] = _distances_to_real(group[['x', 'y']].values)
    # One "Global" row per source, pooling all of its labels.
    for source, group in synthetic_df.groupby('source'):
        distances[f"Global ({source})"] = _distances_to_real(group[['x', 'y']].values)
    return pd.DataFrame(distances).T
def create_table(df_distances):
    """Turn the distance matrix into a Bokeh DataTable, appending Min./Mean/Max
    summary rows. Returns (DataTable, summary DataFrame, ColumnDataSource)."""
    df_table = df_distances.copy()
    df_table.reset_index(inplace=True)
    df_table.rename(columns={'index': 'Synthetic'}, inplace=True)
    value_cols = [c for c in df_table.columns if c != "Synthetic"]
    # Summary rows are computed over the original rows only (before concat).
    summary_rows = []
    for row_name, agg in (("Min.", "min"), ("Mean", "mean"), ("Max.", "max")):
        row = {"Synthetic": row_name}
        for col in value_cols:
            row[col] = getattr(df_table[col], agg)()
        summary_rows.append(row)
    df_table = pd.concat([df_table, pd.DataFrame(summary_rows)], ignore_index=True)
    source_table = ColumnDataSource(df_table)
    columns = [TableColumn(field='Synthetic', title='Synthetic')]
    columns.extend(TableColumn(field=c, title=c) for c in value_cols)
    # Header (30px) plus one 28px row per table line.
    total_height = 30 + len(df_table) * 28
    data_table = DataTable(source=source_table, columns=columns,
                           sizing_mode='stretch_width', height=total_height)
    return data_table, df_table, source_table
def run_model(model_name):
    """Full Streamlit pipeline for one model tab: load embeddings, project to
    2-D, plot real vs. synthetic clusters, show the Wasserstein distance table
    with an interactive center-to-center line, and offer an Excel export."""
    embeddings = load_embeddings(model_name)
    if embeddings is None:
        return
    # Embedding dimensions are stored in columns named dim_0, dim_1, ...
    embedding_cols = [col for col in embeddings["real"].columns if col.startswith("dim_")]
    df_combined = pd.concat(list(embeddings.values()), ignore_index=True)
    st.markdown('<h6 class="sub-title">Select Dimensionality Reduction Method</h6>', unsafe_allow_html=True)
    reduction_method = st.selectbox("", options=["t-SNE", "PCA"], key=f"reduction_{model_name}")
    if reduction_method == "PCA":
        reducer = PCA(n_components=2)
    else:
        reducer = TSNE(n_components=2, random_state=42, perplexity=30, learning_rate=200)
    reduced = reducer.fit_transform(df_combined[embedding_cols].values)
    dfs_reduced, unique_subsets = split_versions(df_combined, reduced)
    color_maps = get_color_maps(unique_subsets)
    fig, real_renderers, synthetic_renderers = create_figure(dfs_reduced, unique_subsets, color_maps, model_name)
    centers_real = calculate_cluster_centers(dfs_reduced["real"], unique_subsets["real"])
    df_distances = compute_wasserstein_distances_synthetic_individual(dfs_reduced["synthetic"], dfs_reduced["real"], unique_subsets["real"])
    data_table, df_table, source_table = create_table(df_distances)
    # Widgets that drive the synthetic-to-real connector line.
    real_subset_names = list(df_table.columns[1:])
    real_select = Select(title="", value=real_subset_names[0], options=real_subset_names)
    reset_button = Button(label="Reset Colors", button_type="primary")
    # Initially-empty line; the JS callback fills it on table-row selection.
    line_source = ColumnDataSource(data={'x': [], 'y': []})
    fig.line('x', 'y', source=line_source, line_width=2, line_color='black')
    # Cluster centers serialized as plain lists for the CustomJS callbacks.
    real_centers_js = {k: [v[0], v[1]] for k, v in centers_real.items()}
    synthetic_centers = {}
    synth_labels = sorted(dfs_reduced["synthetic"]['label'].unique().tolist())
    for label in synth_labels:
        subset = dfs_reduced["synthetic"][dfs_reduced["synthetic"]['label'] == label]
        synthetic_centers[label] = [subset['x'].mean(), subset['y'].mean()]
    # NOTE(review): table rows are keyed "label (source)" / "Global (source)",
    # but synthetic_centers is keyed by bare label — the JS lookup below may
    # yield undefined for selected rows; verify against a live table selection.
    # Browser-side: draw a line from the selected synthetic cluster's center
    # to the center of the real cluster chosen in the dropdown.
    callback = CustomJS(args=dict(source=source_table, line_source=line_source,
                                  synthetic_centers=synthetic_centers,
                                  real_centers=real_centers_js,
                                  real_select=real_select),
                        code="""
var selected = source.selected.indices;
if (selected.length > 0) {
var idx = selected[0];
var data = source.data;
var synth_label = data['Synthetic'][idx];
var real_label = real_select.value;
var syn_coords = synthetic_centers[synth_label];
var real_coords = real_centers[real_label];
line_source.data = {'x': [syn_coords[0], real_coords[0]], 'y': [syn_coords[1], real_coords[1]]};
line_source.change.emit();
} else {
line_source.data = {'x': [], 'y': []};
line_source.change.emit();
}
""")
    source_table.selected.js_on_change('indices', callback)
    real_select.js_on_change('value', callback)
    # Reset button simply clears the connector line.
    reset_callback = CustomJS(args=dict(line_source=line_source),
                              code="""
line_source.data = {'x': [], 'y': []};
line_source.change.emit();
""")
    reset_button.js_on_event("button_click", reset_callback)
    # NOTE(review): to_excel needs an Excel writer backend (openpyxl) at
    # runtime — confirm it is in the Space's requirements.
    buffer = io.BytesIO()
    df_table.to_excel(buffer, index=False)
    buffer.seek(0)
    layout = column(fig, column(real_select, reset_button, data_table))
    st.bokeh_chart(layout, use_container_width=True)
    st.download_button(
        label="Export Table",
        data=buffer,
        file_name=f"cluster_distances_{model_name}.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        key=f"download_button_excel_{model_name}"
    )
def main():
    """Render the page: global styling, then one tab per supported model."""
    config_style()
    donut_tab, idefics_tab = st.tabs(["Donut", "Idefics2"])
    with donut_tab:
        st.markdown('<h2 class="sub-title">Donut 馃</h2>', unsafe_allow_html=True)
        run_model("Donut")
    with idefics_tab:
        st.markdown('<h2 class="sub-title">Idefics2 馃</h2>', unsafe_allow_html=True)
        run_model("Idefics2")


if __name__ == "__main__":
    main()