Spaces:
Running
Running
Commit
路
09537d6
1
Parent(s):
c85d8a8
Include Pretraining Datasets
Browse files
app.py
CHANGED
@@ -40,15 +40,18 @@ def config_style():
|
|
40 |
""", unsafe_allow_html=True)
|
41 |
st.markdown('<h1 class="main-title">Merit Embeddings 馃帓馃搩馃弳</h1>', unsafe_allow_html=True)
|
42 |
|
43 |
-
def load_embeddings(model, version):
|
44 |
if model == "Donut":
|
45 |
df_real = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_secret_all_embeddings.csv")
|
46 |
-
df_par = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-paragraph-degradation-
|
47 |
-
df_line = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-line-degradation-
|
48 |
-
df_seq = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-
|
49 |
-
df_rot = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-rotation-degradation-
|
50 |
-
df_zoom = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-zoom-degradation-
|
51 |
-
df_render = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-render-
|
|
|
|
|
|
|
52 |
df_real["version"] = "real"
|
53 |
df_par["version"] = "synthetic"
|
54 |
df_line["version"] = "synthetic"
|
@@ -56,23 +59,32 @@ def load_embeddings(model, version):
|
|
56 |
df_rot["version"] = "synthetic"
|
57 |
df_zoom["version"] = "synthetic"
|
58 |
df_render["version"] = "synthetic"
|
|
|
59 |
|
|
|
60 |
df_par["source"] = "es-digital-paragraph-degradation-seq"
|
61 |
df_line["source"] = "es-digital-line-degradation-seq"
|
62 |
df_seq["source"] = "es-digital-seq"
|
63 |
df_rot["source"] = "es-digital-rotation-degradation-seq"
|
64 |
df_zoom["source"] = "es-digital-zoom-degradation-seq"
|
65 |
df_render["source"] = "es-render-seq"
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
elif model == "Idefics2":
|
69 |
df_real = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_secret_britanico_embeddings.csv")
|
70 |
-
df_par = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-paragraph-degradation-
|
71 |
-
df_line = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-line-degradation-
|
72 |
-
df_seq = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-
|
73 |
-
df_rot = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-rotation-degradation-
|
74 |
-
df_zoom = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-zoom-degradation-
|
75 |
-
df_render = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-render-
|
|
|
|
|
76 |
df_real["version"] = "real"
|
77 |
df_par["version"] = "synthetic"
|
78 |
df_line["version"] = "synthetic"
|
@@ -80,6 +92,7 @@ def load_embeddings(model, version):
|
|
80 |
df_rot["version"] = "synthetic"
|
81 |
df_zoom["version"] = "synthetic"
|
82 |
df_render["version"] = "synthetic"
|
|
|
83 |
|
84 |
df_par["source"] = "es-digital-paragraph-degradation-seq"
|
85 |
df_line["source"] = "es-digital-line-degradation-seq"
|
@@ -87,27 +100,38 @@ def load_embeddings(model, version):
|
|
87 |
df_rot["source"] = "es-digital-rotation-degradation-seq"
|
88 |
df_zoom["source"] = "es-digital-zoom-degradation-seq"
|
89 |
df_render["source"] = "es-render-seq"
|
90 |
-
|
|
|
|
|
|
|
|
|
91 |
|
92 |
else:
|
93 |
st.error("Modelo no reconocido")
|
94 |
return None
|
95 |
|
|
|
|
|
96 |
def split_versions(df_combined, reduced):
|
97 |
-
#
|
98 |
if reduced.shape[1] == 2:
|
99 |
df_combined['x'] = reduced[:, 0]
|
100 |
df_combined['y'] = reduced[:, 1]
|
101 |
df_real = df_combined[df_combined["version"] == "real"].copy()
|
102 |
df_synth = df_combined[df_combined["version"] == "synthetic"].copy()
|
|
|
|
|
103 |
unique_real = sorted(df_real['label'].unique().tolist())
|
104 |
unique_synth = {}
|
105 |
for source in df_synth["source"].unique():
|
106 |
unique_synth[source] = sorted(df_synth[df_synth["source"] == source]['label'].unique().tolist())
|
107 |
-
|
108 |
-
|
|
|
|
|
109 |
return df_dict, unique_subsets
|
110 |
|
|
|
111 |
def get_embedding_from_df(df):
|
112 |
# Retorna el embedding completo (4 dimensiones en este caso) guardado en la columna 'embedding'
|
113 |
if 'embedding' in df.columns:
|
@@ -212,11 +236,15 @@ def create_table(df_distances):
|
|
212 |
return data_table, df_table, source_table
|
213 |
|
214 |
def create_figure(dfs, unique_subsets, color_maps, model_name):
|
215 |
-
# Se crea
|
216 |
fig = figure(width=600, height=600, tools="wheel_zoom,pan,reset,save", active_scroll="wheel_zoom", tooltips=TOOLTIPS, title="")
|
|
|
|
|
217 |
real_renderers = add_dataset_to_fig(fig, dfs["real"], unique_subsets["real"],
|
218 |
marker="circle", color_mapping=color_maps["real"],
|
219 |
group_label="Real")
|
|
|
|
|
220 |
marker_mapping = {
|
221 |
"es-digital-paragraph-degradation-seq": "x",
|
222 |
"es-digital-line-degradation-seq": "cross",
|
@@ -236,11 +264,17 @@ def create_figure(dfs, unique_subsets, color_maps, model_name):
|
|
236 |
group_label=source)
|
237 |
synthetic_renderers.update(renderers)
|
238 |
|
|
|
|
|
|
|
|
|
|
|
239 |
fig.legend.location = "top_right"
|
240 |
fig.legend.click_policy = "hide"
|
241 |
show_legend = st.checkbox("Show Legend", value=False, key=f"legend_{model_name}")
|
242 |
fig.legend.visible = show_legend
|
243 |
-
return fig, real_renderers, synthetic_renderers
|
|
|
244 |
|
245 |
def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping, group_label):
|
246 |
renderers = {}
|
@@ -343,8 +377,15 @@ def get_color_maps(unique_subsets):
|
|
343 |
else:
|
344 |
palette = Blues9[:len(labels)] if len(labels) <= 9 else (Blues9 * ((len(labels)//9)+1))[:len(labels)]
|
345 |
color_map["synthetic"][source] = {label: palette[i] for i, label in enumerate(sorted(labels))}
|
|
|
|
|
|
|
|
|
|
|
|
|
346 |
return color_map
|
347 |
|
|
|
348 |
def calculate_cluster_centers(df, labels):
|
349 |
centers = {}
|
350 |
for label in labels:
|
@@ -485,8 +526,12 @@ def optimize_tsne_params(df_combined, embedding_cols, df_f1, distance_metric):
|
|
485 |
|
486 |
def run_model(model_name):
|
487 |
version = st.selectbox("Select Model Version:", options=["vanilla", "finetuned_real"], key=f"version_{model_name}")
|
|
|
|
|
|
|
|
|
488 |
|
489 |
-
embeddings = load_embeddings(model_name, version)
|
490 |
if embeddings is None:
|
491 |
return
|
492 |
embedding_cols = [col for col in embeddings["real"].columns if col.startswith("dim_")]
|
@@ -562,10 +607,13 @@ def run_model(model_name):
|
|
562 |
reset_button = Button(label="Reset Colors", button_type="primary")
|
563 |
line_source = ColumnDataSource(data={'x': [], 'y': []})
|
564 |
|
565 |
-
# Si el embedding es 2D se crea el scatter plot de embeddings;
|
566 |
-
# dado que con PCA ahora usamos 4 dimensiones, este bloque se omite para PCA
|
567 |
if (reduction_method == "t-SNE" and N_COMPONENTS == 2) or (reduction_method == "PCA" and N_COMPONENTS == 2):
|
568 |
-
fig, real_renderers, synthetic_renderers = create_figure(
|
|
|
|
|
|
|
|
|
|
|
569 |
fig.line('x', 'y', source=line_source, line_width=2, line_color='black')
|
570 |
centers_real = calculate_cluster_centers(result["dfs_reduced"]["real"], result["unique_subsets"]["real"])
|
571 |
real_centers_js = {k: [v[0], v[1]] for k, v in centers_real.items()}
|
@@ -633,4 +681,4 @@ def main():
|
|
633 |
run_model("Idefics2")
|
634 |
|
635 |
if __name__ == "__main__":
|
636 |
-
main()
|
|
|
40 |
""", unsafe_allow_html=True)
|
41 |
st.markdown('<h1 class="main-title">Merit Embeddings 馃帓馃搩馃弳</h1>', unsafe_allow_html=True)
|
42 |
|
43 |
+
def load_embeddings(model, version, embedding_prefix):
|
44 |
if model == "Donut":
|
45 |
df_real = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_secret_all_embeddings.csv")
|
46 |
+
df_par = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-paragraph-degradation-seq_{embedding_prefix}embeddings.csv")
|
47 |
+
df_line = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-line-degradation-seq_{embedding_prefix}embeddings.csv")
|
48 |
+
df_seq = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-seq_{embedding_prefix}embeddings.csv")
|
49 |
+
df_rot = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-rotation-degradation-seq_{embedding_prefix}embeddings.csv")
|
50 |
+
df_zoom = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-zoom-degradation-seq_{embedding_prefix}embeddings.csv")
|
51 |
+
df_render = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-render-seq_{embedding_prefix}embeddings.csv")
|
52 |
+
df_pretratrained = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_aux_IIT-CDIP_{embedding_prefix}embeddings.csv")
|
53 |
+
|
54 |
+
# Asignar etiquetas de versi贸n
|
55 |
df_real["version"] = "real"
|
56 |
df_par["version"] = "synthetic"
|
57 |
df_line["version"] = "synthetic"
|
|
|
59 |
df_rot["version"] = "synthetic"
|
60 |
df_zoom["version"] = "synthetic"
|
61 |
df_render["version"] = "synthetic"
|
62 |
+
df_pretratrained["version"] = "pretrained"
|
63 |
|
64 |
+
# Asignar fuente (source)
|
65 |
df_par["source"] = "es-digital-paragraph-degradation-seq"
|
66 |
df_line["source"] = "es-digital-line-degradation-seq"
|
67 |
df_seq["source"] = "es-digital-seq"
|
68 |
df_rot["source"] = "es-digital-rotation-degradation-seq"
|
69 |
df_zoom["source"] = "es-digital-zoom-degradation-seq"
|
70 |
df_render["source"] = "es-render-seq"
|
71 |
+
# Si lo requieres, puedes asignar tambi茅n una fuente para pretrained
|
72 |
+
df_pretratrained["source"] = "pretrained"
|
73 |
+
|
74 |
+
return {"real": df_real,
|
75 |
+
"synthetic": pd.concat([df_seq, df_line, df_par, df_rot, df_zoom, df_render], ignore_index=True),
|
76 |
+
"pretrained": df_pretratrained}
|
77 |
|
78 |
elif model == "Idefics2":
|
79 |
df_real = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_secret_britanico_embeddings.csv")
|
80 |
+
df_par = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-paragraph-degradation-seq_{embedding_prefix}embeddings.csv")
|
81 |
+
df_line = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-line-degradation-seq_{embedding_prefix}embeddings.csv")
|
82 |
+
df_seq = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-seq_{embedding_prefix}embeddings.csv")
|
83 |
+
df_rot = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-rotation-degradation-seq_{embedding_prefix}embeddings.csv")
|
84 |
+
df_zoom = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-zoom-degradation-seq_{embedding_prefix}embeddings.csv")
|
85 |
+
df_render = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-render-seq_{embedding_prefix}embeddings.csv")
|
86 |
+
df_pretratrained = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_pretrained_{embedding_prefix}embeddings.csv")
|
87 |
+
|
88 |
df_real["version"] = "real"
|
89 |
df_par["version"] = "synthetic"
|
90 |
df_line["version"] = "synthetic"
|
|
|
92 |
df_rot["version"] = "synthetic"
|
93 |
df_zoom["version"] = "synthetic"
|
94 |
df_render["version"] = "synthetic"
|
95 |
+
df_pretratrained["version"] = "pretrained"
|
96 |
|
97 |
df_par["source"] = "es-digital-paragraph-degradation-seq"
|
98 |
df_line["source"] = "es-digital-line-degradation-seq"
|
|
|
100 |
df_rot["source"] = "es-digital-rotation-degradation-seq"
|
101 |
df_zoom["source"] = "es-digital-zoom-degradation-seq"
|
102 |
df_render["source"] = "es-render-seq"
|
103 |
+
df_pretratrained["source"] = "pretrained"
|
104 |
+
|
105 |
+
return {"real": df_real,
|
106 |
+
"synthetic": pd.concat([df_seq, df_line, df_par, df_rot, df_zoom, df_render], ignore_index=True),
|
107 |
+
"pretrained": df_pretratrained}
|
108 |
|
109 |
else:
|
110 |
st.error("Modelo no reconocido")
|
111 |
return None
|
112 |
|
113 |
+
|
114 |
+
|
115 |
def split_versions(df_combined, reduced):
|
116 |
+
# Asignar las coordenadas si la reducci贸n es 2D
|
117 |
if reduced.shape[1] == 2:
|
118 |
df_combined['x'] = reduced[:, 0]
|
119 |
df_combined['y'] = reduced[:, 1]
|
120 |
df_real = df_combined[df_combined["version"] == "real"].copy()
|
121 |
df_synth = df_combined[df_combined["version"] == "synthetic"].copy()
|
122 |
+
df_pretrained = df_combined[df_combined["version"] == "pretrained"].copy()
|
123 |
+
|
124 |
unique_real = sorted(df_real['label'].unique().tolist())
|
125 |
unique_synth = {}
|
126 |
for source in df_synth["source"].unique():
|
127 |
unique_synth[source] = sorted(df_synth[df_synth["source"] == source]['label'].unique().tolist())
|
128 |
+
unique_pretrained = sorted(df_pretrained['label'].unique().tolist())
|
129 |
+
|
130 |
+
df_dict = {"real": df_real, "synthetic": df_synth, "pretrained": df_pretrained}
|
131 |
+
unique_subsets = {"real": unique_real, "synthetic": unique_synth, "pretrained": unique_pretrained}
|
132 |
return df_dict, unique_subsets
|
133 |
|
134 |
+
|
135 |
def get_embedding_from_df(df):
|
136 |
# Retorna el embedding completo (4 dimensiones en este caso) guardado en la columna 'embedding'
|
137 |
if 'embedding' in df.columns:
|
|
|
236 |
return data_table, df_table, source_table
|
237 |
|
238 |
def create_figure(dfs, unique_subsets, color_maps, model_name):
|
239 |
+
# Se crea el plot para el embedding reducido (asumiendo que es 2D)
|
240 |
fig = figure(width=600, height=600, tools="wheel_zoom,pan,reset,save", active_scroll="wheel_zoom", tooltips=TOOLTIPS, title="")
|
241 |
+
|
242 |
+
# Renderizar datos reales
|
243 |
real_renderers = add_dataset_to_fig(fig, dfs["real"], unique_subsets["real"],
|
244 |
marker="circle", color_mapping=color_maps["real"],
|
245 |
group_label="Real")
|
246 |
+
|
247 |
+
# Renderizar datos sint茅ticos (por fuente)
|
248 |
marker_mapping = {
|
249 |
"es-digital-paragraph-degradation-seq": "x",
|
250 |
"es-digital-line-degradation-seq": "cross",
|
|
|
264 |
group_label=source)
|
265 |
synthetic_renderers.update(renderers)
|
266 |
|
267 |
+
# Agregar el subset pretrained (se puede usar un marcador distinto, por ejemplo, "triangle")
|
268 |
+
pretrained_renderers = add_dataset_to_fig(fig, dfs["pretrained"], unique_subsets["pretrained"],
|
269 |
+
marker="triangle", color_mapping=color_maps["pretrained"],
|
270 |
+
group_label="Pretrained")
|
271 |
+
|
272 |
fig.legend.location = "top_right"
|
273 |
fig.legend.click_policy = "hide"
|
274 |
show_legend = st.checkbox("Show Legend", value=False, key=f"legend_{model_name}")
|
275 |
fig.legend.visible = show_legend
|
276 |
+
return fig, real_renderers, synthetic_renderers, pretrained_renderers
|
277 |
+
|
278 |
|
279 |
def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping, group_label):
|
280 |
renderers = {}
|
|
|
377 |
else:
|
378 |
palette = Blues9[:len(labels)] if len(labels) <= 9 else (Blues9 * ((len(labels)//9)+1))[:len(labels)]
|
379 |
color_map["synthetic"][source] = {label: palette[i] for i, label in enumerate(sorted(labels))}
|
380 |
+
|
381 |
+
# Asignar colores al subset pretrained usando, por ejemplo, la paleta Purples9
|
382 |
+
num_pretrained = len(unique_subsets["pretrained"])
|
383 |
+
purple_palette = Purples9[:num_pretrained] if num_pretrained <= 9 else (Purples9 * ((num_pretrained // 9) + 1))[:num_pretrained]
|
384 |
+
color_map["pretrained"] = {label: purple_palette[i] for i, label in enumerate(sorted(unique_subsets["pretrained"]))}
|
385 |
+
|
386 |
return color_map
|
387 |
|
388 |
+
|
389 |
def calculate_cluster_centers(df, labels):
|
390 |
centers = {}
|
391 |
for label in labels:
|
|
|
526 |
|
527 |
def run_model(model_name):
|
528 |
version = st.selectbox("Select Model Version:", options=["vanilla", "finetuned_real"], key=f"version_{model_name}")
|
529 |
+
# Nuevo selector para el c贸mputo del embedding
|
530 |
+
embedding_computation = st.selectbox("驴C贸mo se computa el embedding?", options=["weighted", "averaged"], key=f"embedding_method_{model_name}")
|
531 |
+
# Se asigna el prefijo correspondiente
|
532 |
+
prefijo_embedding = "weighted_" if embedding_computation == "weighted" else ""
|
533 |
|
534 |
+
embeddings = load_embeddings(model_name, version, prefijo_embedding)
|
535 |
if embeddings is None:
|
536 |
return
|
537 |
embedding_cols = [col for col in embeddings["real"].columns if col.startswith("dim_")]
|
|
|
607 |
reset_button = Button(label="Reset Colors", button_type="primary")
|
608 |
line_source = ColumnDataSource(data={'x': [], 'y': []})
|
609 |
|
|
|
|
|
610 |
if (reduction_method == "t-SNE" and N_COMPONENTS == 2) or (reduction_method == "PCA" and N_COMPONENTS == 2):
|
611 |
+
fig, real_renderers, synthetic_renderers, pretrained_renderers = create_figure(
|
612 |
+
result["dfs_reduced"],
|
613 |
+
result["unique_subsets"],
|
614 |
+
get_color_maps(result["unique_subsets"]),
|
615 |
+
model_name
|
616 |
+
)
|
617 |
fig.line('x', 'y', source=line_source, line_width=2, line_color='black')
|
618 |
centers_real = calculate_cluster_centers(result["dfs_reduced"]["real"], result["unique_subsets"]["real"])
|
619 |
real_centers_js = {k: [v[0], v[1]] for k, v in centers_real.items()}
|
|
|
681 |
run_model("Idefics2")
|
682 |
|
683 |
if __name__ == "__main__":
|
684 |
+
main()
|