de-Rodrigo committed on
Commit
f872421
1 Parent(s): 97ec291

Refactor Code

Browse files
Files changed (1) hide show
  1. app.py +128 -72
app.py CHANGED
@@ -17,6 +17,7 @@ TOOLTIPS = """
17
  </div>
18
  """
19
 
 
20
  def config_style():
21
  st.markdown("""
22
  <style>
@@ -25,17 +26,43 @@ def config_style():
25
  .custom-text { font-size: 18px; line-height: 1.5; }
26
  </style>
27
  """, unsafe_allow_html=True)
28
- st.markdown('<h1 class="main-title">Merit Secret Embeddings 🎒📃🏆</h1>', unsafe_allow_html=True)
29
- st.markdown('<h2 class="sub-title">Donut - Comparación de versiones</h2>', unsafe_allow_html=True)
30
  st.markdown(
31
  """
32
  <p class="custom-text">
33
  Se cargan ambas versiones de los embeddings y se aplica una reducción dimensional sobre el conjunto combinado.
34
- Los puntos de la versión vanilla se muestran como <strong>círculos</strong> (tonos de rojo)
35
- y los de la v2 como <strong>cuadrados</strong> (tonos de azul).
36
  </p>
37
  """, unsafe_allow_html=True)
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping):
40
  for label in selected_labels:
41
  subset = df[df['label'] == label]
@@ -53,90 +80,119 @@ def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping):
53
  fill_color=color, line_color=color,
54
  legend_label=f"{label} (Real)")
55
  elif marker == "square":
56
- fig.square('x', 'y', size=10, source=source,
57
- fill_alpha=0, line_color=color,
58
  legend_label=f"{label} (Sintético)")
59
 
60
- def main():
61
- config_style()
62
-
63
- st.markdown('<h2 class="sub-title">Carga y reducción dimensional</h2>', unsafe_allow_html=True)
64
-
65
- # Cargar ambas versiones de los embeddings
66
- df_vanilla = pd.read_csv("data/donut_de_Rodrigo_merit_secret_all_embeddings.csv")
67
- df_v2 = pd.read_csv("data/donut_de_Rodrigo_merit_es-digital-seq_embeddings.csv")
68
-
69
- # Agregar una columna para identificar la versión
70
- df_vanilla["version"] = "vanilla"
71
- df_v2["version"] = "v2"
72
-
73
- # Se asume que ambas versiones tienen columnas de embedding que comienzan con "dim_"
74
- embedding_cols = [col for col in df_vanilla.columns if col.startswith("dim_")]
75
-
76
- # Combinar ambos dataframes para que la reducción se aplique sobre el conjunto completo
77
- df_combined = pd.concat([df_vanilla, df_v2], ignore_index=True)
78
 
79
- # Selección del método de reducción dimensional
80
- reduction_method = st.selectbox("Seleccione método de reducción:", options=["PCA", "t-SNE"])
81
- all_embeddings = df_combined[embedding_cols].values
82
- if reduction_method == "PCA":
83
- reducer = PCA(n_components=2)
84
  else:
85
- reducer = TSNE(n_components=2, random_state=42, perplexity=30, learning_rate=200)
86
- reduced = reducer.fit_transform(all_embeddings)
87
 
88
- # Asignar las coordenadas resultantes al dataframe combinado
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  df_combined['x'] = reduced[:, 0]
90
  df_combined['y'] = reduced[:, 1]
91
-
92
- # Separar nuevamente según la versión
93
- df_vanilla_trans = df_combined[df_combined["version"] == "vanilla"].copy()
94
- df_v2_trans = df_combined[df_combined["version"] == "v2"].copy()
95
 
96
  # Obtener los subsets únicos de cada versión
97
- unique_labels_vanilla = sorted(df_vanilla_trans['label'].unique().tolist())
98
- unique_labels_v2 = sorted(df_v2_trans['label'].unique().tolist())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- # Selectores para filtrar los subsets a visualizar
101
- selected_labels_vanilla = st.multiselect("Seleccione subsets para visualizar (Real):",
102
- options=unique_labels_vanilla,
103
- default=unique_labels_vanilla)
104
- selected_labels_v2 = st.multiselect("Seleccione subsets para visualizar (Sintético):",
105
- options=unique_labels_v2,
106
- default=unique_labels_v2)
107
 
108
- # Generar mapeos de colores específicos:
109
- # Para vanilla se usarán tonos de rojo (paleta Reds9)
110
- num_vanilla = len(selected_labels_vanilla)
111
- if num_vanilla <= 9:
112
- red_palette = Reds9[:num_vanilla]
113
- else:
114
- red_palette = (Reds9 * ((num_vanilla // 9) + 1))[:num_vanilla]
115
- color_mapping_vanilla = {label: red_palette[i] for i, label in enumerate(sorted(selected_labels_vanilla))}
116
 
117
- # Para v2 se usarán tonos de azul (paleta Blues9)
118
- num_v2 = len(selected_labels_v2)
119
- if num_v2 <= 9:
120
- blue_palette = Blues9[:num_v2]
121
- else:
122
- blue_palette = (Blues9 * ((num_v2 // 9) + 1))[:num_v2]
123
- color_mapping_v2 = {label: blue_palette[i] for i, label in enumerate(sorted(selected_labels_v2))}
124
 
125
- # Crear una figura única para ambas versiones
126
- fig = figure(width=600, height=600, tooltips=TOOLTIPS,
127
- title="Donut: Muestras Reales (círculos, rojos) vs Muestras Sintéticas (cuadrados, azules)")
128
 
129
- # Agregar datos de la versión vanilla (círculos con tonos de rojo)
130
- add_dataset_to_fig(fig, df_vanilla_trans, selected_labels_vanilla,
131
- marker="circle", color_mapping=color_mapping_vanilla)
132
- # Agregar datos de la versión v2 (cuadrados sin relleno, tonos de azul)
133
- add_dataset_to_fig(fig, df_v2_trans, selected_labels_v2,
134
- marker="square", color_mapping=color_mapping_v2)
135
 
136
- fig.legend.location = "top_right"
137
- fig.legend.click_policy = "hide"
 
 
 
 
 
 
 
 
 
 
 
138
 
139
- st.bokeh_chart(fig)
140
 
141
  if __name__ == "__main__":
142
  main()
 
17
  </div>
18
  """
19
 
20
+
21
  def config_style():
22
  st.markdown("""
23
  <style>
 
26
  .custom-text { font-size: 18px; line-height: 1.5; }
27
  </style>
28
  """, unsafe_allow_html=True)
29
+ st.markdown('<h1 class="main-title">Merit Embeddings 🎒📃🏆</h1>', unsafe_allow_html=True)
30
+ st.markdown('<h2 class="sub-title">Donut 馃</h2>', unsafe_allow_html=True)
31
  st.markdown(
32
  """
33
  <p class="custom-text">
34
  Se cargan ambas versiones de los embeddings y se aplica una reducción dimensional sobre el conjunto combinado.
35
+ Los puntos de la versión real se muestran como <strong>círculos</strong> (tonos de rojo)
36
+ y los de la es_digital_seq como <strong>cuadrados</strong> (tonos de azul).
37
  </p>
38
  """, unsafe_allow_html=True)
39
 
40
+
41
+ def load_embeddings():
42
+ df_real = pd.read_csv("data/donut_de_Rodrigo_merit_secret_all_embeddings.csv")
43
+ df_es_digital_seq = pd.read_csv("data/donut_de_Rodrigo_merit_es-digital-seq_embeddings.csv")
44
+
45
+ embeddings = {
46
+ "real": df_real,
47
+ "es-digital-seq": df_es_digital_seq
48
+ }
49
+
50
+ return embeddings
51
+
52
+
53
+ def reducer_selector(df_combined, embedding_cols):
54
+
55
+ reduction_method = st.selectbox("Seleccione método de reducción:", options=["PCA", "t-SNE"])
56
+ all_embeddings = df_combined[embedding_cols].values
57
+ if reduction_method == "PCA":
58
+ reducer = PCA(n_components=2)
59
+ else:
60
+ reducer = TSNE(n_components=2, random_state=42, perplexity=30, learning_rate=200)
61
+ reduced = reducer.fit_transform(all_embeddings)
62
+
63
+ return reduced
64
+
65
+
66
  def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping):
67
  for label in selected_labels:
68
  subset = df[df['label'] == label]
 
80
  fill_color=color, line_color=color,
81
  legend_label=f"{label} (Real)")
82
  elif marker == "square":
83
+ fig.square('x', 'y', size=4, source=source, fill_color=color, line_color=color,
 
84
  legend_label=f"{label} (Sintético)")
85
 
86
+
87
+ def get_color_maps(selected_subsets: dict):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
+ # real
90
+ num_real = len(selected_subsets["real"])
91
+ if num_real <= 9:
92
+ red_palette = Reds9[:num_real]
 
93
  else:
94
+ red_palette = (Reds9 * ((num_real // 9) + 1))[:num_real]
95
+ color_mapping_real = {label: red_palette[i] for i, label in enumerate(sorted(selected_subsets["real"]))}
96
 
97
+ # es-digital-seq
98
+ num_es_digital_seq = len(selected_subsets["es-digital-seq"])
99
+ if num_es_digital_seq <= 9:
100
+ blue_palette = Blues9[:num_es_digital_seq]
101
+ else:
102
+ blue_palette = (Blues9 * ((num_es_digital_seq // 9) + 1))[:num_es_digital_seq]
103
+ color_mapping_es_digital_seq = {label: blue_palette[i] for i, label in enumerate(sorted(selected_subsets["es-digital-seq"]))}
104
+
105
+ # Gather color maps
106
+ color_maps = {
107
+ "real": color_mapping_real,
108
+ "es-digital-seq": color_mapping_es_digital_seq
109
+ }
110
+
111
+ return color_maps
112
+
113
+
114
+ def split_versions(df_combined, reduced):
115
+
116
  df_combined['x'] = reduced[:, 0]
117
  df_combined['y'] = reduced[:, 1]
118
+
119
+ df_real_reduced = df_combined[df_combined["version"] == "real"].copy()
120
+ df_es_digital_seq_reduced = df_combined[df_combined["version"] == "es_digital_seq"].copy()
 
121
 
122
  # Obtener los subsets únicos de cada versión
123
+ unique_subsets_real = sorted(df_real_reduced['label'].unique().tolist())
124
+ unique_subsets_es_digital_seq = sorted(df_es_digital_seq_reduced['label'].unique().tolist())
125
+
126
+ unique_subsets = {
127
+ "real": unique_subsets_real,
128
+ "es-digital-seq": unique_subsets_es_digital_seq,
129
+ }
130
+
131
+ dfs_reduced = {
132
+ "real": df_real_reduced,
133
+ "es-digital-seq": df_es_digital_seq_reduced,
134
+ }
135
+
136
+ return dfs_reduced, unique_subsets
137
+
138
+
139
+ def subset_selectors(unique_subsets: dict):
140
+
141
+ selected_subsets_real = st.multiselect("Seleccione subsets para visualizar (Real):",
142
+ options=unique_subsets["real"],
143
+ default=unique_subsets["real"])
144
+ selected_subsets_es_digital_seq = st.multiselect("Seleccione subsets para visualizar (Sintético):",
145
+ options=unique_subsets["es-digital-seq"],
146
+ default=unique_subsets["es-digital-seq"])
147
+
148
+ selected_subsets = {
149
+ "real": selected_subsets_real,
150
+ "es-digital-seq": selected_subsets_es_digital_seq
151
+ }
152
+
153
+ return selected_subsets
154
+
155
+
156
+ def create_figure(dfs_reduced, selected_subsets: dict, color_maps: dict):
157
+
158
+ fig = figure(width=600, height=600, tooltips=TOOLTIPS,
159
+ title="")
160
 
161
+ add_dataset_to_fig(fig, dfs_reduced["real"], selected_subsets["real"],
162
+ marker="circle", color_mapping=color_maps["real"])
163
+ add_dataset_to_fig(fig, dfs_reduced["es-digital-seq"], selected_subsets["es-digital-seq"],
164
+ marker="square", color_mapping=color_maps["es-digital-seq"])
 
 
 
165
 
166
+ fig.legend.location = "top_right"
167
+ fig.legend.click_policy = "hide"
168
+
169
+ return fig
170
+
171
+
172
+ def main():
 
173
 
174
+ config_style()
 
 
 
 
 
 
175
 
176
+ embeddings_dfs = load_embeddings()
 
 
177
 
178
+ embeddings_dfs["real"]["version"] = "real"
179
+ embeddings_dfs["es-digital-seq"]["version"] = "es_digital_seq"
 
 
 
 
180
 
181
+ embedding_cols = [col for col in embeddings_dfs["real"].columns if col.startswith("dim_")]
182
+
183
+ # Combine dataframes to apply method reduction
184
+ df_combined = pd.concat([embeddings_dfs["real"], embeddings_dfs["es-digital-seq"]], ignore_index=True)
185
+
186
+ reduced = reducer_selector(df_combined, embedding_cols)
187
+
188
+ # Split back the different versions
189
+ dfs_reduced, unique_subsets = split_versions(df_combined, reduced)
190
+
191
+ selected_subsets = subset_selectors(unique_subsets)
192
+ color_maps = get_color_maps(selected_subsets)
193
+ figure = create_figure(dfs_reduced, selected_subsets, color_maps)
194
 
195
+ st.bokeh_chart(figure)
196
 
197
  if __name__ == "__main__":
198
  main()