de-Rodrigo committed on
Commit
6ee3759
1 Parent(s): 94c64c7

Include Different Dataset Versions and Fancy Display

Browse files
Files changed (1) hide show
  1. app.py +102 -125
app.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
2
  import pandas as pd
3
  from bokeh.plotting import figure
4
  from bokeh.models import ColumnDataSource
5
- from bokeh.palettes import Category10
6
  from sklearn.decomposition import PCA
7
  from sklearn.manifold import TSNE
8
 
@@ -17,149 +17,126 @@ TOOLTIPS = """
17
  </div>
18
  """
19
 
20
- def render_plot(selected_labels, df, plot_placeholder):
21
- if not selected_labels:
22
- st.write("No data to display. Please select at least one subset.")
23
- return
24
-
25
- filtered_data = df[df['label'].isin(selected_labels)]
26
- p = figure(width=400, height=400, tooltips=TOOLTIPS)
27
-
28
- num_labels = len(selected_labels)
29
- # Ajuste de la paleta
30
- if num_labels < 3:
31
- palette = Category10[3][:num_labels]
32
- elif num_labels in [3, 4, 5, 6, 7, 8, 9, 10]:
33
- palette = Category10[num_labels]
34
- else:
35
- palette = Category10[10][:num_labels]
36
-
37
- # Graficar cada label por separado
38
- for label, color in zip(selected_labels, palette):
39
- subset = filtered_data[filtered_data['label'] == label]
40
- source = ColumnDataSource(data=dict(
41
- x=subset['x'],
42
- y=subset['y'],
43
- label=subset['label'],
44
- img=subset['img']
45
- ))
46
- p.scatter('x', 'y', size=12, source=source, color=color, legend_label=label)
47
-
48
- p.legend.title = "Subsets"
49
- p.legend.location = "top_right"
50
- p.legend.click_policy = "hide"
51
-
52
- plot_placeholder.bokeh_chart(p)
53
-
54
  def config_style():
55
- st.markdown(
56
- """
57
  <style>
58
- .main-title {
59
- font-size: 50px;
60
- color: #4CAF50;
61
- text-align: center;
62
- }
63
- .sub-title {
64
- font-size: 30px;
65
- color: #555;
66
- }
67
- .custom-text {
68
- font-size: 18px;
69
- line-height: 1.5;
70
- }
71
  </style>
72
- """,
73
- unsafe_allow_html=True
74
- )
75
-
76
  st.markdown('<h1 class="main-title">Merit Secret Embeddings 🎒📃🏆</h1>', unsafe_allow_html=True)
77
- st.markdown('<h2 class="sub-title">Donut</h2>', unsafe_allow_html=True)
78
  st.markdown(
79
  """
80
  <p class="custom-text">
81
- Explore how Donut perceives real data.
 
 
82
  </p>
83
- """,
84
- unsafe_allow_html=True
85
- )
86
 
87
- if __name__ == "__main__":
88
- config_style()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- # --- Primer gráfico: datos de Donut ---
91
- # Se asume que "embeddings_donut.csv" contiene las columnas "dim_0", "dim_1", ..., "dim_N", además de "label" e "img"
92
- df_donut = pd.read_csv("data/donut_de_Rodrigo_merit_secret_all_embeddings.csv")
93
-
94
- # Selección de visualización
95
- donut_mode = st.selectbox(
96
- "Seleccione visualización para Donut:",
97
- options=["PCA", "t-SNE"]
98
- )
99
-
100
- # Extraer columnas de embedding (aquellas que empiezan con "dim_")
101
- embedding_cols = [col for col in df_donut.columns if col.startswith("dim_")]
102
- all_embeddings = df_donut[embedding_cols].values
103
-
104
- if donut_mode == "PCA":
105
- pca = PCA(n_components=2)
106
- reduced = pca.fit_transform(all_embeddings)
107
- else:
108
- tsne = TSNE(n_components=2, random_state=42, perplexity=30, learning_rate=200)
109
- reduced = tsne.fit_transform(all_embeddings)
110
-
111
- # Añadir las coordenadas resultantes al DataFrame
112
- df_donut['x'] = reduced[:, 0]
113
- df_donut['y'] = reduced[:, 1]
114
-
115
- unique_labels = df_donut['label'].unique().tolist()
116
- plot_placeholder = st.empty()
117
 
118
- # Mostrar gráfico inicial con todas las etiquetas
119
- render_plot(unique_labels, df_donut, plot_placeholder)
120
 
121
- # Desplegable para filtrar etiquetas
122
- selected_labels = st.multiselect(
123
- "Seleccione subsets para visualizar (Donut):",
124
- options=unique_labels,
125
- default=unique_labels
126
- )
127
- render_plot(selected_labels, df_donut, plot_placeholder)
128
 
129
- # --- Segundo gráfico: datos de Idefics2 ---
130
- st.markdown('<h2 class="sub-title">Idefics2</h2>', unsafe_allow_html=True)
 
131
 
132
- # Se asume que "embeddings_idefics2.csv" tiene la misma estructura
133
- df_idefics2 = pd.read_csv("data/embeddings_idefics2.csv")
134
 
135
- idefics2_mode = st.selectbox(
136
- "Seleccione visualización para Idefics2:",
137
- options=["PCA", "t-SNE"],
138
- key="idefics2_mode"
139
- )
140
 
141
- embedding_cols2 = [col for col in df_idefics2.columns if col.startswith("dim_")]
142
- all_embeddings2 = df_idefics2[embedding_cols2].values
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
- if idefics2_mode == "PCA":
145
- pca2 = PCA(n_components=2)
146
- reduced2 = pca2.fit_transform(all_embeddings2)
 
147
  else:
148
- tsne2 = TSNE(n_components=2, random_state=42, perplexity=30, learning_rate=200)
149
- reduced2 = tsne2.fit_transform(all_embeddings2)
150
 
151
- df_idefics2['x'] = reduced2[:, 0]
152
- df_idefics2['y'] = reduced2[:, 1]
 
153
 
154
- unique_labels2 = df_idefics2['label'].unique().tolist()
155
- plot_placeholder2 = st.empty()
 
 
 
 
156
 
157
- render_plot(unique_labels2, df_idefics2, plot_placeholder2)
 
158
 
159
- selected_labels2 = st.multiselect(
160
- "Seleccione subsets para visualizar (Idefics2):",
161
- options=unique_labels2,
162
- default=unique_labels2,
163
- key="idefics2"
164
- )
165
- render_plot(selected_labels2, df_idefics2, plot_placeholder2)
 
2
  import pandas as pd
3
  from bokeh.plotting import figure
4
  from bokeh.models import ColumnDataSource
5
+ from bokeh.palettes import Reds9, Blues9
6
  from sklearn.decomposition import PCA
7
  from sklearn.manifold import TSNE
8
 
 
17
  </div>
18
  """
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def config_style():
21
+ st.markdown("""
 
22
  <style>
23
+ .main-title { font-size: 50px; color: #4CAF50; text-align: center; }
24
+ .sub-title { font-size: 30px; color: #555; }
25
+ .custom-text { font-size: 18px; line-height: 1.5; }
 
 
 
 
 
 
 
 
 
 
26
  </style>
27
+ """, unsafe_allow_html=True)
 
 
 
28
  st.markdown('<h1 class="main-title">Merit Secret Embeddings 🎒📃🏆</h1>', unsafe_allow_html=True)
29
+ st.markdown('<h2 class="sub-title">Donut - Comparación de versiones</h2>', unsafe_allow_html=True)
30
  st.markdown(
31
  """
32
  <p class="custom-text">
33
+ Se cargan ambas versiones de los embeddings y se aplica una reducción dimensional sobre el conjunto combinado.
34
+ Los puntos de la versión vanilla se muestran como <strong>círculos</strong> (tonos de rojo)
35
+ y los de la v2 como <strong>cuadrados</strong> (tonos de azul).
36
  </p>
37
+ """, unsafe_allow_html=True)
 
 
38
 
39
+ def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping):
40
+ for label in selected_labels:
41
+ subset = df[df['label'] == label]
42
+ if subset.empty:
43
+ continue
44
+ source = ColumnDataSource(data=dict(
45
+ x = subset['x'],
46
+ y = subset['y'],
47
+ label = subset['label'],
48
+ img = subset['img']
49
+ ))
50
+ color = color_mapping[label]
51
+ if marker == "circle":
52
+ fig.circle('x', 'y', size=10, source=source,
53
+ fill_color=color, line_color=color,
54
+ legend_label=f"{label} (vanilla)")
55
+ elif marker == "square":
56
+ fig.square('x', 'y', size=10, source=source,
57
+ fill_alpha=0, line_color=color,
58
+ legend_label=f"{label} (v2)")
59
 
60
+ def main():
61
+ config_style()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
+ st.markdown('<h2 class="sub-title">Carga y reducción dimensional</h2>', unsafe_allow_html=True)
 
64
 
65
+ # Cargar ambas versiones de los embeddings
66
+ df_vanilla = pd.read_csv("data/donut_de_Rodrigo_merit_secret_all_embeddings.csv")
67
+ df_v2 = pd.read_csv("data/donut_de_Rodrigo_merit_es-digital-seq_embeddings.csv")
 
 
 
 
68
 
69
+ # Agregar una columna para identificar la versión
70
+ df_vanilla["version"] = "vanilla"
71
+ df_v2["version"] = "v2"
72
 
73
+ # Se asume que ambas versiones tienen columnas de embedding que comienzan con "dim_"
74
+ embedding_cols = [col for col in df_vanilla.columns if col.startswith("dim_")]
75
 
76
+ # Combinar ambos dataframes para que la reducción se aplique sobre el conjunto completo
77
+ df_combined = pd.concat([df_vanilla, df_v2], ignore_index=True)
 
 
 
78
 
79
+ # Selección del método de reducción dimensional
80
+ reduction_method = st.selectbox("Seleccione método de reducción:", options=["PCA", "t-SNE"])
81
+ all_embeddings = df_combined[embedding_cols].values
82
+ if reduction_method == "PCA":
83
+ reducer = PCA(n_components=2)
84
+ else:
85
+ reducer = TSNE(n_components=2, random_state=42, perplexity=30, learning_rate=200)
86
+ reduced = reducer.fit_transform(all_embeddings)
87
+
88
+ # Asignar las coordenadas resultantes al dataframe combinado
89
+ df_combined['x'] = reduced[:, 0]
90
+ df_combined['y'] = reduced[:, 1]
91
+
92
+ # Separar nuevamente según la versión
93
+ df_vanilla_trans = df_combined[df_combined["version"] == "vanilla"].copy()
94
+ df_v2_trans = df_combined[df_combined["version"] == "v2"].copy()
95
+
96
+ # Obtener los subsets únicos de cada versión
97
+ unique_labels_vanilla = sorted(df_vanilla_trans['label'].unique().tolist())
98
+ unique_labels_v2 = sorted(df_v2_trans['label'].unique().tolist())
99
+
100
+ # Selectores para filtrar los subsets a visualizar
101
+ selected_labels_vanilla = st.multiselect("Seleccione subsets para visualizar (Vanilla):",
102
+ options=unique_labels_vanilla,
103
+ default=unique_labels_vanilla)
104
+ selected_labels_v2 = st.multiselect("Seleccione subsets para visualizar (v2):",
105
+ options=unique_labels_v2,
106
+ default=unique_labels_v2)
107
+
108
+ # Generar mapeos de colores específicos:
109
+ # Para vanilla se usarán tonos de rojo (paleta Reds9)
110
+ num_vanilla = len(selected_labels_vanilla)
111
+ if num_vanilla <= 9:
112
+ red_palette = Reds9[:num_vanilla]
113
+ else:
114
+ red_palette = (Reds9 * ((num_vanilla // 9) + 1))[:num_vanilla]
115
+ color_mapping_vanilla = {label: red_palette[i] for i, label in enumerate(sorted(selected_labels_vanilla))}
116
 
117
+ # Para v2 se usarán tonos de azul (paleta Blues9)
118
+ num_v2 = len(selected_labels_v2)
119
+ if num_v2 <= 9:
120
+ blue_palette = Blues9[:num_v2]
121
  else:
122
+ blue_palette = (Blues9 * ((num_v2 // 9) + 1))[:num_v2]
123
+ color_mapping_v2 = {label: blue_palette[i] for i, label in enumerate(sorted(selected_labels_v2))}
124
 
125
+ # Crear una figura única para ambas versiones
126
+ fig = figure(width=600, height=600, tooltips=TOOLTIPS,
127
+ title="Donut: Vanilla (círculos, rojos) vs v2 (cuadrados, azules)")
128
 
129
+ # Agregar datos de la versión vanilla (círculos con tonos de rojo)
130
+ add_dataset_to_fig(fig, df_vanilla_trans, selected_labels_vanilla,
131
+ marker="circle", color_mapping=color_mapping_vanilla)
132
+ # Agregar datos de la versión v2 (cuadrados sin relleno, tonos de azul)
133
+ add_dataset_to_fig(fig, df_v2_trans, selected_labels_v2,
134
+ marker="square", color_mapping=color_mapping_v2)
135
 
136
+ fig.legend.location = "top_right"
137
+ fig.legend.click_policy = "hide"
138
 
139
+ st.bokeh_chart(fig)
140
+
141
+ if __name__ == "__main__":
142
+ main()