File size: 6,903 Bytes
9392036
 
913507e
7dae805
6ee3759
5498932
 
1280fd8
9392036
1280fd8
9392036
2ee3fae
9392036
 
 
 
1280fd8
 
913507e
f872421
574aa10
6ee3759
574aa10
6ee3759
 
 
574aa10
6ee3759
f872421
 
574aa10
 
 
6ee3759
f872421
 
574aa10
6ee3759
574aa10
f872421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ee3759
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97ec291
6ee3759
f872421
97ec291
574aa10
f872421
 
c6959dd
f872421
 
 
 
6ee3759
f872421
 
6ee3759
f872421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ee3759
 
f872421
 
 
6ee3759
 
f872421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ee3759
f872421
 
 
 
6ee3759
f872421
 
 
 
 
 
 
5498932
f872421
5498932
f872421
5498932
f872421
 
5498932
f872421
 
 
 
 
 
 
 
 
 
 
 
 
5498932
f872421
6ee3759
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import streamlit as st
import pandas as pd
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.palettes import Reds9, Blues9
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# HTML template passed as the `tooltips` argument of the Bokeh figure.
# `@img` and `@label` refer to the `img` and `label` columns of the
# ColumnDataSource behind the hovered glyph; the `{safe}` suffix asks Bokeh to
# insert the value without HTML-escaping it.
TOOLTIPS = """
<div>
    <div>
        <img src="@img{safe}" style="width:128px; height:auto; float: left; margin: 0px 15px 15px 0px;" alt="@img" border="2"></img>
    </div>
    <div>
        <span style="font-size: 17px; font-weight: bold;">@label</span>
    </div>
</div>
"""


def config_style():
    """Inject the page CSS and render the title, subtitle and intro paragraph.

    All content is raw HTML/CSS, so every call goes through ``st.markdown``
    with ``unsafe_allow_html=True``. Returns nothing.
    """
    st.markdown("""
        <style>
        .main-title { font-size: 50px; color: #4CAF50; text-align: center; }
        .sub-title { font-size: 30px; color: #555; }
        .custom-text { font-size: 18px; line-height: 1.5; }
        </style>
    """, unsafe_allow_html=True)
    # The emoji and accented characters below were stored as mojibake (UTF-8
    # bytes decoded as GBK); they are restored to the intended characters.
    st.markdown('<h1 class="main-title">Merit Embeddings 🎒📃🏆</h1>', unsafe_allow_html=True)
    # NOTE(review): the emoji after "Donut" is truncated mojibake and cannot be
    # recovered from this file alone - restore it from the original source.
    st.markdown('<h2 class="sub-title">Donut 馃</h2>', unsafe_allow_html=True)
    st.markdown(
        """
        <p class="custom-text">
        Se cargan ambas versiones de los embeddings y se aplica una reducción dimensional sobre el conjunto combinado.
        Los puntos de la versión real se muestran como <strong>círculos</strong> (tonos de rojo)
        y los de la es_digital_seq como <strong>cuadrados</strong> (tonos de azul).
        </p>
        """, unsafe_allow_html=True)


def load_embeddings(data_dir="data"):
    """Read both embedding CSV files into DataFrames.

    Args:
        data_dir: Directory containing the two CSV files. Defaults to
            ``"data"``, the location that was previously hard-coded.

    Returns:
        A dict with the keys ``"real"`` and ``"es-digital-seq"``, each mapped
        to the DataFrame read from the corresponding CSV file.
    """
    # Local import: pathlib is not imported at module level in this file.
    from pathlib import Path

    base_dir = Path(data_dir)
    file_names = {
        "real": "donut_de_Rodrigo_merit_secret_all_embeddings.csv",
        "es-digital-seq": "donut_de_Rodrigo_merit_es-digital-seq_embeddings.csv",
    }

    return {key: pd.read_csv(base_dir / name) for key, name in file_names.items()}


def reducer_selector(df_combined, embedding_cols):
    """Let the user choose a reduction method and project the embeddings to 2-D.

    Args:
        df_combined: DataFrame that contains the embedding columns.
        embedding_cols: Names of the columns of ``df_combined`` to reduce.

    Returns:
        The result of ``fit_transform`` of the chosen reducer (configured with
        ``n_components=2``) applied to the selected columns.
    """
    reduction_method = st.selectbox("Seleccione método de reducción:", options=["PCA", "t-SNE"])
    all_embeddings = df_combined[embedding_cols].values

    if reduction_method == "PCA":
        # Seeded like the t-SNE branch so that a Streamlit rerun produces the
        # same layout even if scikit-learn picks a randomized SVD solver.
        reducer = PCA(n_components=2, random_state=42)
    else:
        # scikit-learn requires perplexity < n_samples; keep the usual 30 but
        # clamp it so small datasets do not raise.
        perplexity = min(30, max(len(all_embeddings) - 1, 1))
        reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity, learning_rate=200)

    return reducer.fit_transform(all_embeddings)


def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping):
    """Draw one scatter glyph per selected label of a dataset on ``fig``.

    Args:
        fig: Bokeh figure to draw on.
        df: DataFrame with the columns ``x``, ``y``, ``label`` and ``img``.
        selected_labels: Labels to plot; labels without rows in ``df`` are
            skipped.
        marker: ``"circle"`` (size 10, legend suffix "Real") or ``"square"``
            (size 4, legend suffix "Sintético"). Any other value draws nothing.
        color_mapping: Maps each label to the color used for both fill and
            outline.
    """
    # marker name -> (glyph size, version name shown in the legend)
    marker_styles = {
        "circle": (10, "Real"),
        "square": (4, "Sintético"),
    }
    style = marker_styles.get(marker)
    if style is None:
        # Unsupported markers are ignored silently rather than raising.
        return
    size, version_name = style

    for label in selected_labels:
        subset = df[df['label'] == label]
        if subset.empty:
            continue
        source = ColumnDataSource(data=dict(
            x=subset['x'],
            y=subset['y'],
            label=subset['label'],
            img=subset['img'],
        ))
        color = color_mapping[label]
        # scatter(marker=...) replaces the per-marker methods circle(size=...)
        # and square(), which newer Bokeh releases deprecate.
        fig.scatter('x', 'y', marker=marker, size=size, source=source,
                    fill_color=color, line_color=color,
                    legend_label=f"{label} ({version_name})")


def get_color_maps(selected_subsets: dict):
    """Assign a palette color to every selected label of each version.

    Labels of ``selected_subsets["real"]`` take their colors from ``Reds9``
    and those of ``selected_subsets["es-digital-seq"]`` from ``Blues9``.
    Colors are handed out in sorted-label order, and the palette is reused
    from its start when there are more labels than palette entries.

    Args:
        selected_subsets: Dict with the keys ``"real"`` and
            ``"es-digital-seq"``, each holding the selected labels.

    Returns:
        A dict with the same two keys, each mapped to a ``{label: color}``
        dict.
    """
    def assign_colors(labels, palette):
        # The modulo wraps around the palette once the labels outnumber it.
        return {
            name: palette[position % len(palette)]
            for position, name in enumerate(sorted(labels))
        }

    return {
        "real": assign_colors(selected_subsets["real"], Reds9),
        "es-digital-seq": assign_colors(selected_subsets["es-digital-seq"], Blues9),
    }


def split_versions(df_combined, reduced):
    """Attach the 2-D coordinates and split the rows back into the two versions.

    Note that ``df_combined`` is modified in place: it gains the columns
    ``x`` and ``y`` taken from the first and second column of ``reduced``.

    Args:
        df_combined: DataFrame with a ``version`` and a ``label`` column.
        reduced: 2-D array with one row per row of ``df_combined``.

    Returns:
        A tuple ``(dfs_reduced, unique_subsets)``. Both are dicts keyed by
        ``"real"`` and ``"es-digital-seq"``: the first holds a copy of the
        rows of each version, the second the sorted unique labels found in
        those rows.
    """
    for column_index, column_name in enumerate(('x', 'y')):
        df_combined[column_name] = reduced[:, column_index]

    # Result key -> value stored in the "version" column for that version.
    version_tags = (("real", "real"), ("es-digital-seq", "es_digital_seq"))

    dfs_reduced = {}
    unique_subsets = {}
    for key, tag in version_tags:
        rows = df_combined[df_combined["version"] == tag].copy()
        dfs_reduced[key] = rows
        # Unique labels present in this version, in sorted order.
        unique_subsets[key] = sorted(rows['label'].unique().tolist())

    return dfs_reduced, unique_subsets


def subset_selectors(unique_subsets: dict):
    """Render one multiselect per version and return the chosen labels.

    Every available label is preselected in both widgets.

    Args:
        unique_subsets: Dict with the keys ``"real"`` and
            ``"es-digital-seq"``, each holding the labels offered as options.

    Returns:
        A dict with the same two keys, each mapped to the value returned by
        the corresponding ``st.multiselect``.
    """
    selected_subsets_real = st.multiselect(
        "Seleccione subsets para visualizar (Real):",
        options=unique_subsets["real"],
        default=unique_subsets["real"],
    )
    # The widget label was stored as mojibake ("Sint茅tico"); restored here.
    selected_subsets_es_digital_seq = st.multiselect(
        "Seleccione subsets para visualizar (Sintético):",
        options=unique_subsets["es-digital-seq"],
        default=unique_subsets["es-digital-seq"],
    )

    return {
        "real": selected_subsets_real,
        "es-digital-seq": selected_subsets_es_digital_seq,
    }


def create_figure(dfs_reduced, selected_subsets: dict, color_maps: dict):
    """Build the 600x600 Bokeh figure containing both versions.

    The "real" rows are drawn with circle markers and the "es-digital-seq"
    rows with square markers. The legend sits at the top right and clicking
    an entry hides its glyphs.

    Args:
        dfs_reduced: Dict of DataFrames keyed by ``"real"`` and
            ``"es-digital-seq"``.
        selected_subsets: Dict of selected labels with the same keys.
        color_maps: Dict of ``{label: color}`` dicts with the same keys.

    Returns:
        The populated Bokeh figure.
    """
    plot = figure(width=600, height=600, tooltips=TOOLTIPS, title="")

    # Version key -> marker shape used for that version.
    for version_key, shape in (("real", "circle"), ("es-digital-seq", "square")):
        add_dataset_to_fig(
            plot,
            dfs_reduced[version_key],
            selected_subsets[version_key],
            marker=shape,
            color_mapping=color_maps[version_key],
        )

    plot.legend.location = "top_right"
    plot.legend.click_policy = "hide"
    return plot


def main():
    """Run the app: load, reduce, filter and plot the embeddings.

    Steps: render the page header, load both embedding tables, tag each row
    with its version, reduce the concatenated ``dim_*`` columns to 2-D, split
    the result per version, let the user pick the subsets to show and render
    the resulting Bokeh chart.
    """
    config_style()

    frames = load_embeddings()

    # Tag every row so the versions can be separated again after the reduction.
    for key, tag in (("real", "real"), ("es-digital-seq", "es_digital_seq")):
        frames[key]["version"] = tag

    # Embedding columns are identified by their "dim_" prefix in the real table.
    dim_columns = [name for name in frames["real"].columns if name.startswith("dim_")]

    # Both versions are reduced together so they share one coordinate space.
    merged = pd.concat([frames["real"], frames["es-digital-seq"]], ignore_index=True)
    coordinates = reducer_selector(merged, dim_columns)

    per_version, available_subsets = split_versions(merged, coordinates)

    chosen_subsets = subset_selectors(available_subsets)
    palettes = get_color_maps(chosen_subsets)
    chart = create_figure(per_version, chosen_subsets, palettes)

    st.bokeh_chart(chart)

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()