Spaces:
Running
Running
Commit
·
f541667
1
Parent(s):
b5f38e3
Vanilla or Overfitted Model Selection
Browse files
app.py
CHANGED
@@ -40,15 +40,15 @@ def config_style():
|
|
40 |
# Funciones de carga de datos, generación de gráficos y cálculo de distancias (sin cambios)
|
41 |
# =============================================================================
|
42 |
|
43 |
-
def load_embeddings(model):
|
44 |
if model == "Donut":
|
45 |
-
df_real = pd.read_csv("data/
|
46 |
-
df_par = pd.read_csv("data/
|
47 |
-
df_line = pd.read_csv("data/
|
48 |
-
df_seq = pd.read_csv("data/
|
49 |
-
df_rot = pd.read_csv("data/
|
50 |
-
df_zoom
|
51 |
-
df_render
|
52 |
df_real["version"] = "real"
|
53 |
df_par["version"] = "synthetic"
|
54 |
df_line["version"] = "synthetic"
|
@@ -66,13 +66,13 @@ def load_embeddings(model):
|
|
66 |
return {"real": df_real, "synthetic": pd.concat([df_seq, df_line, df_par, df_rot, df_zoom, df_render], ignore_index=True)}
|
67 |
|
68 |
elif model == "Idefics2":
|
69 |
-
df_real = pd.read_csv("data/
|
70 |
-
df_par = pd.read_csv("data/
|
71 |
-
df_line = pd.read_csv("data/
|
72 |
-
df_seq = pd.read_csv("data/
|
73 |
-
df_rot = pd.read_csv("data/
|
74 |
-
df_zoom
|
75 |
-
df_render
|
76 |
df_real["version"] = "real"
|
77 |
df_par["version"] = "synthetic"
|
78 |
df_line["version"] = "synthetic"
|
@@ -266,16 +266,12 @@ def add_synthetic_dataset_to_fig(fig, df, labels, marker, color_mapping, group_l
|
|
266 |
renderers[label + f" ({group_label})"] = r
|
267 |
return renderers
|
268 |
|
269 |
-
|
270 |
-
|
271 |
def get_color_maps(unique_subsets):
|
272 |
color_map = {}
|
273 |
-
# Para reales se asigna color para cada etiqueta
|
274 |
num_real = len(unique_subsets["real"])
|
275 |
red_palette = Reds9[:num_real] if num_real <= 9 else (Reds9 * ((num_real // 9) + 1))[:num_real]
|
276 |
color_map["real"] = {label: red_palette[i] for i, label in enumerate(sorted(unique_subsets["real"]))}
|
277 |
|
278 |
-
# Para sintéticos se asigna color de forma granular: para cada source se mapea cada etiqueta
|
279 |
color_map["synthetic"] = {}
|
280 |
for source, labels in unique_subsets["synthetic"].items():
|
281 |
if source == "es-digital-seq":
|
@@ -294,8 +290,7 @@ def get_color_maps(unique_subsets):
|
|
294 |
palette = Blues9[:len(labels)] if len(labels) <= 9 else (Blues9 * ((len(labels)//9)+1))[:len(labels)]
|
295 |
color_map["synthetic"][source] = {label: palette[i] for i, label in enumerate(sorted(labels))}
|
296 |
return color_map
|
297 |
-
|
298 |
-
|
299 |
def calculate_cluster_centers(df, labels):
|
300 |
centers = {}
|
301 |
for label in labels:
|
@@ -304,14 +299,11 @@ def calculate_cluster_centers(df, labels):
|
|
304 |
centers[label] = (subset['x'].mean(), subset['y'].mean())
|
305 |
return centers
|
306 |
|
307 |
-
|
308 |
-
|
309 |
# =============================================================================
|
310 |
# Función centralizada para la pipeline: reducción, distancias y regresión global
|
311 |
# =============================================================================
|
312 |
|
313 |
def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE"):
|
314 |
-
# Seleccionar el reductor según el método
|
315 |
if reduction_method == "PCA":
|
316 |
reducer = PCA(n_components=2)
|
317 |
else:
|
@@ -319,25 +311,21 @@ def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, r
|
|
319 |
perplexity=tsne_params["perplexity"],
|
320 |
learning_rate=tsne_params["learning_rate"])
|
321 |
|
322 |
-
# Aplicar reducción dimensional
|
323 |
reduced = reducer.fit_transform(df_combined[embedding_cols].values)
|
324 |
dfs_reduced, unique_subsets = split_versions(df_combined, reduced)
|
325 |
|
326 |
-
# Calcular distancias Wasserstein
|
327 |
df_distances = compute_wasserstein_distances_synthetic_individual(
|
328 |
dfs_reduced["synthetic"],
|
329 |
dfs_reduced["real"],
|
330 |
unique_subsets["real"]
|
331 |
)
|
332 |
|
333 |
-
# Extraer valores globales para cada fuente (se esperan 10 por fuente)
|
334 |
global_distances = {}
|
335 |
for idx in df_distances.index:
|
336 |
if idx.startswith("Global"):
|
337 |
source = idx.split("(")[1].rstrip(")")
|
338 |
global_distances[source] = df_distances.loc[idx].values
|
339 |
|
340 |
-
# Acumular todos los puntos (globales) y sus correspondientes f1 de cada colegio
|
341 |
all_x = []
|
342 |
all_y = []
|
343 |
for source in df_f1.columns:
|
@@ -349,13 +337,11 @@ def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, r
|
|
349 |
all_x_arr = np.array(all_x).reshape(-1, 1)
|
350 |
all_y_arr = np.array(all_y)
|
351 |
|
352 |
-
# Realizar regresión lineal global
|
353 |
model_global = LinearRegression().fit(all_x_arr, all_y_arr)
|
354 |
r2 = model_global.score(all_x_arr, all_y_arr)
|
355 |
slope = model_global.coef_[0]
|
356 |
intercept = model_global.intercept_
|
357 |
|
358 |
-
# Crear scatter plot para visualizar la relación
|
359 |
scatter_fig = figure(width=600, height=600, tools="pan,wheel_zoom,reset,save",
|
360 |
title="Scatter Plot: Wasserstein vs F1")
|
361 |
source_colors = {
|
@@ -383,7 +369,6 @@ def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, r
|
|
383 |
hover_tool = HoverTool(tooltips=[("Wass. Distance", "@x"), ("f1", "@y"), ("Subset", "@Fuente")])
|
384 |
scatter_fig.add_tools(hover_tool)
|
385 |
|
386 |
-
# Línea de regresión global
|
387 |
x_line = np.linspace(all_x_arr.min(), all_x_arr.max(), 100)
|
388 |
y_line = model_global.predict(x_line.reshape(-1, 1))
|
389 |
scatter_fig.line(x_line, y_line, line_width=2, line_color="black", legend_label="Global Regression")
|
@@ -399,11 +384,10 @@ def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, r
|
|
399 |
}
|
400 |
|
401 |
# =============================================================================
|
402 |
-
# Función de optimización (grid search) para TSNE,
|
403 |
# =============================================================================
|
404 |
|
405 |
def optimize_tsne_params(df_combined, embedding_cols, df_f1):
|
406 |
-
# Rango de búsqueda
|
407 |
perplexity_range = np.linspace(30, 50, 10)
|
408 |
learning_rate_range = np.linspace(200, 1000, 20)
|
409 |
|
@@ -432,17 +416,19 @@ def optimize_tsne_params(df_combined, embedding_cols, df_f1):
|
|
432 |
return best_params, best_R2
|
433 |
|
434 |
# =============================================================================
|
435 |
-
# Función principal run_model que integra
|
436 |
# =============================================================================
|
437 |
|
438 |
def run_model(model_name):
|
439 |
-
|
|
|
|
|
|
|
440 |
if embeddings is None:
|
441 |
return
|
442 |
embedding_cols = [col for col in embeddings["real"].columns if col.startswith("dim_")]
|
443 |
df_combined = pd.concat(list(embeddings.values()), ignore_index=True)
|
444 |
|
445 |
-
# Cargar CSV f1-donut
|
446 |
try:
|
447 |
df_f1 = pd.read_csv("data/f1-donut.csv", sep=';', index_col=0)
|
448 |
except Exception as e:
|
@@ -457,7 +443,7 @@ def run_model(model_name):
|
|
457 |
if st.button("Optimize TSNE parameters", key=f"optimize_tsne_{model_name}"):
|
458 |
st.info("Running optimization, this can take a while...")
|
459 |
best_params, best_R2 = optimize_tsne_params(df_combined, embedding_cols, df_f1)
|
460 |
-
st.success(f"
|
461 |
tsne_params = {"perplexity": best_params[0], "learning_rate": best_params[1]}
|
462 |
else:
|
463 |
perplexity_val = st.number_input(
|
@@ -481,7 +467,6 @@ def run_model(model_name):
|
|
481 |
tsne_params = {"perplexity": perplexity_val, "learning_rate": learning_rate_val}
|
482 |
# Si se selecciona PCA, tsne_params no se usa.
|
483 |
|
484 |
-
# Usar la función centralizada para obtener la regresión global y el scatter plot
|
485 |
result = compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method=reduction_method)
|
486 |
|
487 |
reg_metrics = pd.DataFrame({
|
@@ -491,14 +476,12 @@ def run_model(model_name):
|
|
491 |
})
|
492 |
st.table(reg_metrics)
|
493 |
|
494 |
-
# No llamamos a st.bokeh_chart(result["scatter_fig"], ...) aquí
|
495 |
-
# Sino que combinamos todo en un único layout:
|
496 |
data_table, df_table, source_table = create_table(result["df_distances"])
|
497 |
real_subset_names = list(df_table.columns[1:])
|
498 |
real_select = Select(title="", value=real_subset_names[0], options=real_subset_names)
|
499 |
reset_button = Button(label="Reset Colors", button_type="primary")
|
500 |
line_source = ColumnDataSource(data={'x': [], 'y': []})
|
501 |
-
|
502 |
fig, real_renderers, synthetic_renderers = create_figure(result["dfs_reduced"], result["unique_subsets"], get_color_maps(result["unique_subsets"]), model_name)
|
503 |
fig.line('x', 'y', source=line_source, line_width=2, line_color='black')
|
504 |
centers_real = calculate_cluster_centers(result["dfs_reduced"]["real"], result["unique_subsets"]["real"])
|
@@ -543,7 +526,6 @@ def run_model(model_name):
|
|
543 |
df_table.to_excel(buffer, index=False)
|
544 |
buffer.seek(0)
|
545 |
|
546 |
-
# Combinar todos los gráficos en un único layout:
|
547 |
layout = column(fig, result["scatter_fig"], column(real_select, reset_button, data_table))
|
548 |
st.bokeh_chart(layout, use_container_width=True)
|
549 |
|
|
|
40 |
# Funciones de carga de datos, generación de gráficos y cálculo de distancias (sin cambios)
|
41 |
# =============================================================================
|
42 |
|
43 |
+
def load_embeddings(model, version):
|
44 |
if model == "Donut":
|
45 |
+
df_real = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_secret_all_embeddings.csv")
|
46 |
+
df_par = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-paragraph-degradation-seq_embeddings.csv")
|
47 |
+
df_line = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-line-degradation-seq_embeddings.csv")
|
48 |
+
df_seq = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-seq_embeddings.csv")
|
49 |
+
df_rot = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-rotation-degradation-seq_embeddings.csv")
|
50 |
+
df_zoom = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-digital-zoom-degradation-seq_embeddings.csv")
|
51 |
+
df_render = pd.read_csv(f"data/donut_{version}_de_Rodrigo_merit_es-render-seq_embeddings.csv")
|
52 |
df_real["version"] = "real"
|
53 |
df_par["version"] = "synthetic"
|
54 |
df_line["version"] = "synthetic"
|
|
|
66 |
return {"real": df_real, "synthetic": pd.concat([df_seq, df_line, df_par, df_rot, df_zoom, df_render], ignore_index=True)}
|
67 |
|
68 |
elif model == "Idefics2":
|
69 |
+
df_real = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_secret_britanico_embeddings.csv")
|
70 |
+
df_par = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-paragraph-degradation-seq_embeddings.csv")
|
71 |
+
df_line = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-line-degradation-seq_embeddings.csv")
|
72 |
+
df_seq = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-seq_embeddings.csv")
|
73 |
+
df_rot = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-rotation-degradation-seq_embeddings.csv")
|
74 |
+
df_zoom = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-digital-zoom-degradation-seq_embeddings.csv")
|
75 |
+
df_render = pd.read_csv(f"data/idefics2_{version}_de_Rodrigo_merit_es-render-seq_embeddings.csv")
|
76 |
df_real["version"] = "real"
|
77 |
df_par["version"] = "synthetic"
|
78 |
df_line["version"] = "synthetic"
|
|
|
266 |
renderers[label + f" ({group_label})"] = r
|
267 |
return renderers
|
268 |
|
|
|
|
|
269 |
def get_color_maps(unique_subsets):
|
270 |
color_map = {}
|
|
|
271 |
num_real = len(unique_subsets["real"])
|
272 |
red_palette = Reds9[:num_real] if num_real <= 9 else (Reds9 * ((num_real // 9) + 1))[:num_real]
|
273 |
color_map["real"] = {label: red_palette[i] for i, label in enumerate(sorted(unique_subsets["real"]))}
|
274 |
|
|
|
275 |
color_map["synthetic"] = {}
|
276 |
for source, labels in unique_subsets["synthetic"].items():
|
277 |
if source == "es-digital-seq":
|
|
|
290 |
palette = Blues9[:len(labels)] if len(labels) <= 9 else (Blues9 * ((len(labels)//9)+1))[:len(labels)]
|
291 |
color_map["synthetic"][source] = {label: palette[i] for i, label in enumerate(sorted(labels))}
|
292 |
return color_map
|
293 |
+
|
|
|
294 |
def calculate_cluster_centers(df, labels):
|
295 |
centers = {}
|
296 |
for label in labels:
|
|
|
299 |
centers[label] = (subset['x'].mean(), subset['y'].mean())
|
300 |
return centers
|
301 |
|
|
|
|
|
302 |
# =============================================================================
|
303 |
# Función centralizada para la pipeline: reducción, distancias y regresión global
|
304 |
# =============================================================================
|
305 |
|
306 |
def compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method="t-SNE"):
|
|
|
307 |
if reduction_method == "PCA":
|
308 |
reducer = PCA(n_components=2)
|
309 |
else:
|
|
|
311 |
perplexity=tsne_params["perplexity"],
|
312 |
learning_rate=tsne_params["learning_rate"])
|
313 |
|
|
|
314 |
reduced = reducer.fit_transform(df_combined[embedding_cols].values)
|
315 |
dfs_reduced, unique_subsets = split_versions(df_combined, reduced)
|
316 |
|
|
|
317 |
df_distances = compute_wasserstein_distances_synthetic_individual(
|
318 |
dfs_reduced["synthetic"],
|
319 |
dfs_reduced["real"],
|
320 |
unique_subsets["real"]
|
321 |
)
|
322 |
|
|
|
323 |
global_distances = {}
|
324 |
for idx in df_distances.index:
|
325 |
if idx.startswith("Global"):
|
326 |
source = idx.split("(")[1].rstrip(")")
|
327 |
global_distances[source] = df_distances.loc[idx].values
|
328 |
|
|
|
329 |
all_x = []
|
330 |
all_y = []
|
331 |
for source in df_f1.columns:
|
|
|
337 |
all_x_arr = np.array(all_x).reshape(-1, 1)
|
338 |
all_y_arr = np.array(all_y)
|
339 |
|
|
|
340 |
model_global = LinearRegression().fit(all_x_arr, all_y_arr)
|
341 |
r2 = model_global.score(all_x_arr, all_y_arr)
|
342 |
slope = model_global.coef_[0]
|
343 |
intercept = model_global.intercept_
|
344 |
|
|
|
345 |
scatter_fig = figure(width=600, height=600, tools="pan,wheel_zoom,reset,save",
|
346 |
title="Scatter Plot: Wasserstein vs F1")
|
347 |
source_colors = {
|
|
|
369 |
hover_tool = HoverTool(tooltips=[("Wass. Distance", "@x"), ("f1", "@y"), ("Subset", "@Fuente")])
|
370 |
scatter_fig.add_tools(hover_tool)
|
371 |
|
|
|
372 |
x_line = np.linspace(all_x_arr.min(), all_x_arr.max(), 100)
|
373 |
y_line = model_global.predict(x_line.reshape(-1, 1))
|
374 |
scatter_fig.line(x_line, y_line, line_width=2, line_color="black", legend_label="Global Regression")
|
|
|
384 |
}
|
385 |
|
386 |
# =============================================================================
|
387 |
+
# Función de optimización (grid search) para TSNE, usando la misma pipeline
|
388 |
# =============================================================================
|
389 |
|
390 |
def optimize_tsne_params(df_combined, embedding_cols, df_f1):
|
|
|
391 |
perplexity_range = np.linspace(30, 50, 10)
|
392 |
learning_rate_range = np.linspace(200, 1000, 20)
|
393 |
|
|
|
416 |
return best_params, best_R2
|
417 |
|
418 |
# =============================================================================
|
419 |
+
# Función principal run_model que integra optimización, selector de versión y ejecución manual
|
420 |
# =============================================================================
|
421 |
|
422 |
def run_model(model_name):
|
423 |
+
# Seleccionar la versión del modelo
|
424 |
+
version = st.selectbox("Select Model Version:", options=["vanilla", "finetuned_real"], key=f"version_{model_name}")
|
425 |
+
|
426 |
+
embeddings = load_embeddings(model_name, version)
|
427 |
if embeddings is None:
|
428 |
return
|
429 |
embedding_cols = [col for col in embeddings["real"].columns if col.startswith("dim_")]
|
430 |
df_combined = pd.concat(list(embeddings.values()), ignore_index=True)
|
431 |
|
|
|
432 |
try:
|
433 |
df_f1 = pd.read_csv("data/f1-donut.csv", sep=';', index_col=0)
|
434 |
except Exception as e:
|
|
|
443 |
if st.button("Optimize TSNE parameters", key=f"optimize_tsne_{model_name}"):
|
444 |
st.info("Running optimization, this can take a while...")
|
445 |
best_params, best_R2 = optimize_tsne_params(df_combined, embedding_cols, df_f1)
|
446 |
+
st.success(f"Best parameters: Perplexity = {best_params[0]:.2f}, Learning Rate = {best_params[1]:.2f} with R虏 = {best_R2:.4f}")
|
447 |
tsne_params = {"perplexity": best_params[0], "learning_rate": best_params[1]}
|
448 |
else:
|
449 |
perplexity_val = st.number_input(
|
|
|
467 |
tsne_params = {"perplexity": perplexity_val, "learning_rate": learning_rate_val}
|
468 |
# Si se selecciona PCA, tsne_params no se usa.
|
469 |
|
|
|
470 |
result = compute_global_regression(df_combined, embedding_cols, tsne_params, df_f1, reduction_method=reduction_method)
|
471 |
|
472 |
reg_metrics = pd.DataFrame({
|
|
|
476 |
})
|
477 |
st.table(reg_metrics)
|
478 |
|
|
|
|
|
479 |
data_table, df_table, source_table = create_table(result["df_distances"])
|
480 |
real_subset_names = list(df_table.columns[1:])
|
481 |
real_select = Select(title="", value=real_subset_names[0], options=real_subset_names)
|
482 |
reset_button = Button(label="Reset Colors", button_type="primary")
|
483 |
line_source = ColumnDataSource(data={'x': [], 'y': []})
|
484 |
+
|
485 |
fig, real_renderers, synthetic_renderers = create_figure(result["dfs_reduced"], result["unique_subsets"], get_color_maps(result["unique_subsets"]), model_name)
|
486 |
fig.line('x', 'y', source=line_source, line_width=2, line_color='black')
|
487 |
centers_real = calculate_cluster_centers(result["dfs_reduced"]["real"], result["unique_subsets"]["real"])
|
|
|
526 |
df_table.to_excel(buffer, index=False)
|
527 |
buffer.seek(0)
|
528 |
|
|
|
529 |
layout = column(fig, result["scatter_fig"], column(real_select, reset_button, data_table))
|
530 |
st.bokeh_chart(layout, use_container_width=True)
|
531 |
|