de-Rodrigo committed on
Commit
4279043
1 Parent(s): ce05869

TSNE Parameters Optimization

Browse files
Files changed (1) hide show
  1. app.py +89 -3
app.py CHANGED
@@ -312,19 +312,105 @@ def create_table(df_distances):
312
  data_table = DataTable(source=source_table, columns=columns, sizing_mode='stretch_width', height=total_height)
313
  return data_table, df_table, source_table
314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  def run_model(model_name):
316
  embeddings = load_embeddings(model_name)
317
  if embeddings is None:
318
  return
319
-
320
  embedding_cols = [col for col in embeddings["real"].columns if col.startswith("dim_")]
321
  df_combined = pd.concat(list(embeddings.values()), ignore_index=True)
 
 
 
 
 
 
 
 
322
  st.markdown('<h6 class="sub-title">Select Dimensionality Reduction Method</h6>', unsafe_allow_html=True)
323
  reduction_method = st.selectbox("", options=["t-SNE", "PCA"], key=f"reduction_{model_name}")
 
 
 
 
 
 
 
 
 
324
  if reduction_method == "PCA":
325
  reducer = PCA(n_components=2)
326
  else:
327
- reducer = TSNE(n_components=2, random_state=42, perplexity=30, learning_rate=200)
 
 
 
328
  reduced = reducer.fit_transform(df_combined[embedding_cols].values)
329
  dfs_reduced, unique_subsets = split_versions(df_combined, reduced)
330
 
@@ -389,7 +475,7 @@ def run_model(model_name):
389
  scatter_fig.legend.location = "top_right"
390
 
391
  # Agregar HoverTool para mostrar x, y y la fuente al hacer hover
392
- hover_tool = HoverTool(tooltips=[("x", "@x"), ("y", "@y"), ("Fuente", "@Fuente")])
393
  scatter_fig.add_tools(hover_tool)
394
  # --- Fin scatter plot ---
395
 
 
312
  data_table = DataTable(source=source_table, columns=columns, sizing_mode='stretch_width', height=total_height)
313
  return data_table, df_table, source_table
314
 
315
def optimize_tsne_params(df_combined, embedding_cols, df_f1,
                         perplexity_range=None, learning_rate_range=None):
    """Grid-search t-SNE ``perplexity`` / ``learning_rate`` by regression R².

    For every (perplexity, learning rate) pair the embeddings are reduced to
    2-D with t-SNE, the reduced points are passed through ``split_versions``
    and ``compute_wasserstein_distances_synthetic_individual``, and the rows
    of the resulting distance table whose label starts with ``"Global"`` are
    regressed (single linear fit over all sources) against the matching
    columns of ``df_f1``. The pair with the highest R² wins.

    Progress and per-candidate results are written to the Streamlit page.

    Args:
        df_combined: DataFrame holding the embedding columns (plus whatever
            ``split_versions`` needs to separate the subsets).
        embedding_cols: Names of the columns of ``df_combined`` fed to t-SNE.
        df_f1: DataFrame whose columns are source names; each column is used
            as the regression target for the source of the same name.
        perplexity_range: Sequence of perplexity values to try. Defaults to
            ``np.linspace(30, 50, 10)``.
        learning_rate_range: Sequence of learning rates to try. Defaults to
            ``np.linspace(200, 1000, 20)``.

    Returns:
        ``(best_params, best_R2)`` where ``best_params`` is the winning
        ``(perplexity, learning_rate)`` tuple. If no candidate produced any
        regression data (no source shared between ``df_f1`` and the distance
        table), ``best_params`` is ``None`` and ``best_R2`` is ``-np.inf``;
        callers should check for ``None`` before indexing.
    """
    # Search ranges (bounds and step counts can be adjusted by the caller).
    if perplexity_range is None:
        perplexity_range = np.linspace(30, 50, 10)
    if learning_rate_range is None:
        learning_rate_range = np.linspace(200, 1000, 20)

    # The input matrix is the same for every candidate: extract it once
    # instead of once per grid point.
    features = df_combined[embedding_cols].values

    best_R2 = -np.inf
    best_params = None
    total_steps = len(perplexity_range) * len(learning_rate_range)
    step = 0

    # Streamlit placeholder that is overwritten with the current progress.
    progress_text = st.empty()

    for p in perplexity_range:
        for lr in learning_rate_range:
            step += 1
            # Refresh the progress message.
            progress_text.text(f"Evaluating: Perplexity={p:.2f}, Learning Rate={lr:.2f} (Step: {step}/{total_steps})")

            # Compute the 2-D reduction with t-SNE (fixed seed, so results
            # are comparable across candidates).
            reducer_temp = TSNE(n_components=2, random_state=42, perplexity=p, learning_rate=lr)
            reduced_temp = reducer_temp.fit_transform(features)
            dfs_reduced_temp, unique_subsets_temp = split_versions(df_combined, reduced_temp)

            # Compute Wasserstein distances.
            df_distances_temp = compute_wasserstein_distances_synthetic_individual(
                dfs_reduced_temp["synthetic"],
                dfs_reduced_temp["real"],
                unique_subsets_temp["real"]
            )

            # Extract the global values, keyed by the source name found
            # between the parentheses of a "Global (<source>)" label.
            # (The original author assumed 10 values per source -- TODO confirm.)
            global_distances_temp = {}
            for idx in df_distances_temp.index:
                if not (isinstance(idx, str) and idx.startswith("Global")):
                    continue
                parts = idx.split("(")
                if len(parts) < 2:
                    # A "Global" label without "(<source>)" carries no
                    # source name; skip it rather than raise IndexError.
                    continue
                source = parts[1].rstrip(")")
                global_distances_temp[source] = df_distances_temp.loc[idx].values

            # Accumulate the data for the global regression.
            all_x_temp = []
            all_y_temp = []
            for source in df_f1.columns:
                if source in global_distances_temp:
                    all_x_temp.extend(global_distances_temp[source])
                    all_y_temp.extend(df_f1[source].values)
            if not all_x_temp:
                continue
            all_x_temp_arr = np.array(all_x_temp).reshape(-1, 1)
            all_y_temp_arr = np.array(all_y_temp)

            model_temp = LinearRegression().fit(all_x_temp_arr, all_y_temp_arr)
            r2_temp = model_temp.score(all_x_temp_arr, all_y_temp_arr)

            # Show the evaluated pair and the R² it obtained.
            st.write(f"Parameters: Perplexity={p:.2f}, Learning Rate={lr:.2f} -> R²={r2_temp:.4f}")

            # A NaN score compares False here, so it never becomes the best.
            if r2_temp > best_R2:
                best_R2 = r2_temp
                best_params = (p, lr)

    progress_text.text("Optimization completed!")
    return best_params, best_R2
378
+
379
+
380
+
381
  def run_model(model_name):
382
  embeddings = load_embeddings(model_name)
383
  if embeddings is None:
384
  return
385
+
386
  embedding_cols = [col for col in embeddings["real"].columns if col.startswith("dim_")]
387
  df_combined = pd.concat(list(embeddings.values()), ignore_index=True)
388
+
389
+ # Leer el CSV de f1-donut (usado para evaluar la regresión)
390
+ try:
391
+ df_f1 = pd.read_csv("data/f1-donut.csv", sep=';', index_col=0)
392
+ except Exception as e:
393
+ st.error(f"Error loading f1-donut.csv: {e}")
394
+ return
395
+
396
  st.markdown('<h6 class="sub-title">Select Dimensionality Reduction Method</h6>', unsafe_allow_html=True)
397
  reduction_method = st.selectbox("", options=["t-SNE", "PCA"], key=f"reduction_{model_name}")
398
+
399
+ # Opción para optimizar los parámetros TSNE
400
+ if reduction_method == "t-SNE":
401
+ if st.button("Optimize TSNE parameters", key=f"optimize_tnse_{model_name}"):
402
+ st.info("Running optimization, this can take a while...")
403
+ best_params, best_R2 = optimize_tsne_params(df_combined, embedding_cols, df_f1)
404
+ st.success(f"Mejores par谩metros: Perplexity = {best_params[0]:.2f}, Learning Rate = {best_params[1]:.2f} con R虏 = {best_R2:.4f}")
405
+
406
+ # Permitir al usuario ingresar manualmente los valores (o podrías reemplazar estos por los optimizados)
407
  if reduction_method == "PCA":
408
  reducer = PCA(n_components=2)
409
  else:
410
+ perplexity_val = st.number_input("Perplexity", min_value=5, max_value=50, value=30, step=1, key=f"perplexity_{model_name}")
411
+ learning_rate_val = st.number_input("Learning Rate", min_value=10, max_value=1000, value=200, step=10, key=f"learning_rate_{model_name}")
412
+ reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity_val, learning_rate=learning_rate_val)
413
+
414
  reduced = reducer.fit_transform(df_combined[embedding_cols].values)
415
  dfs_reduced, unique_subsets = split_versions(df_combined, reduced)
416
 
 
475
  scatter_fig.legend.location = "top_right"
476
 
477
  # Agregar HoverTool para mostrar x, y y la fuente al hacer hover
478
+ hover_tool = HoverTool(tooltips=[("Wass. Distance", "@x"), ("f1", "@y"), ("Subset", "@Fuente")])
479
  scatter_fig.add_tools(hover_tool)
480
  # --- Fin scatter plot ---
481