Commit 4279043 · Parent(s): ce05869
TSNE Parameters Optimization
app.py CHANGED
@@ -312,19 +312,105 @@ def create_table(df_distances):
    data_table = DataTable(source=source_table, columns=columns, sizing_mode='stretch_width', height=total_height)
    return data_table, df_table, source_table

+def optimize_tsne_params(df_combined, embedding_cols, df_f1):
+    # Search ranges (these limits and steps can be adjusted)
+    perplexity_range = np.linspace(30, 50, 10)
+    learning_rate_range = np.linspace(200, 1000, 20)
+
+    best_R2 = -np.inf
+    best_params = None
+    total_steps = len(perplexity_range) * len(learning_rate_range)
+    step = 0
+
+    # Use a Streamlit placeholder to update progress messages
+    progress_text = st.empty()
+
+    for p in perplexity_range:
+        for lr in learning_rate_range:
+            step += 1
+            # Update the progress message
+            progress_text.text(f"Evaluating: Perplexity={p:.2f}, Learning Rate={lr:.2f} (Step: {step}/{total_steps})")
+
+            # Compute the t-SNE reduction
+            reducer_temp = TSNE(n_components=2, random_state=42, perplexity=p, learning_rate=lr)
+            reduced_temp = reducer_temp.fit_transform(df_combined[embedding_cols].values)
+            dfs_reduced_temp, unique_subsets_temp = split_versions(df_combined, reduced_temp)
+
+            # Compute the Wasserstein distances
+            df_distances_temp = compute_wasserstein_distances_synthetic_individual(
+                dfs_reduced_temp["synthetic"],
+                dfs_reduced_temp["real"],
+                unique_subsets_temp["real"]
+            )
+            # Extract the global values (we assume 10 per source)
+            global_distances_temp = {}
+            for idx in df_distances_temp.index:
+                if idx.startswith("Global"):
+                    source = idx.split("(")[1].rstrip(")")
+                    global_distances_temp[source] = df_distances_temp.loc[idx].values
+
+            # Accumulate the data for the global regression
+            all_x_temp = []
+            all_y_temp = []
+            for source in df_f1.columns:
+                if source in global_distances_temp:
+                    x_vals_temp = global_distances_temp[source]
+                    y_vals_temp = df_f1[source].values
+                    all_x_temp.extend(x_vals_temp)
+                    all_y_temp.extend(y_vals_temp)
+            if len(all_x_temp) == 0:
+                continue
+            all_x_temp_arr = np.array(all_x_temp).reshape(-1, 1)
+            all_y_temp_arr = np.array(all_y_temp)
+
+            model_temp = LinearRegression().fit(all_x_temp_arr, all_y_temp_arr)
+            r2_temp = model_temp.score(all_x_temp_arr, all_y_temp_arr)
+
+            # Show on screen (or in the log) the evaluated pair and the R² it achieved
+            st.write(f"Parameters: Perplexity={p:.2f}, Learning Rate={lr:.2f} -> R²={r2_temp:.4f}")
+
+            if r2_temp > best_R2:
+                best_R2 = r2_temp
+                best_params = (p, lr)
+
+    progress_text.text("Optimization completed!")
+    return best_params, best_R2
+
+
+
def run_model(model_name):
    embeddings = load_embeddings(model_name)
    if embeddings is None:
        return
-
+
    embedding_cols = [col for col in embeddings["real"].columns if col.startswith("dim_")]
    df_combined = pd.concat(list(embeddings.values()), ignore_index=True)
+
+    # Read the f1-donut CSV (used to evaluate the regression)
+    try:
+        df_f1 = pd.read_csv("data/f1-donut.csv", sep=';', index_col=0)
+    except Exception as e:
+        st.error(f"Error loading f1-donut.csv: {e}")
+        return
+
    st.markdown('<h6 class="sub-title">Select Dimensionality Reduction Method</h6>', unsafe_allow_html=True)
    reduction_method = st.selectbox("", options=["t-SNE", "PCA"], key=f"reduction_{model_name}")
+
+    # Option to optimize the t-SNE parameters
+    if reduction_method == "t-SNE":
+        if st.button("Optimize TSNE parameters", key=f"optimize_tnse_{model_name}"):
+            st.info("Running optimization, this can take a while...")
+            best_params, best_R2 = optimize_tsne_params(df_combined, embedding_cols, df_f1)
+            st.success(f"Best parameters: Perplexity = {best_params[0]:.2f}, Learning Rate = {best_params[1]:.2f} with R² = {best_R2:.4f}")
+
+    # Let the user enter the values manually (or replace them with the optimized ones)
    if reduction_method == "PCA":
        reducer = PCA(n_components=2)
    else:
-
+        perplexity_val = st.number_input("Perplexity", min_value=5, max_value=50, value=30, step=1, key=f"perplexity_{model_name}")
+        learning_rate_val = st.number_input("Learning Rate", min_value=10, max_value=1000, value=200, step=10, key=f"learning_rate_{model_name}")
+        reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity_val, learning_rate=learning_rate_val)
+
    reduced = reducer.fit_transform(df_combined[embedding_cols].values)
    dfs_reduced, unique_subsets = split_versions(df_combined, reduced)

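For reference, the grid search above scores each (perplexity, learning_rate) pair by fitting a linear regression of per-source F1 values against the global Wasserstein distances and keeping the pair with the highest R². The sketch below reproduces just that scoring idea in isolation. It is an assumption-laden illustration, not the app's code: split_versions, compute_wasserstein_distances_synthetic_individual, and data/f1-donut.csv are project-specific and not shown in this diff, so the sketch substitutes random embeddings, made-up F1 values, and a simplified per-axis scipy wasserstein_distance; the helper name score_params is hypothetical.

# Minimal sketch of scoring (perplexity, learning_rate) pairs by the R² of
# a linear fit between per-source distances and F1 values. All data and the
# distance computation here are stand-ins, not the project's own helpers.
import numpy as np
from scipy.stats import wasserstein_distance
from sklearn.linear_model import LinearRegression
from sklearn.manifold import TSNE

rng = np.random.default_rng(42)
sources = ["source_a", "source_b", "source_c"]
real = rng.normal(size=(60, 16))                                    # fake "real" embeddings
synthetic = {s: rng.normal(loc=i * 0.5, size=(60, 16)) for i, s in enumerate(sources)}
f1_scores = {s: rng.uniform(0.5, 0.9) for s in sources}             # stand-in for f1-donut.csv

def score_params(perplexity, learning_rate):
    """Embed everything with t-SNE, then return R² of F1 ~ distance-to-real."""
    stacked = np.vstack([real] + [synthetic[s] for s in sources])
    reduced = TSNE(n_components=2, random_state=42,
                   perplexity=perplexity, learning_rate=learning_rate).fit_transform(stacked)
    real_2d = reduced[: len(real)]
    xs, ys = [], []
    for i, s in enumerate(sources):
        block = reduced[len(real) + i * 60 : len(real) + (i + 1) * 60]
        # Simplified "global" distance: mean of the per-axis 1-D Wasserstein distances.
        dist = np.mean([wasserstein_distance(real_2d[:, d], block[:, d]) for d in range(2)])
        xs.append(dist)
        ys.append(f1_scores[s])
    X = np.array(xs).reshape(-1, 1)
    y = np.array(ys)
    return LinearRegression().fit(X, y).score(X, y)

# Coarse grid, mirroring the np.linspace ranges used in the diff.
best = max(
    ((p, lr) for p in np.linspace(30, 50, 3) for lr in np.linspace(200, 1000, 3)),
    key=lambda params: score_params(*params),
)
print("Best (perplexity, learning_rate):", best)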
@@ -389,7 +475,7 @@ def run_model(model_name):
    scatter_fig.legend.location = "top_right"

    # Add a HoverTool to show x, y, and the source on hover
-    hover_tool = HoverTool(tooltips=[(
+    hover_tool = HoverTool(tooltips=[("Wass. Distance", "@x"), ("f1", "@y"), ("Subset", "@Fuente")])
    scatter_fig.add_tools(hover_tool)
    # --- End scatter plot ---
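The new tooltip resolves @x, @y, and @Fuente against columns of the scatter's ColumnDataSource, so it only works because the plotted source carries columns with exactly those names. Below is a small self-contained Bokeh sketch of that pattern; the figure, values, and subset labels are invented for illustration and are not taken from app.py.

# Self-contained sketch: HoverTool tooltips bound to ColumnDataSource columns.
# The column names (x, y, Fuente) mirror the tooltip fields used in the diff;
# the data values and labels below are made up.
from bokeh.io import output_file, save
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure

source = ColumnDataSource(data=dict(
    x=[0.12, 0.34, 0.56],                        # Wasserstein distance per subset
    y=[0.81, 0.74, 0.66],                        # F1 score per subset
    Fuente=["source_a", "source_b", "source_c"]  # subset label shown on hover
))

fig = figure(title="F1 vs. Wasserstein distance", width=400, height=300)
fig.scatter("x", "y", source=source, size=10)

# Each "@name" in a tooltip pulls the hovered point's value from that column.
hover_tool = HoverTool(tooltips=[("Wass. Distance", "@x"), ("f1", "@y"), ("Subset", "@Fuente")])
fig.add_tools(hover_tool)

output_file("hover_example.html")
save(fig)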