Spaces:

de-Rodrigo
/

Embeddings

Running

App Files Community

de-Rodrigo committed on Mar 4

Commit

ce05869

1 Parent(s): 789e1f0

Scatter Plot with Regression

Browse files

Files changed (1) hide show

app.py +87 -4

app.py CHANGED Viewed

@@ -2,13 +2,14 @@ import streamlit as st
 import pandas as pd
 import numpy as np
 from bokeh.plotting import figure
-from bokeh.models import ColumnDataSource, DataTable, TableColumn, CustomJS, Select, Button
 from bokeh.layouts import column
 from bokeh.palettes import Reds9, Blues9, Oranges9, Purples9, Greys9, BuGn9, Greens9
 from sklearn.decomposition import PCA
 from sklearn.manifold import TSNE
 import io
 import ot
 TOOLTIPS = """
 <div>
@@ -81,7 +82,9 @@ def reducer_selector(df_combined, embedding_cols):
     if reduction_method == "PCA":
         reducer = PCA(n_components=2)
     else:
-        reducer = TSNE(n_components=2, random_state=42, perplexity=30, learning_rate=200)
     return reducer.fit_transform(all_embeddings)
 # Función para agregar datos reales (por cada etiqueta)
@@ -330,7 +333,86 @@ def run_model(model_name):
     centers_real = calculate_cluster_centers(dfs_reduced["real"], unique_subsets["real"])
-    df_distances = compute_wasserstein_distances_synthetic_individual(dfs_reduced["synthetic"], dfs_reduced["real"], unique_subsets["real"])
     data_table, df_table, source_table = create_table(df_distances)
     real_subset_names = list(df_table.columns[1:])
@@ -380,7 +462,7 @@ def run_model(model_name):
     df_table.to_excel(buffer, index=False)
     buffer.seek(0)
-    layout = column(fig, column(real_select, reset_button, data_table))
     st.bokeh_chart(layout, use_container_width=True)
     st.download_button(
@@ -391,6 +473,7 @@ def run_model(model_name):
         key=f"download_button_excel_{model_name}"
     )
 def main():
     config_style()
     tabs = st.tabs(["Donut", "Idefics2"])

 import pandas as pd
 import numpy as np
 from bokeh.plotting import figure
+from bokeh.models import ColumnDataSource, DataTable, TableColumn, CustomJS, Select, Button, HoverTool
 from bokeh.layouts import column
 from bokeh.palettes import Reds9, Blues9, Oranges9, Purples9, Greys9, BuGn9, Greens9
 from sklearn.decomposition import PCA
 from sklearn.manifold import TSNE
 import io
 import ot
+from sklearn.linear_model import LinearRegression
 TOOLTIPS = """
 <div>
     if reduction_method == "PCA":
         reducer = PCA(n_components=2)
     else:
+        perplexity_val = st.number_input("Perplexity", min_value=5, max_value=50, value=30, step=1)
+        learning_rate_val = st.number_input("Learning Rate", min_value=10, max_value=1000, value=200, step=10)
+        reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity_val, learning_rate=learning_rate_val)
     return reducer.fit_transform(all_embeddings)
 # Función para agregar datos reales (por cada etiqueta)
     centers_real = calculate_cluster_centers(dfs_reduced["real"], unique_subsets["real"])
+    df_distances = compute_wasserstein_distances_synthetic_individual(
+        dfs_reduced["synthetic"],
+        dfs_reduced["real"],
+        unique_subsets["real"]
+    )
+    # --- Scatter plot usando f1-donut.csv ---
+    try:
+        df_f1 = pd.read_csv("data/f1-donut.csv", sep=';', index_col=0)
+    except Exception as e:
+        st.error(f"Error loading f1-donut.csv: {e}")
+        return
+    # Extraer los valores globales para cada fuente (sin promediar: 10 valores por fuente)
+    global_distances = {}
+    for idx in df_distances.index:
+        if idx.startswith("Global"):
+            # Ejemplo: "Global (es-digital-seq)"
+            source = idx.split("(")[1].rstrip(")")
+            global_distances[source] = df_distances.loc[idx].values
+    # Reutilización de los códigos de colores
+    source_colors = {
+        "es-digital-paragraph-degradation-seq": "blue",
+        "es-digital-line-degradation-seq": "green",
+        "es-digital-seq": "red",
+        "es-digital-zoom-degradation-seq": "orange",
+        "es-digital-rotation-degradation-seq": "purple",
+        "es-digital-rotation-zoom-degradation-seq": "brown",
+        "es-render-seq": "cyan"
+    }
+    scatter_fig = figure(width=600, height=600, tools="pan,wheel_zoom,reset,save", title="Scatter Plot: Wasserstein vs F1")
+    # Variables para la regresión global
+    all_x = []
+    all_y = []
+    # Se plotea cada fuente y se acumulan los datos para la regresión global
+    for source in df_f1.columns:
+        if source in global_distances:
+            x_vals = global_distances[source]      # 10 valores (uno por colegio)
+            y_vals = df_f1[source].values            # 10 valores de f1, en el mismo orden
+            data = {"x": x_vals, "y": y_vals, "Fuente": [source] * len(x_vals)}
+            cds = ColumnDataSource(data=data)
+            scatter_fig.circle('x', 'y', size=8, alpha=0.7, source=cds,
+                               fill_color=source_colors.get(source, "gray"),
+                               line_color=source_colors.get(source, "gray"),
+                               legend_label=source)
+            all_x.extend(x_vals)
+            all_y.extend(y_vals)
+    scatter_fig.xaxis.axis_label = "Wasserstein Distance (Global, por Colegio)"
+    scatter_fig.yaxis.axis_label = "F1 Score"
+    scatter_fig.legend.location = "top_right"
+    # Agregar HoverTool para mostrar x, y y la fuente al hacer hover
+    hover_tool = HoverTool(tooltips=[("x", "@x"), ("y", "@y"), ("Fuente", "@Fuente")])
+    scatter_fig.add_tools(hover_tool)
+    # --- Fin scatter plot ---
+    # --- Regresión global ---
+    all_x_arr = np.array(all_x).reshape(-1, 1)
+    all_y_arr = np.array(all_y)
+    model_global = LinearRegression().fit(all_x_arr, all_y_arr)
+    slope = model_global.coef_[0]
+    intercept = model_global.intercept_
+    r2 = model_global.score(all_x_arr, all_y_arr)
+    # Agregar línea de regresión global al scatter plot
+    x_line = np.linspace(all_x_arr.min(), all_x_arr.max(), 100)
+    y_line = model_global.predict(x_line.reshape(-1, 1))
+    scatter_fig.line(x_line, y_line, line_width=2, line_color="black", legend_label="Global Regression")
+    # Mostrar métricas de regresión después del scatter plot
+    regression_metrics = {"Slope": [slope], "Intercept": [intercept], "R2": [r2]}
+    reg_df = pd.DataFrame(regression_metrics)
+    st.table(reg_df)
+    # --- Fin regresión global ---
     data_table, df_table, source_table = create_table(df_distances)
     real_subset_names = list(df_table.columns[1:])
     df_table.to_excel(buffer, index=False)
     buffer.seek(0)
+    layout = column(fig, scatter_fig, column(real_select, reset_button, data_table))
     st.bokeh_chart(layout, use_container_width=True)
     st.download_button(
         key=f"download_button_excel_{model_name}"
     )
 def main():
     config_style()
     tabs = st.tabs(["Donut", "Idefics2"])