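"""Gradio demo of hashing feature transformation with totally random trees.

Reproduces scikit-learn's RandomTreesEmbedding example: two concentric
circles are hashed into a sparse, high-dimensional representation,
projected to 2D with TruncatedSVD, and classified with BernoulliNB; an
ExtraTreesClassifier trained on the raw features serves as a comparison.
"""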
import numpy as np
import gradio as gr
import plotly.graph_objects as go
from sklearn.datasets import make_circles
from sklearn.naive_bayes import BernoulliNB
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier


def plot_scatter(X, y, title):
    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=X[:, 0],
            y=X[:, 1],
            mode="markers",
            marker=dict(color=y, size=10, colorscale="Viridis", line=dict(width=1)),
        )
    )

    fig.update_layout(
        title=title,
        xaxis=dict(showticklabels=False),
        yaxis=dict(showticklabels=False)
    )

    return fig

def plot_decision_boundary(X, y, model, data_preprocess=None, title=None):
    # Build a dense mesh over the data range; h is the grid step size
    h = 0.01
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    grid = np.c_[xx.ravel(), yy.ravel()]

    # Predict the positive-class probability on the grid; if the model was
    # trained on transformed features, push the grid through the same transform
    if data_preprocess:
        grid = data_preprocess.transform(grid)
    y_grid_pred = model.predict_proba(grid)[:, 1]

    # Plotting
    fig = go.Figure()
    fig.add_trace(
        go.Heatmap(
            x=np.arange(x_min, x_max, h),
            y=np.arange(y_min, y_max, h),
            z=y_grid_pred.reshape(xx.shape),
            colorscale="Viridis",
            opacity=0.8,
            showscale=False
        )
    )

    fig.add_trace(
        go.Scatter(
            x=X[:, 0],
            y=X[:, 1],
            mode="markers",
            marker=dict(color=y, size=10, colorscale="Viridis", line=dict(width=1)),
        )
    )

    fig.update_layout(
        title=title if title else "Decision Boundary",
        xaxis=dict(showticklabels=False),
        yaxis=dict(showticklabels=False)
    )

    return fig



def app_fn(
    factor: float,
    random_state: int,
    noise: float,
    n_estimators: int,
    max_depth: int,
):
    # make a synthetic dataset
    X, y = make_circles(factor=factor, random_state=random_state, noise=noise)

    # use RandomTreesEmbedding to transform data
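    # each sample is one-hot encoded by the leaf it lands in for every tree,
    # yielding a sparse binary matrix (up to n_estimators * 2**max_depth columns)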
    hasher = RandomTreesEmbedding(n_estimators=n_estimators, random_state=random_state, max_depth=max_depth)
    X_transformed = hasher.fit_transform(X)

    # Visualize result after dimensionality reduction using truncated SVD
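    # TruncatedSVD works directly on the sparse hashed matrix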
    svd = TruncatedSVD(n_components=2)
    X_reduced = svd.fit_transform(X_transformed)

    # Learn a Naive Bayes classifier on the transformed data
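    # BernoulliNB is a natural fit for these binary indicator features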
    nb = BernoulliNB()
    nb.fit(X_transformed, y)

    # Learn an ExtraTreesClassifier for comparison
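    # trained on the raw 2-D features, not the hashed representation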
    trees = ExtraTreesClassifier(max_depth=max_depth, n_estimators=n_estimators, random_state=random_state)
    trees.fit(X, y)

    # Plot the original data, its SVD projection, and both decision boundaries
    fig1 = plot_scatter(X, y, "Original Data")
    fig2 = plot_scatter(X_reduced, y, f"Truncated SVD Reduction (2D) of Transformed Data ({X_transformed.shape[1]}D)")
    fig3 = plot_decision_boundary(X, y, nb, hasher, "Naive Bayes Decision Boundary")
    fig4 = plot_decision_boundary(X, y, trees, title="Extra Trees Decision Boundary")

    return fig1, fig2, fig3, fig4

title = "Hashing Feature Transformation using Totally Random Trees"
with gr.Blocks() as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(
        """
        ### RandomTreesEmbedding provides a way to map data to a very high-dimensional, \
        sparse representation, which might be beneficial for classification. \
        The mapping is completely unsupervised and very efficient.

        ### This example visualizes the partitions given by several trees and shows how \
        the transformation can also be used for non-linear dimensionality reduction \
        or non-linear classification.

        ### Neighboring points often share the same leaf of a tree and therefore \
        large parts of their hashed representation. This makes it possible to \
        separate two concentric circles simply based on the principal components \
        of the transformed data with truncated SVD.

        ### In high-dimensional spaces, linear classifiers often achieve excellent \
        accuracy. For sparse binary data, BernoulliNB is particularly well-suited. \
        The bottom row compares the decision boundary obtained by BernoulliNB in the \
        transformed space with that of an ExtraTreesClassifier learned on the original data.

        [Original Example](https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_embedding.html#sphx-glr-auto-examples-ensemble-plot-random-forest-embedding-py)
        """
    )
    with gr.Row():
        # make_circles requires factor < 1, so the slider stops at 0.99
        factor = gr.Slider(minimum=0.05, maximum=0.99, step=0.01, value=0.5, label="Factor")
        noise = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.05, label="Noise")
        n_estimators = gr.Slider(minimum=1, maximum=100, step=1, value=10, label="Number of Estimators")
        max_depth = gr.Slider(minimum=1, maximum=100, step=1, value=3, label="Max Depth")
        random_state = gr.Slider(minimum=0, maximum=100, step=1, value=0, label="Random State")
    with gr.Row():
        plot1 = gr.Plot(label="Original Data")
        plot2 = gr.Plot(label="Truncated SVD Data")
    with gr.Row():
        plot3 = gr.Plot(label="Naive Bayes Decision Boundary")
        plot4 = gr.Plot(label="Extra Trees Decision Boundary")
    
    # re-render all four plots whenever any slider changes, and once on load
    controls = [factor, random_state, noise, n_estimators, max_depth]
    plots = [plot1, plot2, plot3, plot4]
    for control in controls:
        control.change(app_fn, inputs=controls, outputs=plots)
    demo.load(app_fn, inputs=controls, outputs=plots)

demo.launch()