|
""" |
|
======================================================= |
|
Comparison of LDA and PCA 2D projection of Iris dataset |
|
======================================================= |
|
|
|
The Iris dataset represents 3 kinds of Iris flowers (Setosa, Versicolour
|
and Virginica) with 4 attributes: sepal length, sepal width, petal length |
|
and petal width. |
|
|
|
Principal Component Analysis (PCA) applied to this data identifies the |
|
combination of attributes (principal components, or directions in the |
|
feature space) that account for the most variance in the data. Here we |
|
plot the different samples on the first two principal components.
|
|
|
Linear Discriminant Analysis (LDA) tries to identify attributes that |
|
account for the most variance *between classes*. In particular, |
|
LDA, in contrast to PCA, is a supervised method, using known class labels. |
|
|
|
""" |
|
|
|
import matplotlib.pyplot as plt
import numpy as np

import gradio as gr

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
|
|
|
|
|
iris = datasets.load_iris() |
|
|
|
X = iris.data |
|
y = iris.target |
|
target_names = iris.target_names |
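# X is a (150, 4) array of measurements; y holds integer class labels 0-2 that
# index into target_names.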
|
|
|
def plot_lda_pca(n_samples=100, n_features=4, n_components=2):
    """
    Plot 2-D PCA and LDA projections of a subset of the Iris dataset.

    Parameters
    ----------
    n_samples : int, default=100
        Number of Iris samples to use (at most 150).

    n_features : int, default=4
        Number of Iris features to use (at most 4).

    n_components : int, default=2
        Number of components to keep for each projection.

    Returns
    -------
    fig : matplotlib.figure.Figure
        Figure with the PCA projection on top and the LDA projection below.
    """
|
|
|
|
|
|
|
|
|
    # Shuffle deterministically before subsetting: the raw Iris data is ordered
    # by class, so taking the first n_samples rows could drop whole classes and
    # leave LDA with too few classes to fit.
    rng = np.random.RandomState(0)
    indices = rng.permutation(len(iris.data))[:n_samples]
    X = iris.data[indices, :n_features]
    y = iris.target[indices]
|
|
|
|
|
    # Unsupervised projection onto the directions of maximum total variance.
    pca = PCA(n_components=n_components)
    X_r = pca.fit(X).transform(X)
    print(f"shape of X_r: {X_r.shape}")

    # Supervised projection: LDA can keep at most min(n_features, n_classes - 1)
    # components, so clamp in case the selected subset contains fewer classes.
    n_classes = len(np.unique(y))
    lda = LinearDiscriminantAnalysis(
        n_components=min(n_components, n_features, n_classes - 1)
    )
    X_r2 = lda.fit(X, y).transform(X)
    print(f"shape of X_r2: {X_r2.shape}")

    # Keep at most the first two components for the 2-D scatter plots; pad a
    # one-component LDA projection with zeros so it can still be drawn in 2-D.
    X_r = X_r[:, :2]
    X_r2 = X_r2[:, :2]
    if X_r2.shape[1] < 2:
        X_r2 = np.column_stack([X_r2, np.zeros(len(X_r2))])

    print(f"shape of X_r after: {X_r.shape}")
    print(f"shape of X_r2 after: {X_r2.shape}")
|
|
|
|
|
print( |
|
"explained variance ratio (first two components): %s" |
|
% str(pca.explained_variance_ratio_) |
|
) |
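    # Each ratio is the fraction of the total variance captured by that principal component.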
|
|
|
|
|
    # One subplot per method: PCA on top, LDA below.
    fig, axes = plt.subplots(2, 1, sharex=False, sharey=False, figsize=(8, 6))
    colors = ["navy", "turquoise", "darkorange"]
    lw = 2

    for color, i, target_name in zip(colors, [0, 1, 2], target_names):
        axes[0].scatter(
            X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
        )
    axes[0].legend(loc="lower right")
    axes[0].set_title("PCA of Iris dataset")

    for color, i, target_name in zip(colors, [0, 1, 2], target_names):
        axes[1].scatter(
            X_r2[y == i, 0], X_r2[y == i, 1], alpha=0.8, color=color, label=target_name
        )
    axes[1].legend(loc="lower right")
    axes[1].set_title("LDA of Iris dataset")
    plt.tight_layout()
|
|
|
|
|
return fig |
|
|
|
|
|
title = "2-D projection of Iris dataset using LDA and PCA" |
|
with gr.Blocks(title=title) as demo: |
|
gr.Markdown(f"# {title}") |
|
gr.Markdown(" This example shows how one can use Prinicipal Components Analysis (PCA) and Factor Analysis (FA) for model selection by observing the likelihood of a held-out dataset with added noise <br>" |
|
" The number of samples (n_samples) will determine the number of data points to produce. <br>" |
|
" The number of components (n_components) will determine the number of components each method will fit to, and will affect the likelihood of the held-out set. <br>" |
|
" The number of features (n_components) determine the number of features the toy dataset X variable will have. <br>" |
|
" For further details please see the sklearn docs:" |
|
) |
|
|
|
gr.Markdown(" **[Demo is based on sklearn docs found here](https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_lda.html#sphx-glr-auto-examples-decomposition-plot-pca-vs-lda-py)** <br>") |
|
|
|
gr.Markdown(" **Dataset** : A toy dataset with corrupted with homoscedastic noise (noise variance is the same for each feature) or heteroscedastic noise (noise variance is the different for each feature) . <br>") |
|
gr.Markdown(" Different number of features and number of components affect how well the low rank space is recovered. <br>" |
|
" Larger Depth trying to overfit and learn even the finner details of the data.<br>" |
|
) |
|
|
|
max_samples = len(iris.data) |
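    # max_samples caps the n_samples slider at the full dataset size (150).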
|
with gr.Row(): |
|
n_samples = gr.Slider(value=100, minimum=2, maximum=max_samples, step=1, label="n_samples") |
|
n_features = gr.Slider(value=4, minimum=2, maximum=4, step=1, label="n_features") |
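        # n_components is left at its function default (2) so both projections stay 2-D.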
|
|
|
|
|
    btn = gr.Button(value="Run")
    btn.click(
        plot_lda_pca,
        inputs=[n_samples, n_features],
        outputs=gr.Plot(label="PCA vs LDA projection"),
    )
|
|
|
|
|
demo.launch() |