vumichien committed
Commit e88b390 · 1 Parent(s): 10b1e17

Create app.py

Files changed (1)
  1. app.py +164 -0
app.py ADDED
@@ -0,0 +1,164 @@
+ import gradio as gr
+ import time
+
+ import numpy as np
+ import matplotlib.pyplot as plt
+
+ from sklearn import ensemble
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+
+ theme = gr.themes.Monochrome(
+     primary_hue="indigo",
+     secondary_hue="blue",
+     neutral_hue="slate",
+ )
+ model_card = """
+ ## Description
+
+ **Gradient boosting** is a machine learning technique that iteratively combines several regression trees into a single powerful model.
+ **Early stopping** determines the smallest number of boosting iterations needed for a model that generalizes well to new data.
+ A validation set is held out and used to evaluate the model after each stage of tree building,
+ and fitting stops once the validation scores have not improved for a specified number of stages.
+ Early stopping can significantly reduce training time, memory usage, and prediction latency while achieving almost the same accuracy as a model built with many more estimators and no early stopping.
+ You can play around with different values of ``number of samples`` and ``number of estimators`` to see the effect.
+
+ ## Dataset
+
+ Iris dataset, Classification dataset, Hastie dataset
+ """
+
+
+ def do_train(n_samples, n_estimators, progress=gr.Progress()):
+     data_list = [
+         datasets.load_iris(return_X_y=True),
+         datasets.make_classification(n_samples=n_samples, random_state=0),
+         datasets.make_hastie_10_2(n_samples=n_samples, random_state=0),
+     ]
+     names = ["Iris Data", "Classification Data", "Hastie Data"]
+
+     n_gb = []
+     score_gb = []
+     time_gb = []
+     n_gbes = []
+     score_gbes = []
+     time_gbes = []
+
+     for X, y in progress.tqdm(data_list):
+         X_train, X_test, y_train, y_test = train_test_split(
+             X, y, test_size=0.2, random_state=0
+         )
+         # Stop fitting additional stages if the validation score does not
+         # improve by at least 0.01 (tol) for 5 consecutive stages (n_iter_no_change)
+         gbes = ensemble.GradientBoostingClassifier(
+             n_estimators=n_estimators,
+             validation_fraction=0.2,
+             n_iter_no_change=5,
+             tol=0.01,
+             random_state=0,
+         )
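+         # Note: when n_iter_no_change is set, scikit-learn internally holds out
+         # validation_fraction (here 20%) of the training data to score each new
+         # stage, so gbes effectively trains on fewer samples than gb does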
+         gb = ensemble.GradientBoostingClassifier(n_estimators=n_estimators, random_state=0)
+         start = time.time()
+         gb.fit(X_train, y_train)
+         time_gb.append(time.time() - start)
+
+         start = time.time()
+         gbes.fit(X_train, y_train)
+         time_gbes.append(time.time() - start)
+
+         score_gb.append(gb.score(X_test, y_test))
+         score_gbes.append(gbes.score(X_test, y_test))
+
+         n_gb.append(gb.n_estimators_)
+         n_gbes.append(gbes.n_estimators_)
+
+     bar_width = 0.2
+     n = len(data_list)
+     # Two bars per dataset; the 2.5x factor leaves a gap between dataset groups
+     index = np.arange(0, n * bar_width, bar_width) * 2.5
+     index = index[0:n]
+
+     fig1, axes1 = plt.subplots(figsize=(9, 5))
+
+     bar1 = axes1.bar(
+         index, score_gb, bar_width, label="Without early stopping", color="crimson"
+     )
+     bar2 = axes1.bar(
+         index + bar_width, score_gbes, bar_width, label="With early stopping", color="coral"
+     )
+     axes1.set_xticks(index + bar_width, names)
+     axes1.set_yticks(np.arange(0, 1.3, 0.1))
+
+     def autolabel(ax, rects, n_estimators):
+         """
+         Attach a text label above each bar displaying the n_estimators of each model
+         """
+         for i, rect in enumerate(rects):
+             ax.text(
+                 rect.get_x() + rect.get_width() / 2.0,
+                 1.05 * rect.get_height(),
+                 "n_est=%d" % n_estimators[i],
+                 ha="center",
+                 va="bottom",
+             )
+     autolabel(axes1, bar1, n_gb)
+     autolabel(axes1, bar2, n_gbes)
+
+     axes1.set_xlabel("Datasets")
+     axes1.set_ylabel("Test score")
+     axes1.set_ylim([0, 1.3])
+     axes1.legend(loc="best")
+     axes1.grid(True)
+
+     fig2, axes2 = plt.subplots(figsize=(9, 5))
+
+     bar1 = axes2.bar(
+         index, time_gb, bar_width, label="Without early stopping", color="crimson"
+     )
+     bar2 = axes2.bar(
+         index + bar_width, time_gbes, bar_width, label="With early stopping", color="coral"
+     )
+
+     max_y = np.amax(np.maximum(time_gb, time_gbes))
+
+     axes2.set_xticks(index + bar_width, names)
+     axes2.set_yticks(np.linspace(0, 1.3 * max_y, 13))
+
+     autolabel(axes2, bar1, n_gb)
+     autolabel(axes2, bar2, n_gbes)
+
+     axes2.set_xlabel("Datasets")
+     axes2.set_ylabel("Fit Time")
+     axes2.set_ylim([0, 1.3 * max_y])
+     axes2.legend(loc="best")
+     axes2.grid(True)
+
+     return fig1, fig2
+
+
+ with gr.Blocks(theme=theme) as demo:
+     gr.Markdown('''
+     <div>
+     <h1 style='text-align: center'>Early stopping of Gradient Boosting</h1>
+     </div>
+     ''')
+     gr.Markdown(model_card)
+     gr.Markdown("Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>. Based on the example from <a href=\"https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_early_stopping.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-early-stopping-py\">scikit-learn</a>.")
+     n_samples = gr.Slider(minimum=500, maximum=10000, step=500, value=1000, label="Number of samples")
+     n_estimators = gr.Slider(minimum=50, maximum=300, step=50, value=100, label="Number of estimators")
+     with gr.Row():
+         with gr.Column():
+             plot1 = gr.Plot(label="Test score")
+         with gr.Column():
+             plot2 = gr.Plot(label="Running time")
+
+     # Retrain and redraw both plots whenever either slider changes
+     n_samples.change(fn=do_train, inputs=[n_samples, n_estimators], outputs=[plot1, plot2])
+     n_estimators.change(fn=do_train, inputs=[n_samples, n_estimators], outputs=[plot1, plot2])
+
+ demo.launch(enable_queue=True)
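
For reference, a minimal sketch of the comparison `do_train` runs for each dataset, stripped of the Gradio UI and plotting (assumes only scikit-learn is installed; the dataset, sample count, and constructor arguments mirror the app's defaults):

```python
from sklearn import datasets, ensemble
from sklearn.model_selection import train_test_split

# One of the three datasets the app trains on
X, y = datasets.make_hastie_10_2(n_samples=1000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# With early stopping: fitting halts once the internal validation score has not
# improved by tol for n_iter_no_change consecutive stages
gbes = ensemble.GradientBoostingClassifier(
    n_estimators=100, validation_fraction=0.2, n_iter_no_change=5, tol=0.01, random_state=0
).fit(X_train, y_train)

# Without early stopping: all n_estimators stages are always fit
gb = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

print(f"with early stopping:    {gbes.n_estimators_} trees, test score {gbes.score(X_test, y_test):.3f}")
print(f"without early stopping: {gb.n_estimators_} trees, test score {gb.score(X_test, y_test):.3f}")
```

On data like this, the early-stopped model typically fits far fewer trees at a similar test score, which is the gap the app's two bar charts visualize.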