Commit 4a23d33 · raymondEDS committed · 1 parent: f91be81

Week 6 logistic regression

Files changed:
- Reference files/w6_logistic_regression_lab.py +400 -0
- app/__pycache__/main.cpython-311.pyc +0 -0
- app/main.py +6 -4
- app/pages/__pycache__/week_2.cpython-311.pyc +0 -0
- app/pages/__pycache__/week_5.cpython-311.pyc +0 -0
- app/pages/__pycache__/week_6.cpython-311.pyc +0 -0
- app/pages/week_6.py +803 -0
- requirements.txt +2 -1
Reference files/w6_logistic_regression_lab.py
ADDED
@@ -0,0 +1,400 @@
+# -*- coding: utf-8 -*-
+"""W6_Logistic_regression_lab
+
+Automatically generated by Colab.
+
+Original file is located at
+    https://colab.research.google.com/drive/1MG7N2HN-Nxow9fzvc0fzxvp3WyKqtgs8
+
+# 🚀 Logistic Regression Lab: Stock Market Prediction
+
+## Lab Overview
+In this lab, we'll use logistic regression to try to predict whether the stock market goes up or down. Spoiler alert: this is intentionally a challenging prediction problem that will teach us important lessons about when logistic regression works well and when it doesn't.
+
+## Learning Goals
+
+- Apply logistic regression to real data
+- Interpret probabilities and coefficients
+- Understand why some prediction problems are inherently difficult
+- Learn proper model evaluation techniques
+
+## The Stock Market Data
+
+In this lab we will examine the `Smarket` data, which is part of the `ISLP`
+library. This data set consists of percentage returns for the S&P 500
+stock index over 1,250 days, from the beginning of 2001 until the end
+of 2005. For each date, we have recorded the percentage returns for
+each of the five previous trading days, `Lag1` through
+`Lag5`. We have also recorded `Volume` (the number of
+shares traded on the previous day, in billions), `Today` (the
+percentage return on the date in question) and `Direction`
+(whether the market was `Up` or `Down` on this date).
+
+### Your Challenge
+**Question:** Can we predict whether the S&P 500 will go up or down based on recent trading patterns?
+
+**Why this matters:** If the market were predictable, that would be incredibly valuable. If it is not, we learn about market efficiency and about realistic expectations for prediction models.
+
+To answer the question, **we start by importing our libraries at the top level; these are all imports we have seen in previous labs.**
+"""
+
+import numpy as np
+import pandas as pd
+from matplotlib.pyplot import subplots
+import statsmodels.api as sm
+from ISLP import load_data
+from ISLP.models import (ModelSpec as MS,
+                         summarize)
+
+"""We also collect together the new imports needed for this lab."""
+
+from ISLP import confusion_table
+from ISLP.models import contrast
+from sklearn.discriminant_analysis import \
+    (LinearDiscriminantAnalysis as LDA,
+     QuadraticDiscriminantAnalysis as QDA)
+from sklearn.naive_bayes import GaussianNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+
+"""Now we are ready to load the `Smarket` data."""
+
+Smarket = load_data('Smarket')
+Smarket
+
+"""This gives a truncated listing of the data.
+We can see what the variable names are.
+"""
+
+Smarket.columns
+
+"""We compute the correlation matrix using the `corr()` method
+for data frames, which produces a matrix that contains all of
+the pairwise correlations among the variables.
+
+By instructing `pandas` to use only numeric variables, the `corr()` method does not report a correlation for the `Direction` variable because it is
+qualitative.
+
+
+"""
+
+Smarket.corr(numeric_only=True)
+
+"""As one would expect, the correlations between the lagged return variables and
+today's return are close to zero. The only substantial correlation is between `Year` and
+`Volume`. By plotting the data we see that `Volume`
+is increasing over time. In other words, the average number of shares traded
+daily increased from 2001 to 2005.
+"""
+
+Smarket.plot(y='Volume');
+
+"""## Logistic Regression
+Next, we will fit a logistic regression model in order to predict
+`Direction` using `Lag1` through `Lag5` and
+`Volume`. The `sm.GLM()` function fits *generalized linear models*, a class of
+models that includes logistic regression. Alternatively,
+the function `sm.Logit()` fits a logistic regression
+model directly. The syntax of
+`sm.GLM()` is similar to that of `sm.OLS()`, except
+that we must pass in the argument `family=sm.families.Binomial()`
+in order to tell `statsmodels` to run a logistic regression rather than some other
+type of generalized linear model.
+"""
+
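For reference, the model fitted in the next cell is ordinary logistic regression: the log-odds of an `Up` day are modeled as a linear function of the predictors,

$$\log\frac{p(X)}{1-p(X)} = \beta_0 + \beta_1\,\mathrm{Lag1} + \cdots + \beta_5\,\mathrm{Lag5} + \beta_6\,\mathrm{Volume}, \qquad p(X) = P(\texttt{Direction} = \texttt{Up} \mid X),$$

so the fitted probabilities reported later are $p(X) = e^{z}/(1+e^{z})$ with $z$ the linear predictor above. This is exactly what `sm.GLM` with `family=sm.families.Binomial()` fits.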
+allvars = Smarket.columns.drop(['Today', 'Direction', 'Year'])
+design = MS(allvars)
+X = design.fit_transform(Smarket)
+y = Smarket.Direction == 'Up'
+glm = sm.GLM(y,
+             X,
+             family=sm.families.Binomial())
+results = glm.fit()
+summarize(results)
+
+"""The smallest *p*-value here is associated with `Lag1`. The
+negative coefficient for this predictor suggests that if the market
+had a positive return yesterday, then it is less likely to go up
+today. However, at a value of 0.15, the *p*-value is still
+relatively large, and so there is no clear evidence of a real
+association between `Lag1` and `Direction`.
+
+We use the `params` attribute of `results`
+in order to access just the
+coefficients for this fitted model.
+"""
+
+results.params
+
+"""Likewise we can use the
+`pvalues` attribute to access the *p*-values for the coefficients.
+"""
+
+results.pvalues
+
+"""The `predict()` method of `results` can be used to predict the
+probability that the market will go up, given values of the
+predictors. This method returns predictions
+on the probability scale. If no data set is supplied to the `predict()`
+function, then the probabilities are computed for the training data
+that was used to fit the logistic regression model.
+As with linear regression, one can pass an optional `exog` argument consistent
+with a design matrix if desired. Here we have
+printed only the first ten probabilities.
+"""
+
+probs = results.predict()
+probs[:10]
+
+"""In order to make a prediction as to whether the market will go up or
+down on a particular day, we must convert these predicted
+probabilities into class labels, `Up` or `Down`. The
+following two commands create a vector of class predictions based on
+whether the predicted probability of a market increase is greater than
+or less than 0.5.
+"""
+
+labels = np.array(['Down']*1250)
+labels[probs>0.5] = "Up"
+
+"""The `confusion_table()`
+function from the `ISLP` package summarizes these predictions, showing how
+many observations were correctly or incorrectly classified. Our function, which is adapted from a similar function
+in the module `sklearn.metrics`, transposes the resulting
+matrix and includes row and column labels.
+The `confusion_table()` function takes as first argument the
+predicted labels, and second argument the true labels.
+"""
+
+confusion_table(labels, Smarket.Direction)
+
+"""The diagonal elements of the confusion matrix indicate correct
+predictions, while the off-diagonals represent incorrect
+predictions. Hence our model correctly predicted that the market would
+go up on 507 days and that it would go down on 145 days, for a
+total of 507 + 145 = 652 correct predictions. The `np.mean()`
+function can be used to compute the fraction of days for which the
+prediction was correct. In this case, logistic regression correctly
+predicted the movement of the market 52.2% of the time.
+"""
+
+(507+145)/1250, np.mean(labels == Smarket.Direction)
+
+"""At first glance, it appears that the logistic regression model is
+working a little better than random guessing. However, this result is
+misleading because we trained and tested the model on the same set of
+1,250 observations. In other words, $100-52.2=47.8$% is the
+*training* error rate. As we have seen
+previously, the training error rate is often overly optimistic --- it
+tends to underestimate the test error rate. In
+order to better assess the accuracy of the logistic regression model
+in this setting, we can fit the model using part of the data, and
+then examine how well it predicts the *held out* data. This
+will yield a more realistic error rate, in the sense that in practice
+we will be interested in our model's performance not on the data that
+we used to fit the model, but rather on days in the future for which
+the market's movements are unknown.
+
+To implement this strategy, we first create a Boolean vector
+corresponding to the observations from 2001 through 2004. We then
+use this vector to create a held out data set of observations from
+2005.
+"""
+
+train = (Smarket.Year < 2005)
+Smarket_train = Smarket.loc[train]
+Smarket_test = Smarket.loc[~train]
+Smarket_test.shape
+
+"""The object `train` is a vector of 1,250 elements, corresponding
+to the observations in our data set. The elements of the vector that
+correspond to observations that occurred before 2005 are set to
+`True`, whereas those that correspond to observations in 2005 are
+set to `False`. Hence `train` is a
+*boolean* array, since its
+elements are `True` and `False`. Boolean arrays can be used
+to obtain a subset of the rows or columns of a data frame
+using the `loc` method. For instance,
+the command `Smarket.loc[train]` would pick out a submatrix of the
+stock market data set, corresponding only to the dates before 2005,
+since those are the ones for which the elements of `train` are
+`True`. The `~` symbol can be used to negate all of the
+elements of a Boolean vector. That is, `~train` is a vector
+similar to `train`, except that the elements that are `True`
+in `train` get swapped to `False` in `~train`, and vice versa.
+Therefore, `Smarket.loc[~train]` yields a
+subset of the rows of the data frame
+of the stock market data containing only the observations for which
+`train` is `False`.
+The output above indicates that there are 252 such
+observations.
+
+We now fit a logistic regression model using only the subset of the
+observations that correspond to dates before 2005. We then obtain predicted probabilities of the
+stock market going up for each of the days in our test set --- that is,
+for the days in 2005.
+"""
+
+X_train, X_test = X.loc[train], X.loc[~train]
+y_train, y_test = y.loc[train], y.loc[~train]
+glm_train = sm.GLM(y_train,
+                   X_train,
+                   family=sm.families.Binomial())
+results = glm_train.fit()
+probs = results.predict(exog=X_test)
+
+"""Notice that we have trained and tested our model on two completely
+separate data sets: training was performed using only the dates before
+2005, and testing was performed using only the dates in 2005.
+
+Finally, we compare the predictions for 2005 to the
+actual movements of the market over that time period.
+We will first store the test and training labels (recall `y_test` is binary).
+"""
+
+D = Smarket.Direction
+L_train, L_test = D.loc[train], D.loc[~train]
+
+"""Now we threshold the
+fitted probability at 50% to form
+our predicted labels.
+"""
+
+labels = np.array(['Down']*252)
+labels[probs>0.5] = 'Up'
+confusion_table(labels, L_test)
+
+"""The test accuracy is about 48%, while the error rate is about 52%."""
+
+np.mean(labels == L_test), np.mean(labels != L_test)
+
+"""The `!=` notation means *not equal to*, and so the last command
+computes the test set error rate. The results are rather
+disappointing: the test error rate is 52%, which is worse than
+random guessing! Of course this result is not all that surprising,
+given that one would not generally expect to be able to use previous
+days' returns to predict future market performance. (After all, if it
+were possible to do so, then the authors of this book would be out
+striking it rich rather than writing a statistics textbook.)
+
+We recall that the logistic regression model had very underwhelming
+*p*-values associated with all of the predictors, and that the
+smallest *p*-value, though not very small, corresponded to
+`Lag1`. Perhaps by removing the variables that appear not to be
+helpful in predicting `Direction`, we can obtain a more
+effective model. After all, using predictors that have no relationship
+with the response tends to cause a deterioration in the test error
+rate (since such predictors cause an increase in variance without a
+corresponding decrease in bias), and so removing such predictors may
+in turn yield an improvement. Below we refit the logistic
+regression using just `Lag1` and `Lag2`, which seemed to
+have the highest predictive power in the original logistic regression
+model.
+"""
+
+model = MS(['Lag1', 'Lag2']).fit(Smarket)
+X = model.transform(Smarket)
+X_train, X_test = X.loc[train], X.loc[~train]
+glm_train = sm.GLM(y_train,
+                   X_train,
+                   family=sm.families.Binomial())
+results = glm_train.fit()
+probs = results.predict(exog=X_test)
+labels = np.array(['Down']*252)
+labels[probs>0.5] = 'Up'
+confusion_table(labels, L_test)
+
+"""Let's evaluate the overall accuracy as well as the accuracy within the days when
+logistic regression predicts an increase.
+"""
+
+(35+106)/252, 106/(106+76)
+
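The two numbers in the cell above are copied by hand from the confusion table. A small supplementary sketch (not part of the committed file, reusing the lab's `labels` and `L_test`) computes the same quantities directly, which avoids stale hard-coded counts:

np.mean(labels == L_test)                 # overall test accuracy (~0.56 here)
np.mean(L_test[labels == 'Up'] == 'Up')   # accuracy on days the model predicts Up (~0.58)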
+"""Now the results appear to be a little better: 56% of the daily
+movements have been correctly predicted. It is worth noting that in
+this case, a much simpler strategy of predicting that the market will
+increase every day will also be correct 56% of the time! Hence, in
+terms of overall error rate, the logistic regression method is no
+better than the naive approach. However, the confusion matrix
+shows that on days when logistic regression predicts an increase in
+the market, it has a 58% accuracy rate. This suggests a possible
+trading strategy of buying on days when the model predicts an
+increasing market, and avoiding trades on days when a decrease is
+predicted. Of course one would need to investigate more carefully
+whether this small improvement was real or just due to random chance.
+
+Suppose that we want to predict the returns associated with particular
+values of `Lag1` and `Lag2`. In particular, we want to
+predict `Direction` on a day when `Lag1` and
+`Lag2` equal $1.2$ and $1.1$, respectively, and on a day when they
+equal $1.5$ and $-0.8$. We do this using the `predict()`
+function.
+"""
+
+newdata = pd.DataFrame({'Lag1':[1.2, 1.5],
+                        'Lag2':[1.1, -0.8]});
+newX = model.transform(newdata)
+results.predict(newX)
+
+Smarket
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import classification_report, confusion_matrix
+import statsmodels.api as sm
+
+# Load the dataset
+data = load_data('Smarket')
+
+# Display the first few rows of the dataset
+print(data.head())
+
+# Prepare the data for logistic regression
+# Using 'Lag1' and 'Lag2' as predictors and 'Direction' as the response
+data['Direction'] = data['Direction'].map({'Up': 1, 'Down': 0})
+X = data[['Lag1', 'Lag2']]
+y = data['Direction']
+
+# Split the data into training and testing sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+
+# Fit the logistic regression model
+log_reg = LogisticRegression()
+log_reg.fit(X_train, y_train)
+
+# Make predictions on the test set
+y_pred = log_reg.predict(X_test)
+
+# Print classification report and confusion matrix
+print(classification_report(y_test, y_pred))
+print(confusion_matrix(y_test, y_pred))
+
+# Visualize the decision boundary
+plt.figure(figsize=(10, 6))
+
+# Create a mesh grid for plotting decision boundary
+x_min, x_max = X['Lag1'].min() - 1, X['Lag1'].max() + 1
+y_min, y_max = X['Lag2'].min() - 1, X['Lag2'].max() + 1
+xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
+                     np.arange(y_min, y_max, 0.01))
+
+# Predict the class label for the whole grid
+Z = log_reg.predict(np.c_[xx.ravel(), yy.ravel()])
+Z = Z.reshape(xx.shape)
+
+# Plot the decision boundary
+plt.contourf(xx, yy, Z, alpha=0.8)
+plt.scatter(X_test['Lag1'], X_test['Lag2'], c=y_test, edgecolor='k', s=20)
+plt.xlabel('Lag1')
+plt.ylabel('Lag2')
+plt.title('Logistic Regression Decision Boundary')
+plt.show()
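One caveat on the block above (and on the similar grid code in app/pages/week_6.py): `log_reg` is fitted on a DataFrame with named columns, so recent scikit-learn versions warn that "X does not have valid feature names" when `predict` is then called on a bare NumPy array such as `np.c_[xx.ravel(), yy.ravel()]`. A minimal fix, sketched here rather than part of the commit, is to rebuild the grid points with the same column names used at fit time:

grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=['Lag1', 'Lag2'])
Z = log_reg.predict(grid).reshape(xx.shape)  # same labels, no feature-name warning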
app/__pycache__/main.cpython-311.pyc
CHANGED
Binary files a/app/__pycache__/main.cpython-311.pyc and b/app/__pycache__/main.cpython-311.pyc differ
app/main.py
CHANGED
@@ -8,8 +8,7 @@ from sklearn.linear_model import LinearRegression
 import nltk
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize, sent_tokenize
 
-nltk.download('stopwords')
 
 # Add the parent directory to the Python path
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -23,6 +22,7 @@ from app.pages import week_2
 from app.pages import week_3
 from app.pages import week_4
 from app.pages import week_5
+from app.pages import week_6
 # Page configuration
 st.set_page_config(
     page_title="Data Science Course App",
@@ -149,6 +149,8 @@ def show_week_content():
         week_4.show()
     elif st.session_state.current_week == 5:
         week_5.show()
+    elif st.session_state.current_week == 6:
+        week_6.show()
     else:
         st.warning("Content for this week is not yet available.")
 
@@ -161,14 +163,14 @@ def main():
         return
 
     # User is logged in, show course content
-    if st.session_state.current_week in [1, 2, 3, 4, 5]:
+    if st.session_state.current_week in [1, 2, 3, 4, 5, 6]:
         show_week_content()
     else:
         st.title("Data Science Research Paper Course")
         st.markdown("""
        ## Welcome to the Data Science Research Paper Course! 📚
 
-       This section has not
+       This section has not been released yet.
        """)
 
 if __name__ == "__main__":
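A side note on the routing above: `show_week_content` gains one `elif` and `main` one list entry for every new week. A registry dict is a common way to make this a single-line change per week; a sketch, assuming a `week_1` module exists alongside the others (only `week_2` through `week_6` are visible in this diff):

import streamlit as st
from app.pages import week_1, week_2, week_3, week_4, week_5, week_6

# Map week numbers to page modules; adding a week is one new entry here,
# and the membership test in main() becomes `current_week in WEEK_PAGES`.
WEEK_PAGES = {1: week_1, 2: week_2, 3: week_3, 4: week_4, 5: week_5, 6: week_6}

def show_week_content():
    page = WEEK_PAGES.get(st.session_state.current_week)
    if page is not None:
        page.show()
    else:
        st.warning("Content for this week is not yet available.")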
app/pages/__pycache__/week_2.cpython-311.pyc
CHANGED
Binary files a/app/pages/__pycache__/week_2.cpython-311.pyc and b/app/pages/__pycache__/week_2.cpython-311.pyc differ
app/pages/__pycache__/week_5.cpython-311.pyc
CHANGED
Binary files a/app/pages/__pycache__/week_5.cpython-311.pyc and b/app/pages/__pycache__/week_5.cpython-311.pyc differ
app/pages/__pycache__/week_6.cpython-311.pyc
ADDED
Binary file (34.6 kB)
app/pages/week_6.py
ADDED
@@ -0,0 +1,803 @@
+import streamlit as st
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+from sklearn.preprocessing import StandardScaler
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+import scipy.stats as stats
+from pathlib import Path
+import statsmodels.api as sm
+from ISLP import load_data
+from ISLP.models import ModelSpec as MS, summarize
+
+# Set up the style for all plots
+plt.style.use('default')
+sns.set_theme(style="whitegrid", palette="husl")
+
+def load_smarket_data():
+    """Load and prepare the Smarket data"""
+    try:
+        Smarket = load_data('Smarket')
+        return Smarket
+    except Exception as e:
+        st.error(f"Error loading Smarket data: {str(e)}")
+        return None
+
+def create_confusion_matrix_plot(y_true, y_pred, title="Confusion Matrix"):
+    """Create an interactive confusion matrix plot"""
+    cm = confusion_matrix(y_true, y_pred)
+    fig = go.Figure(data=go.Heatmap(
+        z=cm,
+        x=['Predicted Down', 'Predicted Up'],
+        y=['Actual Down', 'Actual Up'],
+        colorscale='RdBu',
+        text=[[str(val) for val in row] for row in cm],
+        texttemplate='%{text}',
+        textfont={"size": 16}
+    ))
+
+    fig.update_layout(
+        title=title,
+        title_x=0.5,
+        title_font_size=20,
+        plot_bgcolor='rgb(30, 30, 30)',
+        paper_bgcolor='rgb(30, 30, 30)',
+        font=dict(color='white')
+    )
+    return fig
+
+def create_correlation_heatmap(df):
+    """Create a correlation heatmap using plotly"""
+    corr = df.corr(numeric_only=True)
+
+    fig = go.Figure(data=go.Heatmap(
+        z=corr,
+        x=corr.columns,
+        y=corr.columns,
+        colorscale='RdBu',
+        zmin=-1, zmax=1,
+        text=[[f'{val:.2f}' for val in row] for row in corr.values],
+        texttemplate='%{text}',
+        textfont={"size": 12}
+    ))
+
+    fig.update_layout(
+        title='S&P 500 Returns Correlation Heatmap',
+        title_x=0.5,
+        title_font_size=20,
+        plot_bgcolor='rgb(30, 30, 30)',
+        paper_bgcolor='rgb(30, 30, 30)',
+        font=dict(color='white')
+    )
+    return fig
+
+def create_decision_boundary_plot(X, y, model):
+    """Create an interactive decision boundary plot using plotly"""
+    # Create a mesh grid
+    x_min, x_max = X['Lag1'].min() - 1, X['Lag1'].max() + 1
+    y_min, y_max = X['Lag2'].min() - 1, X['Lag2'].max() + 1
+    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
+                         np.arange(y_min, y_max, 0.01))
+
+    # Get predictions for the mesh grid
+    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
+    Z = Z.reshape(xx.shape)
+
+    # Create the plot
+    fig = go.Figure()
+
+    # Add the decision boundary
+    fig.add_trace(go.Contour(
+        x=np.arange(x_min, x_max, 0.01),
+        y=np.arange(y_min, y_max, 0.01),
+        z=Z,
+        colorscale='RdBu',
+        showscale=False,
+        opacity=0.5
+    ))
+
+    # Add the scatter points
+    fig.add_trace(go.Scatter(
+        x=X['Lag1'],
+        y=X['Lag2'],
+        mode='markers',
+        marker=dict(
+            color=y,
+            colorscale='RdBu',
+            size=8,
+            line=dict(color='black', width=1)
+        ),
+        name='Data Points'
+    ))
+
+    # Update layout
+    fig.update_layout(
+        title='Logistic Regression Decision Boundary',
+        xaxis_title='Lag1',
+        yaxis_title='Lag2',
+        plot_bgcolor='rgb(30, 30, 30)',
+        paper_bgcolor='rgb(30, 30, 30)',
+        font=dict(color='white'),
+        showlegend=False
+    )
+
+    return fig
+
+def show():
+    st.title("Week 6: Logistic Regression and Stock Market Prediction")
+
+    # Introduction Section
+    st.header("Course Overview")
+    st.write("""
+    This week, we'll use logistic regression to try to predict whether the stock market goes up or down.
+    This is intentionally a challenging prediction problem that will teach us important lessons about:
+    - When logistic regression works well and when it doesn't
+    - How to interpret probabilities and coefficients
+    - Why some prediction problems are inherently difficult
+    - Proper model evaluation techniques
+    """)
+
+    # Learning Path
+    st.subheader("Learning Path")
+    st.write("""
+    1. Understanding the Stock Market Data: S&P 500 returns and predictors
+    2. Logistic Regression Fundamentals: From linear to logistic
+    3. Model Training and Evaluation: Proper train-test splitting
+    4. Interpreting Results: Coefficients and probabilities
+    5. Model Assessment: Confusion matrices and metrics
+    6. Real-world Applications: Challenges and limitations
+    """)
+
+    # Module 1: Understanding the Data
+    st.header("Module 1: Understanding the Stock Market Data")
+    st.write("""
+    We'll examine the Smarket data, which consists of percentage returns for the S&P 500 stock index over 1,250 days,
+    from the beginning of 2001 until the end of 2005. For each date, we have:
+    - Percentage returns for each of the five previous trading days (Lag1 through Lag5)
+    - Volume (number of shares traded on the previous day, in billions)
+    - Today (percentage return on the date in question)
+    - Direction (whether the market was Up or Down on this date)
+    """)
+
+    # Load and display data
+    Smarket = load_smarket_data()
+    if Smarket is not None:
+        st.write("First few rows of the Smarket data:")
+        st.dataframe(Smarket.head())
+
+        # EDA Plots
+        st.subheader("Exploratory Data Analysis")
+
+        # Volume over time
+        st.write("**Trading Volume Over Time**")
+        fig_volume = go.Figure()
+        fig_volume.add_trace(go.Scatter(
+            x=Smarket.index,
+            y=Smarket['Volume'],
+            mode='lines',
+            name='Volume'
+        ))
+        fig_volume.update_layout(
+            title='Trading Volume Over Time',
+            xaxis_title='Time',
+            yaxis_title='Volume (billions of shares)',
+            plot_bgcolor='rgb(30, 30, 30)',
+            paper_bgcolor='rgb(30, 30, 30)',
+            font=dict(color='white')
+        )
+        st.plotly_chart(fig_volume)
+
+        # Returns distribution
+        st.write("**Distribution of Returns**")
+
+        # Add column selection
+        selected_columns = st.multiselect(
+            "Select columns to display",
+            options=['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Today'],
+            default=['Lag1', 'Lag2']
+        )
+
+        if selected_columns:
+            fig_returns = go.Figure()
+            for col in selected_columns:
+                fig_returns.add_trace(go.Histogram(
+                    x=Smarket[col],
+                    name=col,
+                    opacity=0.7,
+                    nbinsx=50  # Adjust number of bins for better visualization
+                ))
+
+            # Add mean and std lines
+            for col in selected_columns:
+                mean_val = Smarket[col].mean()
+                std_val = Smarket[col].std()
+                fig_returns.add_vline(
+                    x=mean_val,
+                    line_dash="dash",
+                    line_color="red",
+                    annotation_text=f"{col} Mean: {mean_val:.2f}%",
+                    annotation_position="top right",
+                    annotation=dict(
+                        textangle=-45,
+                        font=dict(size=10)
+                    )
+                )
+                fig_returns.add_vline(
+                    x=mean_val + std_val,
+                    line_dash="dot",
+                    line_color="yellow",
+                    annotation_text=f"{col} +1σ: {mean_val + std_val:.2f}%",
+                    annotation_position="top right",
+                    annotation=dict(
+                        textangle=-45,
+                        font=dict(size=10)
+                    )
+                )
+                fig_returns.add_vline(
+                    x=mean_val - std_val,
+                    line_dash="dot",
+                    line_color="yellow",
+                    annotation_text=f"{col} -1σ: {mean_val - std_val:.2f}%",
+                    annotation_position="top right",
+                    annotation=dict(
+                        textangle=-45,
+                        font=dict(size=10)
+                    )
+                )
+
+            fig_returns.update_layout(
+                title='Distribution of Returns',
+                xaxis_title='Return (%)',
+                yaxis_title='Frequency',
+                barmode='overlay',
+                plot_bgcolor='rgb(30, 30, 30)',
+                paper_bgcolor='rgb(30, 30, 30)',
+                font=dict(color='white'),
+                showlegend=True,
+                legend=dict(
+                    yanchor="top",
+                    y=0.99,
+                    xanchor="left",
+                    x=0.01
+                )
+            )
+
+            # Add summary statistics
+            st.write("**Summary Statistics**")
+            summary_stats = Smarket[selected_columns].describe()
+            st.dataframe(summary_stats.style.format('{:.2f}'))
+
+            st.plotly_chart(fig_returns)
+
+            # Add interpretation
+            st.write("""
+            **Interpretation:**
+            - The dashed red line shows the mean return for each selected period
+            - The dotted yellow lines show one standard deviation above and below the mean
+            - The overlap of distributions helps identify similarities in return patterns
+            - Wider distributions indicate higher volatility
+            """)
+
+        # Returns over time
+        st.write("**Returns Over Time**")
+        fig_returns_time = go.Figure()
+        fig_returns_time.add_trace(go.Scatter(
+            x=Smarket.index,
+            y=Smarket['Today'],
+            mode='lines',
+            name='Today\'s Return'
+        ))
+        fig_returns_time.update_layout(
+            title='Daily Returns Over Time',
+            xaxis_title='Time',
+            yaxis_title='Return (%)',
+            plot_bgcolor='rgb(30, 30, 30)',
+            paper_bgcolor='rgb(30, 30, 30)',
+            font=dict(color='white')
+        )
+        st.plotly_chart(fig_returns_time)
+
+        # Direction distribution
+        st.write("**Market Direction Distribution**")
+        direction_counts = Smarket['Direction'].value_counts()
+        fig_direction = go.Figure(data=[go.Pie(
+            labels=direction_counts.index,
+            values=direction_counts.values,
+            hole=.3
+        )])
+        fig_direction.update_layout(
+            title='Distribution of Market Direction',
+            plot_bgcolor='rgb(30, 30, 30)',
+            paper_bgcolor='rgb(30, 30, 30)',
+            font=dict(color='white')
+        )
+        st.plotly_chart(fig_direction)
+
+        # Show correlation heatmap
+        st.write("**Correlation Analysis**")
+        st.plotly_chart(create_correlation_heatmap(Smarket))
+
+        st.write("""
+        Key observations from the exploratory analysis:
+
+        1. **Trading Volume**:
+           - Shows an increasing trend over time
+           - Higher volatility in recent years
+           - Some periods of unusually high volume
+
+        2. **Returns Distribution**:
+           - Approximately normal distribution
+           - Most returns are close to zero
+           - Some extreme values (outliers)
+
+        3. **Market Direction**:
+           - Relatively balanced between Up and Down days
+           - Slight bias towards Up days
+
+        4. **Correlations**:
+           - Low correlation between lagged returns
+           - Strong correlation between Year and Volume
+           - Today's return shows little correlation with past returns
+        """)
+
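A small aside on the "approximately normal" observation above: `scipy.stats` is imported at the top of this file but never used, and it could back that claim with an explicit check. A sketch, not part of the committed file:

# Jarque-Bera test of normality on daily returns; a very small p-value
# indicates heavier tails than a normal distribution, as is typical of
# financial return series.
jb_stat, jb_p = stats.jarque_bera(Smarket['Today'])
st.write(f"Jarque-Bera statistic: {jb_stat:.1f}, p-value: {jb_p:.3g}")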
+    # Module 2: Logistic Regression Implementation
+    st.header("Module 2: Logistic Regression Implementation")
+    st.write("""
+    We'll fit a logistic regression model to predict Direction using Lag1 through Lag5 and Volume.
+    The model will help us understand if we can predict market movements based on recent trading patterns.
+    """)
+
+    if Smarket is not None:
+        # Prepare data for logistic regression
+        allvars = Smarket.columns.drop(['Today', 'Direction', 'Year'])
+        design = MS(allvars)
+        X = design.fit_transform(Smarket)
+        y = Smarket.Direction == 'Up'
+
+        # Fit the model
+        glm = sm.GLM(y, X, family=sm.families.Binomial())
+        results = glm.fit()
+
+        # Display model summary
+        st.write("Model Summary:")
+        st.write(summarize(results))
+
+        # Show coefficients
+        st.write("Model Coefficients:")
+        coef_df = pd.DataFrame({
+            'Feature': allvars,
+            'Coefficient': results.params[1:],  # Skip the intercept
+            'P-value': results.pvalues[1:]  # Skip the intercept
+        })
+        st.write(coef_df)
+
+    # Module 3: Model Evaluation
+    st.header("Module 3: Model Evaluation")
+    st.write("""
+    We'll evaluate our model using proper train-test splitting, focusing on predicting 2005 data using models trained on 2001-2004 data.
+    This gives us a more realistic assessment of model performance.
+    """)
+
+    if Smarket is not None:
+        # Split data by year
+        train = (Smarket.Year < 2005)
+        X_train, X_test = X.loc[train], X.loc[~train]
+        y_train, y_test = y.loc[train], y.loc[~train]
+
+        # Fit model on training data
+        glm_train = sm.GLM(y_train, X_train, family=sm.families.Binomial())
+        results = glm_train.fit()
+
+        # Make predictions
+        probs = results.predict(exog=X_test)
+        labels = np.array(['Down']*len(probs))
+        labels[probs>0.5] = 'Up'
+
+        # Show confusion matrix
+        st.plotly_chart(create_confusion_matrix_plot(Smarket.Direction[~train], labels))
+
+        # Calculate and display accuracy
+        accuracy = np.mean(labels == Smarket.Direction[~train])
+        st.write(f"Test Accuracy: {accuracy:.2%}")
+
+    # Module 4: Decision Boundary Visualization
+    st.header("Module 4: Decision Boundary Visualization")
+    st.write("""
+    Let's visualize how our logistic regression model separates the market movements using Lag1 and Lag2 as predictors.
+    The decision boundary shows how the model classifies different combinations of previous day returns.
+    """)
+
+    if Smarket is not None:
+        # Prepare data for decision boundary plot
+        X_plot = Smarket[['Lag1', 'Lag2']]
+        y_plot = (Smarket['Direction'] == 'Up').astype(int)
+
+        # Fit a simple logistic regression model for visualization
+        log_reg = LogisticRegression()
+        log_reg.fit(X_plot, y_plot)
+
+        # Create and display the decision boundary plot
+        st.plotly_chart(create_decision_boundary_plot(X_plot, y_plot, log_reg))
+
+        st.write("""
+        The decision boundary plot shows:
+        - Blue regions indicate where the model predicts the market will go down
+        - Red regions indicate where the model predicts the market will go up
+        - The boundary between these regions represents where the model is uncertain
+        - The scatter points show actual market movements, colored by their true direction
+        """)
+
+    # Module 5: Interpreting Logistic Regression Results
+    st.header("Module 5: Interpreting Logistic Regression Results")
+
+    st.subheader("Understanding the Coefficients")
+    st.write("""
+    In logistic regression, coefficients tell us about the relationship between predictors and the probability of the outcome.
+    Let's break down how to interpret them:
+
+    1. **Coefficient Sign**:
+       - Positive coefficients increase the probability of the outcome (market going up)
+       - Negative coefficients decrease the probability of the outcome (market going down)
+
+    2. **Coefficient Magnitude**:
+       - Larger absolute values indicate stronger effects
+       - The effect is non-linear due to the logistic function
+    """)
+
+    # Add visualization comparing linear and logistic regression
+    st.write("**Linear vs Logistic Regression**")
+
+    # Create sample data
+    x = np.linspace(-5, 5, 100)
+    y_linear = 0.5 * x + 0.5  # Linear regression
+    y_logistic = 1 / (1 + np.exp(-(2 * x)))  # Logistic regression with steeper slope
+
+    # Create the comparison plot
+    fig_comparison = go.Figure()
+
+    # Add linear regression line
+    fig_comparison.add_trace(go.Scatter(
+        x=x,
+        y=y_linear,
+        mode='lines',
+        name='Linear Regression',
+        line=dict(color='blue', width=2)
+    ))
+
+    # Add logistic regression curve
+    fig_comparison.add_trace(go.Scatter(
+        x=x,
+        y=y_logistic,
+        mode='lines',
+        name='Logistic Regression',
+        line=dict(color='red', width=2)
+    ))
+
+    # Add some sample points with more extreme separation
+    np.random.seed(42)
+    x_samples = np.random.normal(0, 1, 50)
+    # Make the separation more clear
+    y_samples = (x_samples > 0.5).astype(int)  # Threshold at 0.5 for clearer separation
+
+    fig_comparison.add_trace(go.Scatter(
+        x=x_samples,
+        y=y_samples,
+        mode='markers',
+        name='Sample Data',
+        marker=dict(
+            color=['red' if y == 0 else 'green' for y in y_samples],
+            size=8,
+            symbol='circle'
+        )
+    ))
+
+    # Update layout
+    fig_comparison.update_layout(
+        title='Linear vs Logistic Regression',
+        xaxis_title='Input Feature (X)',
+        yaxis_title='Output',
+        plot_bgcolor='rgb(30, 30, 30)',
+        paper_bgcolor='rgb(30, 30, 30)',
+        font=dict(color='white'),
+        showlegend=True,
+        legend=dict(
+            yanchor="top",
+            y=0.99,
+            xanchor="left",
+            x=0.01
+        ),
+        yaxis=dict(
+            range=[-0.1, 1.1]  # Extend y-axis range slightly
+        )
+    )
+
+    # Add annotations
+    fig_comparison.add_annotation(
+        x=2, y=0.8,
+        text="Linear Regression<br>predicts continuous values",
+        showarrow=True,
+        arrowhead=1,
+        ax=50, ay=-30,
+        font=dict(color='white', size=10)
+    )
+
+    fig_comparison.add_annotation(
+        x=2, y=0.3,
+        text="Logistic Regression<br>predicts probabilities<br>(S-shaped curve)",
+        showarrow=True,
+        arrowhead=1,
+        ax=50, ay=30,
+        font=dict(color='white', size=10)
+    )
+
+    # Add decision boundary annotation
+    fig_comparison.add_annotation(
+        x=0, y=0.5,
+        text="Decision Boundary<br>(p = 0.5)",
+        showarrow=True,
+        arrowhead=1,
+        ax=0, ay=-40,
+        font=dict(color='white', size=10)
+    )
+
+    st.plotly_chart(fig_comparison)
+
+    st.write("""
+    **Key Differences:**
+
+    1. **Output Range**:
+       - Linear Regression: Can predict any value (-∞ to +∞)
+       - Logistic Regression: Predicts probabilities (0 to 1)
+
+    2. **Function Shape**:
+       - Linear Regression: Straight line
+       - Logistic Regression: S-shaped curve (sigmoid)
+       - The sigmoid function creates a sharp transition around the decision boundary
+
+    3. **Use Case**:
+       - Linear Regression: Predicting continuous values
+       - Logistic Regression: Predicting binary outcomes (Up/Down)
+
+    4. **Interpretation**:
+       - Linear Regression: Direct relationship between X and Y
+       - Logistic Regression: Non-linear relationship between X and probability of Y
+       - Small changes in X can lead to large changes in probability near the decision boundary
+    """)
+
+    if Smarket is not None:
+        # Calculate and display coefficients
+        st.subheader("Example: Interpreting Our Model's Coefficients")
+
+        # Get coefficients from the model
+        coef_results = pd.DataFrame({
+            'Feature': allvars,
+            'Coefficient': results.params[1:],
+            'P-value': results.pvalues[1:]
+        })
+
+        st.write("Coefficient Analysis:")
+        st.dataframe(coef_results.style.format({
+            'Coefficient': '{:.4f}',
+            'P-value': '{:.4f}'
+        }))
+
+        st.write("""
+        Let's interpret some examples from our model:
+
+        1. **Lag1 Coefficient**:
+           - A positive coefficient means that higher values of Lag1 are associated with higher probability of the market going up
+           - The magnitude tells us how strong this relationship is
+
+        2. **Volume Coefficient**:
+           - A positive coefficient suggests that higher trading volume is associated with higher probability of upward market movement
+           - The size of the coefficient indicates the strength of this relationship
+        """)
+
+    st.subheader("Understanding Model Performance")
+    st.write("""
+    Our model's performance metrics tell us important information:
+
+    1. **Accuracy**:
+       - The proportion of correct predictions
+       - In our case, around 52% accuracy on the test set
+       - This is slightly better than random guessing (50%)
+
+    2. **Confusion Matrix**:
+       The confusion matrix is a 2x2 table that shows:
+
+       - **True Positives (TP)**:
+         - Correctly predicted market going up
+         - These are the cases where we predicted 'Up' and the market actually went up
+
+       - **False Positives (FP)**:
+         - Incorrectly predicted market going up
+         - These are the cases where we predicted 'Up' but the market actually went down
+         - Also known as Type I errors
+
+       - **True Negatives (TN)**:
+         - Correctly predicted market going down
+         - These are the cases where we predicted 'Down' and the market actually went down
+
+       - **False Negatives (FN)**:
+         - Incorrectly predicted market going down
+         - These are the cases where we predicted 'Down' but the market actually went up
+         - Also known as Type II errors
+
+       From these values, we can calculate important metrics:
+       - **Precision** = TP / (TP + FP): How many of our 'Up' predictions were correct
+       - **Recall** = TP / (TP + FN): How many of the actual 'Up' days did we catch
+       - **F1 Score** = 2 * (Precision * Recall) / (Precision + Recall): Balanced measure of precision and recall
+       - **Accuracy** = (TP + TN) / (TP + TN + FP + FN): Overall correct predictions
+
+    3. **P-values**:
+       - Indicate statistical significance of each predictor
+       - P-value < 0.05 suggests the predictor is significant
+       - In our case, most predictors are not statistically significant
+    """)
+
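The metric formulas listed above map directly to code. A standalone sketch (not part of the committed file) that computes them from scikit-learn's confusion matrix, for any arrays of 'Up'/'Down' labels `y_true` and `y_pred`, with 'Up' as the positive class:

from sklearn.metrics import confusion_matrix

# labels=['Down', 'Up'] fixes the row/column order so the unpacking is
# unambiguous: rows are true classes, columns are predicted classes.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=['Down', 'Up']).ravel()

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)
accuracy = (tp + tn) / (tp + tn + fp + fn)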
+    st.subheader("Practical Implications")
+    st.write("""
+    What does this mean for real-world trading?
+
+    1. **Model Limitations**:
+       - The model's accuracy is only slightly better than random guessing
+       - This suggests that predicting market direction is inherently difficult
+       - Past returns alone are not reliable predictors
+
+    2. **Risk Management**:
+       - Even with a model, trading decisions should include:
+         - Stop-loss orders
+         - Position sizing
+         - Diversification
+         - Risk tolerance considerations
+
+    3. **Model Improvement**:
+       - Consider adding more features:
+         - Technical indicators
+         - Market sentiment
+         - Economic indicators
+       - Use more sophisticated models:
+         - Ensemble methods
+         - Deep learning
+         - Time series models
+    """)
+
+    st.subheader("Example: Making a Prediction")
+    st.write("""
+    Let's walk through an example of making a prediction:
+
+    1. **Input Data**:
+       - Lag1 = 1.2% (yesterday's return)
+       - Lag2 = -0.8% (day before yesterday's return)
+       - Volume = 1.1 billion shares
+
+    2. **Calculate Probability**:
+       - Use the logistic function: P(Y=1) = 1 / (1 + e^(-z))
+       - where z = β₀ + β₁(Lag1) + β₂(Lag2) + ... + β₆(Volume)
+
+    3. **Interpret Result**:
+       - If P(Y=1) > 0.5, predict market will go up
+       - If P(Y=1) < 0.5, predict market will go down
+       - The probability itself tells us about confidence
+    """)
+
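The walkthrough's two formulas are easy to check numerically. A sketch with made-up coefficient values (the real ones would come from `results.params`; not part of the committed file):

import numpy as np

beta = np.array([-0.12, -0.07, -0.04, 0.13])  # hypothetical (intercept, Lag1, Lag2, Volume)
x = np.array([1.0, 1.2, -0.8, 1.1])           # leading 1.0 multiplies the intercept

z = beta @ x                    # linear predictor z = β₀ + β₁·Lag1 + β₂·Lag2 + β₆·Volume
p_up = 1 / (1 + np.exp(-z))     # logistic function P(Y=1)
print('Up' if p_up > 0.5 else 'Down', round(p_up, 3))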
+    if Smarket is not None:
+        # Example prediction
+        st.write("**Interactive Example:**")
+        col1, col2, col3 = st.columns(3)
+
+        with col1:
+            lag1 = st.number_input("Lag1 (%)", value=1.2, step=0.1)
+        with col2:
+            lag2 = st.number_input("Lag2 (%)", value=-0.8, step=0.1)
+        with col3:
+            volume = st.number_input("Volume (billions)", value=1.1, step=0.1)
+
+        # Make prediction
+        X_example = pd.DataFrame({
+            'Lag1': [lag1],
+            'Lag2': [lag2],
+            'Lag3': [0],
+            'Lag4': [0],
+            'Lag5': [0],
+            'Volume': [volume]
+        })
+
+        # Transform using the same design matrix
+        X_example = design.transform(X_example)
+        prob = results.predict(X_example)[0]
+
+        st.write(f"""
+        **Prediction Results:**
+        - Probability of market going up: {prob:.2%}
+        - Predicted direction: {'Up' if prob > 0.5 else 'Down'}
+        - Confidence level: {abs(prob - 0.5)*2:.2%}
+        """)
+
+    # Practice Exercises
+    st.header("Practice Exercises")
+
+    with st.expander("Exercise 1: Implementing Logistic Regression with Lag1 and Lag2"):
+        st.write("""
+        1. Implement a logistic regression model using only Lag1 and Lag2
+        2. Compare its performance with the full model
+        3. Analyze the coefficients and their significance
+        4. Visualize the results
+        """)
+
+        st.code("""
+# Solution
+model = MS(['Lag1', 'Lag2']).fit(Smarket)
+X = model.transform(Smarket)
+X_train, X_test = X.loc[train], X.loc[~train]
+
+glm_train = sm.GLM(y_train, X_train, family=sm.families.Binomial())
+results = glm_train.fit()
+
+probs = results.predict(exog=X_test)
+labels = np.array(['Down']*len(probs))
+labels[probs>0.5] = 'Up'
+
+# Evaluate performance
+accuracy = np.mean(labels == Smarket.Direction[~train])
+print(f"Test Accuracy: {accuracy:.2%}")
+        """)
+
+    with st.expander("Exercise 2: Making Predictions for New Data"):
+        st.write("""
+        1. Create a function to make predictions for new market conditions
+        2. Test the model with specific Lag1 and Lag2 values
+        3. Interpret the predicted probabilities
+        4. Discuss the model's limitations
+        """)
+
+        st.code("""
+# Solution
+def predict_market_direction(lag1, lag2):
+    newdata = pd.DataFrame({'Lag1': [lag1], 'Lag2': [lag2]})
+    newX = model.transform(newdata)
+    prob = results.predict(newX)[0]
+    return prob
+
+# Example predictions
+prob1 = predict_market_direction(1.2, 1.1)
+prob2 = predict_market_direction(1.5, -0.8)
+
+print(f"Probability of market going up for Lag1=1.2, Lag2=1.1: {prob1:.2%}")
+print(f"Probability of market going up for Lag1=1.5, Lag2=-0.8: {prob2:.2%}")
+        """)
+
+    # Weekly Assignment
+    username = st.session_state.get("username", "Student")
+    st.header(f"{username}'s Weekly Assignment")
+
+    if username == "manxiii":
+        st.markdown("""
+        Hello **manxiii**, here is your Assignment 6: Stock Market Prediction with Logistic Regression.
+        1. Implement a logistic regression model using Lag1 and Lag2
+        2. Compare its performance with the full model
+        3. Analyze the coefficients and their significance
+        4. Create visualizations to support your findings
+        5. Write a brief report on why stock market prediction is challenging
+
+        **Due Date:** End of Week 6
+        """)
+    elif username == "zhu":
+        st.markdown("""
+        Hello **zhu**, here is your Assignment 6: Stock Market Prediction with Logistic Regression.
+        """)
+    elif username == "WK":
+        st.markdown("""
+        Hello **WK**, here is your Assignment 6: Stock Market Prediction with Logistic Regression.
+        """)
+    else:
+        st.markdown(f"""
+        Hello **{username}**, here is your Assignment 6: Stock Market Prediction with Logistic Regression.
+        Please contact the instructor for your specific assignment.
+        """)
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ matplotlib==3.8.3
 seaborn==0.13.2
 plotly==5.18.0
 nltk==3.8.1
-wordcloud==1.9.3
+wordcloud==1.9.3
+ISLP
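One note on this hunk: every other dependency is pinned to an exact version, but ISLP is added unpinned, so a rebuild of the Space may pull a newer release than the one these labs were written against. Pinning it in the same style would keep builds reproducible; the version below is a placeholder to be read off the working environment with `pip show ISLP`:

ISLP==<version reported by pip show ISLP>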