Spaces:
Running
Running
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from sklearn.model_selection import train_test_split | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix | |
from sklearn.preprocessing import StandardScaler | |
import plotly.express as px | |
import plotly.graph_objects as go | |
from plotly.subplots import make_subplots | |
import scipy.stats as stats | |
from pathlib import Path | |
import statsmodels.api as sm | |
from ISLP import load_data | |
from ISLP.models import ModelSpec as MS, summarize | |
# Global look for matplotlib/seaborn figures: reset matplotlib to its
# 'default' style first, then apply the seaborn theme on top of it.
plt.style.use('default')
sns.set_theme(palette="husl", style="whitegrid")
def load_smarket_data():
    """Return the ISLP 'Smarket' dataset, or None when loading fails.

    Any exception raised by ``load_data`` is reported to the page through
    ``st.error`` instead of propagating, so callers must check for None.
    """
    try:
        return load_data('Smarket')
    except Exception as exc:
        # Surface the problem in the UI and fall through to the None result.
        st.error(f"Error loading Smarket data: {exc}")
    return None
def create_confusion_matrix_plot(y_true, y_pred, title="Confusion Matrix"):
    """Build a Plotly heatmap of the confusion matrix for two label vectors.

    Args:
        y_true: Observed class labels.
        y_pred: Predicted class labels.
        title: Text shown centred above the heatmap.

    Returns:
        A ``go.Figure`` whose cells are annotated with the raw counts.
    """
    counts = confusion_matrix(y_true, y_pred)

    # Each cell shows its count as text on top of the colour.
    cell_text = [[str(count) for count in row] for row in counts]
    heatmap = go.Heatmap(
        z=counts,
        x=['Predicted Down', 'Predicted Up'],
        y=['Actual Down', 'Actual Up'],
        colorscale='RdBu',
        text=cell_text,
        texttemplate='%{text}',
        textfont={"size": 16},
    )

    figure = go.Figure(data=heatmap)
    dark_background = 'rgb(30, 30, 30)'
    figure.update_layout(
        title=title,
        title_x=0.5,
        title_font_size=20,
        plot_bgcolor=dark_background,
        paper_bgcolor=dark_background,
        font=dict(color='white'),
    )
    return figure
def create_correlation_heatmap(df):
    """Build a Plotly heatmap of the pairwise correlations of ``df``.

    Only numeric columns take part (``numeric_only=True``); the colour range
    is pinned to [-1, 1] and every cell is labelled with its value rounded
    to two decimals.

    Args:
        df: DataFrame whose numeric columns are correlated.

    Returns:
        A ``go.Figure`` containing the annotated heatmap.
    """
    corr = df.corr(numeric_only=True)
    column_names = corr.columns

    cell_text = [[f'{value:.2f}' for value in row] for row in corr.values]
    heatmap = go.Heatmap(
        z=corr,
        x=column_names,
        y=column_names,
        colorscale='RdBu',
        zmin=-1,
        zmax=1,
        text=cell_text,
        texttemplate='%{text}',
        textfont={"size": 12},
    )

    figure = go.Figure(data=heatmap)
    dark_background = 'rgb(30, 30, 30)'
    figure.update_layout(
        title='S&P 500 Returns Correlation Heatmap',
        title_x=0.5,
        title_font_size=20,
        plot_bgcolor=dark_background,
        paper_bgcolor=dark_background,
        font=dict(color='white'),
    )
    return figure
def create_decision_boundary_plot(X, y, model, resolution=300):
    """Build a Plotly figure of a two-feature classifier's decision regions.

    The classifier is evaluated on a regular grid covering the range of
    ``X['Lag1']`` and ``X['Lag2']`` padded by 1 on each side; the predictions
    are drawn as a translucent contour with the observations on top.

    Args:
        X: DataFrame with ``'Lag1'`` and ``'Lag2'`` columns.
        y: Per-observation values used to colour the scatter markers.
        model: Fitted estimator whose ``predict`` accepts an (n, 2) array of
            (Lag1, Lag2) pairs.
        resolution: Number of grid points per axis. A fixed point count keeps
            the contour at ``resolution ** 2`` cells regardless of the data
            range (the previous fixed 0.01 step produced over a million cells
            for data spanning roughly +/-5).

    Returns:
        A ``go.Figure`` with the contour and scatter traces.
    """
    # At least two points per axis are needed to draw a contour.
    resolution = max(int(resolution), 2)

    x_min, x_max = X['Lag1'].min() - 1, X['Lag1'].max() + 1
    y_min, y_max = X['Lag2'].min() - 1, X['Lag2'].max() + 1

    # Axis vectors are built once and reused for both the mesh and the trace.
    grid_x = np.linspace(x_min, x_max, resolution)
    grid_y = np.linspace(y_min, y_max, resolution)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # Predict a class for every grid cell, then restore the 2-D grid shape.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    fig = go.Figure()

    # Decision regions.
    fig.add_trace(go.Contour(
        x=grid_x,
        y=grid_y,
        z=Z,
        colorscale='RdBu',
        showscale=False,
        opacity=0.5,
    ))

    # Observations, coloured by ``y`` on the same colour scale.
    fig.add_trace(go.Scatter(
        x=X['Lag1'],
        y=X['Lag2'],
        mode='markers',
        marker=dict(
            color=y,
            colorscale='RdBu',
            size=8,
            line=dict(color='black', width=1),
        ),
        name='Data Points',
    ))

    fig.update_layout(
        title='Logistic Regression Decision Boundary',
        xaxis_title='Lag1',
        yaxis_title='Lag2',
        plot_bgcolor='rgb(30, 30, 30)',
        paper_bgcolor='rgb(30, 30, 30)',
        font=dict(color='white'),
        showlegend=False,
    )
    return fig
# Dark-theme layout settings applied to every Plotly figure built by show().
_DARK_LAYOUT = dict(
    plot_bgcolor='rgb(30, 30, 30)',
    paper_bgcolor='rgb(30, 30, 30)',
    font=dict(color='white'),
)

# Legend anchored in the top-left corner of the plotting area.
_TOP_LEFT_LEGEND = dict(yanchor="top", y=0.99, xanchor="left", x=0.01)


def _render_overview():
    """Render the course overview and the learning path."""
    st.header("Course Overview")
    st.write("""
    In this week, we'll use logistic regression to try predicting whether the stock market goes up or down.
    This is intentionally a challenging prediction problem that will teach us important lessons about:
    - When logistic regression works well and when it doesn't
    - How to interpret probabilities and coefficients
    - Why some prediction problems are inherently difficult
    - Proper model evaluation techniques
    """)
    st.subheader("Learning Path")
    st.write("""
    1. Understanding the Stock Market Data: S&P 500 returns and predictors
    2. Logistic Regression Fundamentals: From linear to logistic
    3. Model Training and Evaluation: Proper train-test splitting
    4. Interpreting Results: Coefficients and probabilities
    5. Model Assessment: Confusion matrices and metrics
    6. Real-world Applications: Challenges and limitations
    """)


def _time_series_figure(series, name, title, yaxis_title):
    """Return a dark-themed line chart of ``series`` against its index."""
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=series.index, y=series, mode='lines', name=name))
    fig.update_layout(
        title=title,
        xaxis_title='Time',
        yaxis_title=yaxis_title,
        **_DARK_LAYOUT,
    )
    return fig


def _render_returns_distribution(Smarket):
    """Render overlaid histograms (with mean and +/-1 sigma markers) for user-selected columns."""
    st.write("**Distribution of Returns**")
    selected_columns = st.multiselect(
        "Select columns to display",
        options=['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Today'],
        default=['Lag1', 'Lag2'],
    )
    if not selected_columns:
        return

    fig_returns = go.Figure()
    for col in selected_columns:
        fig_returns.add_trace(go.Histogram(
            x=Smarket[col],
            name=col,
            opacity=0.7,
            nbinsx=50,
        ))

    # One dashed red line at the mean and dotted yellow lines at +/-1 sigma.
    for col in selected_columns:
        mean_val = Smarket[col].mean()
        std_val = Smarket[col].std()
        markers = (
            (mean_val, "dash", "red", "Mean"),
            (mean_val + std_val, "dot", "yellow", "+1σ"),
            (mean_val - std_val, "dot", "yellow", "-1σ"),
        )
        for position, dash, color, label in markers:
            fig_returns.add_vline(
                x=position,
                line_dash=dash,
                line_color=color,
                annotation_text=f"{col} {label}: {position:.2f}%",
                annotation_position="top right",
                annotation=dict(textangle=-45, font=dict(size=10)),
            )

    fig_returns.update_layout(
        title='Distribution of Returns',
        xaxis_title='Return (%)',
        yaxis_title='Frequency',
        barmode='overlay',
        showlegend=True,
        legend=_TOP_LEFT_LEGEND,
        **_DARK_LAYOUT,
    )

    st.write("**Summary Statistics**")
    summary_stats = Smarket[selected_columns].describe()
    st.dataframe(summary_stats.style.format('{:.2f}'))
    st.plotly_chart(fig_returns)
    st.write("""
    **Interpretation:**
    - The dashed red line shows the mean return for each selected period
    - The dotted yellow lines show one standard deviation above and below the mean
    - The overlap of distributions helps identify similarities in return patterns
    - Wider distributions indicate higher volatility
    """)


def _render_eda(Smarket):
    """Render the data preview and every exploratory chart for Module 1."""
    st.write("First few rows of the Smarket data:")
    st.dataframe(Smarket.head())

    st.subheader("Exploratory Data Analysis")

    st.write("**Trading Volume Over Time**")
    st.plotly_chart(_time_series_figure(
        Smarket['Volume'],
        name='Volume',
        title='Trading Volume Over Time',
        yaxis_title='Volume (billions of shares)',
    ))

    _render_returns_distribution(Smarket)

    st.write("**Returns Over Time**")
    st.plotly_chart(_time_series_figure(
        Smarket['Today'],
        name='Today\'s Return',
        title='Daily Returns Over Time',
        yaxis_title='Return (%)',
    ))

    st.write("**Market Direction Distribution**")
    direction_counts = Smarket['Direction'].value_counts()
    fig_direction = go.Figure(data=[go.Pie(
        labels=direction_counts.index,
        values=direction_counts.values,
        hole=.3,
    )])
    fig_direction.update_layout(
        title='Distribution of Market Direction',
        **_DARK_LAYOUT,
    )
    st.plotly_chart(fig_direction)

    st.write("**Correlation Analysis**")
    st.plotly_chart(create_correlation_heatmap(Smarket))
    st.write("""
    Key observations from the exploratory analysis:
    1. **Trading Volume**:
       - Shows an increasing trend over time
       - Higher volatility in recent years
       - Some periods of unusually high volume
    2. **Returns Distribution**:
       - Approximately normal distribution
       - Most returns are close to zero
       - Some extreme values (outliers)
    3. **Market Direction**:
       - Relatively balanced between Up and Down days
       - Slight bias towards Up days
    4. **Correlations**:
       - Low correlation between lagged returns
       - Strong correlation between Year and Volume
       - Today's return shows little correlation with past returns
    """)


def _coefficient_table(results, allvars):
    """Return a DataFrame of coefficients and p-values, skipping the intercept (first entry)."""
    return pd.DataFrame({
        'Feature': allvars,
        'Coefficient': results.params[1:],
        'P-value': results.pvalues[1:],
    })


def _fit_full_model(Smarket):
    """Fit the logistic model on all rows, render its summary, and return (design, allvars, X, y)."""
    allvars = Smarket.columns.drop(['Today', 'Direction', 'Year'])
    design = MS(allvars)
    X = design.fit_transform(Smarket)
    y = Smarket.Direction == 'Up'

    results_full = sm.GLM(y, X, family=sm.families.Binomial()).fit()

    st.write("Model Summary:")
    st.write(summarize(results_full))
    st.write("Model Coefficients:")
    st.write(_coefficient_table(results_full, allvars))
    return design, allvars, X, y


def _evaluate_on_2005(Smarket, X, y):
    """Fit on rows with Year < 2005, score the remaining rows, and return (results_train, accuracy)."""
    train = (Smarket.Year < 2005)
    X_train, X_test = X.loc[train], X.loc[~train]
    y_train = y.loc[train]

    results_train = sm.GLM(y_train, X_train, family=sm.families.Binomial()).fit()

    # Classify as 'Up' when the predicted probability exceeds 0.5.
    probs = results_train.predict(exog=X_test)
    labels = np.where(probs > 0.5, 'Up', 'Down')

    actual = Smarket.Direction[~train]
    st.plotly_chart(create_confusion_matrix_plot(actual, labels))

    accuracy = np.mean(labels == actual)
    st.write(f"Test Accuracy: {accuracy:.2%}")
    return results_train, accuracy


def _render_decision_boundary(Smarket):
    """Fit a two-feature sklearn model on Lag1/Lag2 and render its decision boundary."""
    X_plot = Smarket[['Lag1', 'Lag2']]
    y_plot = (Smarket['Direction'] == 'Up').astype(int)

    log_reg = LogisticRegression()
    log_reg.fit(X_plot, y_plot)

    st.plotly_chart(create_decision_boundary_plot(X_plot, y_plot, log_reg))
    st.write("""
    The decision boundary plot shows:
    - Blue regions indicate where the model predicts the market will go down
    - Red regions indicate where the model predicts the market will go up
    - The boundary between these regions represents where the model is uncertain
    - The scatter points show actual market movements, colored by their true direction
    """)


def _render_linear_vs_logistic():
    """Render the synthetic linear-versus-logistic comparison chart and its explanation."""
    st.write("**Linear vs Logistic Regression**")

    x = np.linspace(-5, 5, 100)
    y_linear = 0.5 * x + 0.5
    y_logistic = 1 / (1 + np.exp(-(2 * x)))

    fig_comparison = go.Figure()
    fig_comparison.add_trace(go.Scatter(
        x=x,
        y=y_linear,
        mode='lines',
        name='Linear Regression',
        line=dict(color='blue', width=2),
    ))
    fig_comparison.add_trace(go.Scatter(
        x=x,
        y=y_logistic,
        mode='lines',
        name='Logistic Regression',
        line=dict(color='red', width=2),
    ))

    # A local generator seeded with 42 yields the same samples as seeding the
    # global NumPy RNG, without altering global random state for other code.
    rng = np.random.RandomState(42)
    x_samples = rng.normal(0, 1, 50)
    y_samples = (x_samples > 0.5).astype(int)
    fig_comparison.add_trace(go.Scatter(
        x=x_samples,
        y=y_samples,
        mode='markers',
        name='Sample Data',
        marker=dict(
            color=['red' if label == 0 else 'green' for label in y_samples],
            size=8,
            symbol='circle',
        ),
    ))

    fig_comparison.update_layout(
        title='Linear vs Logistic Regression',
        xaxis_title='Input Feature (X)',
        yaxis_title='Output',
        showlegend=True,
        legend=_TOP_LEFT_LEGEND,
        yaxis=dict(range=[-0.1, 1.1]),
        **_DARK_LAYOUT,
    )

    annotation_font = dict(color='white', size=10)
    fig_comparison.add_annotation(
        x=2, y=0.8,
        text="Linear Regression<br>predicts continuous values",
        showarrow=True,
        arrowhead=1,
        ax=50, ay=-30,
        font=annotation_font,
    )
    fig_comparison.add_annotation(
        x=2, y=0.3,
        text="Logistic Regression<br>predicts probabilities<br>(S-shaped curve)",
        showarrow=True,
        arrowhead=1,
        ax=50, ay=30,
        font=annotation_font,
    )
    fig_comparison.add_annotation(
        x=0, y=0.5,
        text="Decision Boundary<br>(p = 0.5)",
        showarrow=True,
        arrowhead=1,
        ax=0, ay=-40,
        font=annotation_font,
    )
    st.plotly_chart(fig_comparison)

    st.write("""
    **Key Differences:**
    1. **Output Range**:
       - Linear Regression: Can predict any value (-∞ to +∞)
       - Logistic Regression: Predicts probabilities (0 to 1)
    2. **Function Shape**:
       - Linear Regression: Straight line
       - Logistic Regression: S-shaped curve (sigmoid)
       - The sigmoid function creates a sharp transition around the decision boundary
    3. **Use Case**:
       - Linear Regression: Predicting continuous values
       - Logistic Regression: Predicting binary outcomes (Up/Down)
    4. **Interpretation**:
       - Linear Regression: Direct relationship between X and Y
       - Logistic Regression: Non-linear relationship between X and probability of Y
       - Small changes in X can lead to large changes in probability near the decision boundary
    """)


def _render_coefficient_interpretation(results, allvars):
    """Render the fitted model's coefficient table with interpretation notes."""
    st.subheader("Example: Interpreting Our Model's Coefficients")
    coef_results = _coefficient_table(results, allvars)
    st.write("Coefficient Analysis:")
    st.dataframe(coef_results.style.format({
        'Coefficient': '{:.4f}',
        'P-value': '{:.4f}',
    }))
    st.write("""
    Let's interpret some examples from our model:
    1. **Lag1 Coefficient**:
       - A positive coefficient means that higher values of Lag1 are associated with higher probability of the market going up
       - The magnitude tells us how strong this relationship is
    2. **Volume Coefficient**:
       - A positive coefficient suggests that higher trading volume is associated with higher probability of upward market movement
       - The size of the coefficient indicates the strength of this relationship
    """)


def _render_performance_notes(accuracy):
    """Render the performance discussion, quoting the computed test accuracy when available."""
    # The accuracy bullets are derived from the measured value so the prose
    # cannot contradict the "Test Accuracy" figure shown in Module 3.
    if accuracy is None:
        accuracy_line = "The test-set accuracy is unavailable because the Smarket data could not be loaded"
        baseline_line = "Compare it against random guessing (50%) once the data is available"
    else:
        accuracy_line = f"In our case, around {accuracy:.0%} accuracy on the test set"
        if accuracy > 0.5:
            baseline_line = "This is better than random guessing (50%)"
        else:
            baseline_line = "This is no better than random guessing (50%)"

    st.subheader("Understanding Model Performance")
    st.write(f"""
    Our model's performance metrics tell us important information:
    1. **Accuracy**:
       - The proportion of correct predictions
       - {accuracy_line}
       - {baseline_line}
    2. **Confusion Matrix**:
       The confusion matrix is a 2x2 table that shows:
       - **True Positives (TP)**:
         - Correctly predicted market going up
         - These are the cases where we predicted 'Up' and the market actually went up
       - **False Positives (FP)**:
         - Incorrectly predicted market going up
         - These are the cases where we predicted 'Up' but the market actually went down
         - Also known as Type I errors
       - **True Negatives (TN)**:
         - Correctly predicted market going down
         - These are the cases where we predicted 'Down' and the market actually went down
       - **False Negatives (FN)**:
         - Incorrectly predicted market going down
         - These are the cases where we predicted 'Down' but the market actually went up
         - Also known as Type II errors
       From these values, we can calculate important metrics:
       - **Precision** = TP / (TP + FP): How many of our 'Up' predictions were correct
       - **Recall** = TP / (TP + FN): How many of the actual 'Up' days did we catch
       - **F1 Score** = 2 * (Precision * Recall) / (Precision + Recall): Balanced measure of precision and recall
       - **Accuracy** = (TP + TN) / (TP + TN + FP + FN): Overall correct predictions
    3. **P-values**:
       - Indicate statistical significance of each predictor
       - P-value < 0.05 suggests the predictor is significant
       - In our case, most predictors are not statistically significant
    """)


def _render_practical_notes():
    """Render the practical-implications text and the worked prediction walkthrough."""
    st.subheader("Practical Implications")
    st.write("""
    What does this mean for real-world trading?
    1. **Model Limitations**:
       - The model's accuracy is close to random guessing
       - This suggests that predicting market direction is inherently difficult
       - Past returns alone are not reliable predictors
    2. **Risk Management**:
       - Even with a model, trading decisions should include:
         - Stop-loss orders
         - Position sizing
         - Diversification
         - Risk tolerance considerations
    3. **Model Improvement**:
       - Consider adding more features:
         - Technical indicators
         - Market sentiment
         - Economic indicators
       - Use more sophisticated models:
         - Ensemble methods
         - Deep learning
         - Time series models
    """)
    st.subheader("Example: Making a Prediction")
    st.write("""
    Let's walk through an example of making a prediction:
    1. **Input Data**:
       - Lag1 = 1.2% (yesterday's return)
       - Lag2 = -0.8% (day before yesterday's return)
       - Volume = 1.1 billion shares
    2. **Calculate Probability**:
       - Use the logistic function: P(Y=1) = 1 / (1 + e^(-z))
       - where z = β₀ + β₁(Lag1) + β₂(Lag2) + ... + β₆(Volume)
    3. **Interpret Result**:
       - If P(Y=1) > 0.5, predict market will go up
       - If P(Y=1) < 0.5, predict market will go down
       - The probability itself tells us about confidence
    """)


def _render_interactive_prediction(design, results):
    """Render inputs for Lag1/Lag2/Volume and show the model's predicted probability."""
    st.write("**Interactive Example:**")
    col1, col2, col3 = st.columns(3)
    with col1:
        lag1 = st.number_input("Lag1 (%)", value=1.2, step=0.1)
    with col2:
        lag2 = st.number_input("Lag2 (%)", value=-0.8, step=0.1)
    with col3:
        volume = st.number_input("Volume (billions)", value=1.1, step=0.1)

    # Lag3-Lag5 are held at zero; only the three inputs above vary.
    X_example = pd.DataFrame({
        'Lag1': [lag1],
        'Lag2': [lag2],
        'Lag3': [0],
        'Lag4': [0],
        'Lag5': [0],
        'Volume': [volume],
    })
    X_example = design.transform(X_example)

    # Take the single prediction by position rather than by index label.
    prob = np.asarray(results.predict(X_example))[0]
    st.write(f"""
    **Prediction Results:**
    - Probability of market going up: {prob:.2%}
    - Predicted direction: {'Up' if prob > 0.5 else 'Down'}
    - Confidence level: {abs(prob - 0.5)*2:.2%}
    """)


def _render_exercises():
    """Render the two practice exercises with their reference solutions."""
    st.header("Practice Exercises")
    with st.expander("Exercise 1: Implementing Logistic Regression with Lag1 and Lag2"):
        st.write("""
        1. Implement a logistic regression model using only Lag1 and Lag2
        2. Compare its performance with the full model
        3. Analyze the coefficients and their significance
        4. Visualize the results
        """)
        st.code("""
        # Solution
        model = MS(['Lag1', 'Lag2']).fit(Smarket)
        X = model.transform(Smarket)
        X_train, X_test = X.loc[train], X.loc[~train]
        glm_train = sm.GLM(y_train, X_train, family=sm.families.Binomial())
        results = glm_train.fit()
        probs = results.predict(exog=X_test)
        labels = np.array(['Down']*len(probs))
        labels[probs>0.5] = 'Up'
        # Evaluate performance
        accuracy = np.mean(labels == Smarket.Direction[~train])
        print(f"Test Accuracy: {accuracy:.2%}")
        """)
    with st.expander("Exercise 2: Making Predictions for New Data"):
        st.write("""
        1. Create a function to make predictions for new market conditions
        2. Test the model with specific Lag1 and Lag2 values
        3. Interpret the predicted probabilities
        4. Discuss the model's limitations
        """)
        st.code("""
        # Solution
        def predict_market_direction(lag1, lag2):
            newdata = pd.DataFrame({'Lag1': [lag1], 'Lag2': [lag2]})
            newX = model.transform(newdata)
            prob = results.predict(newX)[0]
            return prob
        # Example predictions
        prob1 = predict_market_direction(1.2, 1.1)
        prob2 = predict_market_direction(1.5, -0.8)
        print(f"Probability of market going up for Lag1=1.2, Lag2=1.1: {prob1:.2%}")
        print(f"Probability of market going up for Lag1=1.5, Lag2=-0.8: {prob2:.2%}")
        """)


def _render_assignment():
    """Render the weekly assignment for the user named in ``st.session_state['username']``."""
    username = st.session_state.get("username", "Student")
    st.header(f"{username}'s Weekly Assignment")
    if username == "manxiii":
        st.markdown("""
        Hello **manxiii**, here is your Assignment 6: Stock Market Prediction with Logistic Regression.
        1. Implement a logistic regression model using Lag1 and Lag2
        2. Compare its performance with the full model
        3. Analyze the coefficients and their significance
        4. Create visualizations to support your findings
        5. Write a brief report on why stock market prediction is challenging
        **Due Date:** End of Week 6
        """)
    elif username in ("zhu", "WK"):
        # These users currently receive only the greeting line.
        st.markdown(f"""
        Hello **{username}**, here is your Assignment 6: Stock Market Prediction with Logistic Regression.
        """)
    else:
        st.markdown(f"""
        Hello **{username}**, here is your Assignment 6: Stock Market Prediction with Logistic Regression.
        Please contact the instructor for your specific assignment.
        """)


def show():
    """Render the Week 6 page: logistic regression on the Smarket data.

    The page is drawn top to bottom: overview, exploratory analysis, a
    full-data model fit, a 2001-2004 / 2005 train-test evaluation, a decision
    boundary plot, interpretation notes, an interactive prediction, practice
    exercises, and the per-user assignment. All explanatory text is rendered
    even when the dataset cannot be loaded; sections that need the data are
    skipped in that case.

    The coefficient interpretation and the interactive prediction use the
    model fitted on the training years (Year < 2005), not the full-data fit
    whose summary is shown in Module 2.
    """
    st.title("Week 6: Logistic Regression and Stock Market Prediction")
    _render_overview()

    # Module 1: Understanding the Data
    st.header("Module 1: Understanding the Stock Market Data")
    st.write("""
    We'll examine the Smarket data, which consists of percentage returns for the S&P 500 stock index over 1,250 days,
    from the beginning of 2001 until the end of 2005. For each date, we have:
    - Percentage returns for each of the five previous trading days (Lag1 through Lag5)
    - Volume (number of shares traded on the previous day, in billions)
    - Today (percentage return on the date in question)
    - Direction (whether the market was Up or Down on this date)
    """)
    Smarket = load_smarket_data()
    has_data = Smarket is not None
    if has_data:
        _render_eda(Smarket)

    # Module 2: Logistic Regression Implementation
    st.header("Module 2: Logistic Regression Implementation")
    st.write("""
    We'll fit a logistic regression model to predict Direction using Lag1 through Lag5 and Volume.
    The model will help us understand if we can predict market movements based on recent trading patterns.
    """)
    if has_data:
        design, allvars, X, y = _fit_full_model(Smarket)

    # Module 3: Model Evaluation
    st.header("Module 3: Model Evaluation")
    st.write("""
    We'll evaluate our model using proper train-test splitting, focusing on predicting 2005 data using models trained on 2001-2004 data.
    This gives us a more realistic assessment of model performance.
    """)
    results_train, accuracy = None, None
    if has_data:
        results_train, accuracy = _evaluate_on_2005(Smarket, X, y)

    # Module 4: Decision Boundary Visualization
    st.header("Module 4: Decision Boundary Visualization")
    st.write("""
    Let's visualize how our logistic regression model separates the market movements using Lag1 and Lag2 as predictors.
    The decision boundary shows how the model classifies different combinations of previous day returns.
    """)
    if has_data:
        _render_decision_boundary(Smarket)

    # Module 5: Interpreting Logistic Regression Results
    st.header("Module 5: Interpreting Logistic Regression Results")
    st.subheader("Understanding the Coefficients")
    st.write("""
    In logistic regression, coefficients tell us about the relationship between predictors and the probability of the outcome.
    Let's break down how to interpret them:
    1. **Coefficient Sign**:
       - Positive coefficients increase the probability of the outcome (market going up)
       - Negative coefficients decrease the probability of the outcome (market going down)
    2. **Coefficient Magnitude**:
       - Larger absolute values indicate stronger effects
       - The effect is non-linear due to the logistic function
    """)
    _render_linear_vs_logistic()
    if has_data:
        _render_coefficient_interpretation(results_train, allvars)
    _render_performance_notes(accuracy)
    _render_practical_notes()
    if has_data:
        _render_interactive_prediction(design, results_train)

    _render_exercises()
    _render_assignment()