import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from statsmodels.datasets import get_rdataset
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
# Set up the style for all plots
plt.style.use('default')
sns.set_theme(style="whitegrid", palette="husl")
def load_arrests_data():
"""Load and return the US Arrests dataset"""
USArrests = get_rdataset('USArrests').data
return USArrests
def create_categorical_plot(df, column, target='Survived'):
"""Create an interactive plot for categorical variables"""
fig = px.bar(
df.groupby(column)[target].mean().reset_index(),
x=column,
y=target,
title=f'Survival Rate by {column}',
labels={target: 'Survival Rate', column: column},
color=target,
color_continuous_scale='RdBu'
)
fig.update_layout(
plot_bgcolor='rgb(30, 30, 30)',
paper_bgcolor='rgb(30, 30, 30)',
font=dict(color='white')
)
return fig
def create_numeric_plot(df, column, target='Survived'):
"""Create an interactive plot for numeric variables"""
fig = px.box(
df,
x=target,
y=column,
title=f'{column} Distribution by Survival',
labels={target: 'Survived', column: column},
color=target,
color_discrete_sequence=px.colors.qualitative.Set1
)
fig.update_layout(
plot_bgcolor='rgb(30, 30, 30)',
paper_bgcolor='rgb(30, 30, 30)',
font=dict(color='white')
)
return fig
def show():
st.title("Week 7: Clustering Lab - State Crime Pattern Analysis")
# Code Example: Loading and Basic Data Exploration
with st.expander("Code Example: Loading and Exploring Data"):
st.code("""
# Load the data
from statsmodels.datasets import get_rdataset
USArrests = get_rdataset('USArrests').data
# Basic data exploration
print("Dataset shape:", USArrests.shape)
print("\\nVariables:", USArrests.columns.tolist())
print("\\nFirst 5 states:")
print(USArrests.head())
# Basic statistics
print("\\nData Summary:")
print(USArrests.describe())
""", language="python")
# Introduction Section with Learning Objectives
st.header("Learning Objectives")
st.markdown("""
This week, you'll master:
1. **Unsupervised Learning**: Discover hidden patterns in crime data without predefined categories
2. **K-Means Clustering**: Learn to divide states into distinct safety profiles
3. **Hierarchical Clustering**: Create a "family tree" of state crime patterns
4. **Data Preprocessing**: Understand why scaling is crucial for fair comparisons
""")
# Interactive Overview
st.header("Lab Overview")
st.write("""
Welcome to your hands-on clustering lab! You'll be working as a policy analyst for the Department of Justice,
analyzing crime patterns across US states. Your mission: discover hidden safety profiles that could inform
federal resource allocation and crime prevention strategies.
""")
# Load Data
st.header("Exercise 1: Data Detective Work")
st.write("Let's start by understanding our dataset - the US Arrests data.")
df = load_arrests_data()
# Code Example: Data Visualization
with st.expander("Code Example: Creating Visualizations"):
st.code("""
# Create correlation heatmap
import plotly.express as px
fig = px.imshow(df.corr(),
labels=dict(color="Correlation"),
color_continuous_scale="RdBu")
fig.show()
# Create box plots
fig = px.box(df, title="Data Distribution")
fig.show()
""", language="python")
# Interactive Data Exploration
col1, col2 = st.columns(2)
with col1:
st.subheader("Dataset Overview")
st.write(f"Number of states: {len(df)}")
st.write(f"Number of variables: {len(df.columns)}")
st.write("\nVariables:", df.columns.tolist())
# Interactive data summary
st.subheader("Data Summary")
summary = df.describe()
st.dataframe(summary)
with col2:
st.subheader("First 5 States")
st.dataframe(df.head())
# Interactive correlation heatmap
st.subheader("Correlation Heatmap")
fig = px.imshow(df.corr(),
labels=dict(color="Correlation"),
color_continuous_scale="RdBu")
st.plotly_chart(fig)
# Exercise 2: Scaling Challenge
st.header("Exercise 2: The Scaling Challenge")
# Code Example: Data Scaling
with st.expander("Code Example: Scaling Data"):
st.code("""
# Import StandardScaler
from sklearn.preprocessing import StandardScaler
# Create and fit the scaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
# Convert back to DataFrame
df_scaled = pd.DataFrame(df_scaled,
columns=df.columns,
index=df.index)
# Compare original vs scaled data
print("Original data ranges:")
print(df.describe())
print("\\nScaled data ranges:")
print(df_scaled.describe())
""", language="python")
# Explanation of scaling
st.markdown("""
### Why Do We Need Scaling?
In our crime data, we have variables measured in very different scales:
- Murder rates: typically 0-20 per 100,000
- Assault rates: typically 50-350 per 100,000
- Urban population: 0-100 percentage
- Rape rates: typically 0-50 per 100,000
Without scaling, variables with larger numbers (like Assault) would dominate our analysis,
making smaller-scale variables (like Murder) less influential. This would be like comparing
dollars to cents - the cents would seem insignificant even if they were important!
""")
# Show original data ranges
st.subheader("Original Data Ranges")
col1, col2 = st.columns(2)
with col1:
# Create a bar chart of variances
fig_var = px.bar(
x=df.columns,
y=df.var(),
title="Variance of Each Variable (Before Scaling)",
labels={'x': 'Crime Variables', 'y': 'Variance'},
color=df.var(),
color_continuous_scale='Viridis'
)
st.plotly_chart(fig_var)
st.write("""
Notice how Assault has a much larger variance (6,945) compared to Murder (19).
This means Assault would dominate our clustering if we didn't scale the data!
""")
with col2:
# Create box plots of original data
fig_box = px.box(df, title="Original Data Distribution")
fig_box.update_layout(
xaxis_title="Crime Variables",
yaxis_title="Rate per 100,000"
)
st.plotly_chart(fig_box)
# Explain standardization
st.markdown("""
### What is Standardization?
Standardization (also called Z-score normalization) transforms our data so that:
1. Each variable has a mean of 0
2. Each variable has a standard deviation of 1
The formula is: z = (x - μ) / σ
- x is the original value
- μ is the mean of the variable
- σ is the standard deviation of the variable
""")
# Scale the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
df_scaled = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)
# Show scaled data
st.subheader("After Scaling")
# Create box plots of scaled data
fig_scaled = px.box(df_scaled, title="Scaled Data Distribution")
fig_scaled.update_layout(
xaxis_title="Crime Variables",
yaxis_title="Standardized Values"
)
st.plotly_chart(fig_scaled)
st.write("""
After scaling, all variables are on the same scale:
- Mean = 0
- Standard Deviation = 1
- Values typically range from -3 to +3
""")
# Show before/after comparison for a few states
st.write("### Before vs After Scaling (Sample States)")
comparison_df = pd.DataFrame({
'State': df.index[:5],
'Original Murder': df['Murder'][:5],
'Scaled Murder': df_scaled['Murder'][:5],
'Original Assault': df['Assault'][:5],
'Scaled Assault': df_scaled['Assault'][:5]
})
st.dataframe(comparison_df)
st.write("""
Notice how the relative differences between states are preserved,
but now all variables contribute equally to our analysis!
""")
# Why scaling matters for clustering
st.markdown("""
### Why Scaling Matters for Clustering
In clustering, we measure distances between data points. Without scaling:
- States might be grouped together just because they have similar assault rates
- Important differences in murder rates might be ignored
With scaling:
- All variables contribute equally to the distance calculations
- We can find true patterns in the data, not just patterns in the largest numbers
""")
# Exercise 3: Finding Optimal Clusters
st.header("Exercise 3: Finding the Right Number of Groups")
# Code Example: Elbow Method
with st.expander("Code Example: Finding Optimal K"):
st.code("""
# Calculate inertias for different K values
inertias = []
K_values = range(1, 11)
for k in K_values:
kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
kmeans.fit(df_scaled)
inertias.append(kmeans.inertia_)
# Create elbow plot
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(
x=list(K_values),
y=inertias,
mode='lines+markers',
name='Inertia'
))
fig.update_layout(
title='Finding the Optimal Number of Clusters',
xaxis_title='Number of Clusters (K)',
yaxis_title='Within-Cluster Sum of Squares'
)
fig.show()
""", language="python")
st.markdown("""
### The Elbow Method Explained
The elbow method helps us find the optimal number of clusters (K) by looking at how the "within-cluster sum of squares"
(WCSS) changes as we increase the number of clusters. Think of it like this:
- **What is WCSS?** It's a measure of how spread out the points are within each cluster
- **Lower WCSS** means points are closer to their cluster center (better clustering)
- **Higher WCSS** means points are more spread out from their cluster center
As we increase K:
1. WCSS always decreases (more clusters = tighter groups)
2. The rate of decrease slows down
3. We look for the "elbow" - where adding more clusters doesn't help much anymore
""")
# Calculate inertias for different K values
inertias = []
K_values = range(1, 11)
for k in K_values:
kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
kmeans.fit(df_scaled)
inertias.append(kmeans.inertia_)
# Create interactive elbow plot
fig_elbow = go.Figure()
fig_elbow.add_trace(go.Scatter(
x=list(K_values),
y=inertias,
mode='lines+markers',
name='Inertia'
))
fig_elbow.update_layout(
title='Finding the Optimal Number of State Crime Profiles',
xaxis_title='Number of Clusters (K)',
yaxis_title='Within-Cluster Sum of Squares',
plot_bgcolor='rgb(30, 30, 30)',
paper_bgcolor='rgb(30, 30, 30)',
font=dict(color='white')
)
st.plotly_chart(fig_elbow)
# Interpretation guide
st.markdown("""
### How to Interpret the Elbow Plot
Look at the plot above and ask yourself:
1. **Where is the "elbow"?**
- The point where the line starts to level off
- Adding more clusters doesn't give much improvement
- In our case, it's around K=4
2. **What do the numbers mean?**
- K=1: All states in one group (not useful)
- K=2: Basic high/low crime split
- K=3: More nuanced grouping
- K=4: Our "elbow" - good balance of detail and simplicity
- K>4: Diminishing returns - more complexity without much benefit
3. **Why not just use more clusters?**
- More clusters = more complex to interpret
- Small clusters might not be meaningful
- Goal is to find the simplest model that captures the main patterns
""")
# Show the actual values
st.write("### WCSS Values for Each K")
wcss_df = pd.DataFrame({
'Number of Clusters (K)': K_values,
'Within-Cluster Sum of Squares': inertias,
'Improvement from Previous K': [0] + [inertias[i-1] - inertias[i] for i in range(1, len(inertias))]
})
st.dataframe(wcss_df)
st.markdown("""
### Making the Decision
Based on our elbow plot and the numbers above:
1. The biggest improvements happen from K=1 to K=4
2. After K=4, the improvements get much smaller
3. K=4 gives us a good balance of:
- Capturing meaningful patterns
- Keeping the model simple enough to interpret
- Having enough states in each cluster to be meaningful
This is why we'll use K=4 for our clustering analysis!
""")
# Exercise 4: K-Means Clustering
st.header("Exercise 4: K-Means State Profiling")
# Code Example: K-Means Clustering
with st.expander("Code Example: K-Means Implementation"):
st.code("""
# Perform K-means clustering
from sklearn.cluster import KMeans
# Create and fit the model
kmeans = KMeans(
n_clusters=4, # Number of clusters
random_state=42, # For reproducibility
n_init=20 # Number of times to run with different centroids
)
cluster_labels = kmeans.fit_predict(df_scaled)
# Add cluster labels to original data
df_clustered = df.copy()
df_clustered['Cluster'] = cluster_labels
# Visualize the clusters
import plotly.express as px
fig = px.scatter(df_clustered,
x='Murder',
y='Assault',
color='Cluster',
hover_data=['UrbanPop', 'Rape'],
title='State Crime Profiles')
fig.show()
# Show cluster centers
centers_df = pd.DataFrame(
kmeans.cluster_centers_,
columns=df.columns
)
print("Cluster Centers:")
print(centers_df)
""", language="python")
st.markdown("""
### What is K-Means Clustering?
K-means is an unsupervised learning algorithm that groups similar data points together. Think of it like organizing
students into study groups based on their interests:
1. **Initialization**:
- We randomly place K "centers" (centroids) in our data space
- Each center represents the "average" of its cluster
- In our case, each center represents a typical crime profile
2. **Assignment**:
- Each state is assigned to its nearest center
- "Nearest" is measured by Euclidean distance
- States with similar crime patterns end up in the same cluster
3. **Update**:
- Centers move to the average position of their assigned states
- This process repeats until centers stop moving
- The algorithm converges when states are optimally grouped
""")
# Visualize the process
st.subheader("K-Means in Action")
st.write("""
Let's see how K-means works with our state crime data. We'll use K=4 clusters to find distinct crime profiles.
""")
# Let user choose number of clusters
k = st.slider("Choose number of clusters (K)", 2, 6, 4)
# Perform K-means clustering
kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
cluster_labels = kmeans.fit_predict(df_scaled)
# Add cluster labels to original data
df_clustered = df.copy()
df_clustered['Cluster'] = cluster_labels
# Create interactive scatter plot
fig = px.scatter(df_clustered,
x='Murder',
y='Assault',
color='Cluster',
hover_data=['UrbanPop', 'Rape'],
title='State Crime Profiles')
st.plotly_chart(fig)
# Explain hyperparameters
st.markdown("""
### K-Means Hyperparameters Explained
1. **n_clusters (K)**
- The number of groups we want to create
- We chose K=4 based on the elbow method
- Each cluster represents a distinct crime profile
2. **random_state**
- Controls the random initialization of centroids
- Setting it to 42 ensures reproducible results
- Different values might give slightly different clusters
3. **n_init**
- Number of times to run the algorithm with different initial centroids
- We use 20 to find the best possible clustering
- Higher values give more reliable results but take longer
4. **max_iter**
- Maximum number of iterations for each run
- Default is 300, which is usually enough
- Algorithm stops earlier if it converges
5. **algorithm**
- 'lloyd': The standard K-means iteration (the default)
- 'elkan': Uses the triangle inequality; can be faster for well-separated clusters
- ('auto' and 'full' are deprecated aliases for these in recent scikit-learn releases)
""")
# Show cluster centers
st.subheader("Cluster Centers (Typical Crime Profiles)")
centers_df = pd.DataFrame(
kmeans.cluster_centers_,
columns=df.columns
)
st.dataframe(centers_df)
st.write("""
Each row is a cluster centroid in the scaled feature space - the "average" standardized crime profile for that cluster (the sketch below shows how to convert these back to original units). For example:
- High values in Murder and Assault indicate a high-crime cluster
- High UrbanPop with low crime rates might indicate urban safety
- Low values across all metrics might indicate rural safety
""")
# Display cluster analysis
st.subheader("State Crime Profiles Analysis")
for cluster_num in range(k):
cluster_states = df_clustered[df_clustered['Cluster'] == cluster_num]
st.write(f"\n**CLUSTER {cluster_num}: {len(cluster_states)} states**")
st.write("States:", ", ".join(cluster_states.index.tolist()))
st.write("Average characteristics:")
avg_profile = cluster_states[['Murder', 'Assault', 'UrbanPop', 'Rape']].mean()
st.write(avg_profile)
# Explain the results
st.markdown("""
### Interpreting the Results
Each cluster represents a distinct crime profile:
1. **Cluster Characteristics**
- Look at the average values for each crime type
- Compare urban population percentages
- Identify the defining features of each cluster
2. **State Groupings**
- States in the same cluster have similar crime patterns
- Geographic proximity doesn't always mean similar profiles
- Some states might surprise you with their cluster membership
3. **Policy Implications**
- Clusters help identify states with similar challenges
- Can guide resource allocation and policy development
- Enables targeted interventions based on crime profiles
""")
# Exercise 5: Hierarchical Clustering
st.header("Exercise 5: Hierarchical Clustering Exploration")
# Code Example: Hierarchical Clustering
with st.expander("Code Example: Hierarchical Clustering"):
st.code("""
# Create hierarchical clustering
from scipy.cluster.hierarchy import linkage, dendrogram
# Create linkage matrix
linkage_matrix = linkage(df_scaled, method='complete')
# Plot dendrogram
import plotly.graph_objects as go
dendro = dendrogram(linkage_matrix, labels=df.index.tolist(), no_plot=True)
fig = go.Figure()
# Each entry of icoord/dcoord describes one link of the tree,
# so draw a separate line trace per link
for xs, ys in zip(dendro['icoord'], dendro['dcoord']):
    fig.add_trace(go.Scatter(
        x=xs,
        y=ys,
        mode='lines',
        line=dict(color='white'),
        showlegend=False
    ))
# Leaf order (state names) is available in dendro['ivl']
fig.update_layout(
title='State Crime Pattern Family Tree',
xaxis_title='States',
yaxis_title='Distance Between Groups',
plot_bgcolor='rgb(30, 30, 30)',
paper_bgcolor='rgb(30, 30, 30)',
font=dict(color='white')
)
fig.show()
# Cut the tree to get clusters
from scipy.cluster.hierarchy import fcluster
hierarchical_labels = fcluster(linkage_matrix, 4, criterion='maxclust') - 1  # 4 clusters, labels starting at 0
""", language="python")
st.markdown("""
### What is Hierarchical Clustering?
Hierarchical clustering creates a tree-like structure (dendrogram) that shows how data points are related at different levels.
Think of it like building a family tree for states based on their crime patterns:
1. **Bottom-Up Approach (Agglomerative)**:
- Start with each state as its own cluster
- Find the two closest states and merge them
- Continue merging until all states are in one cluster
- Creates a complete hierarchy of relationships
2. **Distance Measurement**:
- Complete Linkage: Uses the maximum distance between states
- Average Linkage: Uses the average distance between states
- Single Linkage: Uses the minimum distance between states
- We use complete linkage for more distinct clusters
""")
# Create hierarchical clustering
linkage_matrix = linkage(df_scaled, method='complete')
# Create interactive dendrogram
fig_dendro = go.Figure()
dendro = dendrogram(linkage_matrix, labels=df.index.tolist(), no_plot=True)
# Each entry of icoord/dcoord describes one link of the tree,
# so draw a separate line trace per link
for xs, ys in zip(dendro['icoord'], dendro['dcoord']):
    fig_dendro.add_trace(go.Scatter(
        x=xs,
        y=ys,
        mode='lines',
        line=dict(color='white'),
        showlegend=False
    ))
fig_dendro.update_layout(
title='State Crime Pattern Family Tree',
xaxis_title='States',
yaxis_title='Distance Between Groups',
plot_bgcolor='rgb(30, 30, 30)',
paper_bgcolor='rgb(30, 30, 30)',
font=dict(color='white')
)
st.plotly_chart(fig_dendro)
# Explain how to read the dendrogram
st.markdown("""
### How to Read the Dendrogram
1. **Height of Connections**:
- Higher connections = more different groups
- Lower connections = more similar groups
- The height shows how different two groups are
2. **Cutting the Tree**:
- Draw a horizontal line to create clusters
- Where you cut determines the number of clusters
- We'll cut at a height that gives the same number of clusters (k) you chose for K-means above
""")
# Cut the tree to get clusters
hierarchical_labels = fcluster(linkage_matrix, k, criterion='maxclust') - 1
# Compare K-means and Hierarchical Clustering
st.header("Comparing K-Means and Hierarchical Clustering")
# Create side-by-side comparison
col1, col2 = st.columns(2)
with col1:
st.subheader("K-Means Clustering")
fig_kmeans = px.scatter(df_clustered,
x='Murder',
y='Assault',
color='Cluster',
title=f'K-Means Clustering (K={k})',
hover_data=['UrbanPop', 'Rape'])
st.plotly_chart(fig_kmeans)
st.markdown("""
**K-Means Characteristics**:
- Requires specifying number of clusters upfront
- Creates clusters of similar size
- Works well with spherical clusters
- Faster for large datasets
- Can be sensitive to outliers
""")
with col2:
st.subheader("Hierarchical Clustering")
df_hierarchical = df.copy()
df_hierarchical['Cluster'] = hierarchical_labels
fig_hierarchical = px.scatter(df_hierarchical,
x='Murder',
y='Assault',
color='Cluster',
title=f'Hierarchical Clustering ({k} clusters)',
hover_data=['UrbanPop', 'Rape'])
st.plotly_chart(fig_hierarchical)
st.markdown("""
**Hierarchical Clustering Characteristics**:
- Creates a complete hierarchy of clusters
- Can handle non-spherical clusters
- More flexible in cluster shapes
- Slower for large datasets
- Less sensitive to outliers
""")
# Show agreement between methods
st.subheader("Comparing the Results")
# Create comparison dataframe
comparison_df = pd.DataFrame({
'State': df.index,
'K-Means Cluster': cluster_labels,
'Hierarchical Cluster': hierarchical_labels
})
# Count agreements
agreements = sum(comparison_df['K-Means Cluster'] == comparison_df['Hierarchical Cluster'])
agreement_percentage = (agreements / len(comparison_df)) * 100
st.write(f"Methods agreed on {agreements} out of {len(comparison_df)} states ({agreement_percentage:.1f}%)")
# Show states where methods disagree
disagreements = comparison_df[comparison_df['K-Means Cluster'] != comparison_df['Hierarchical Cluster']]
if not disagreements.empty:
st.write("States where the methods disagreed:")
st.dataframe(disagreements)
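# Note: each algorithm numbers its clusters arbitrarily, so the direct label comparison
# above can understate the real overlap. A label-invariant score such as the adjusted
# Rand index is a fairer check (optional sketch; adjusted_rand_score is not imported above).
with st.expander("Code Example: Adjusted Rand Index (optional sketch)"):
st.code("""
from sklearn.metrics import adjusted_rand_score

# 1.0 = identical groupings; values near 0 = no better than chance
ari = adjusted_rand_score(cluster_labels, hierarchical_labels)
print(f"Adjusted Rand index between K-means and hierarchical: {ari:.2f}")
""", language="python")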
st.markdown("""
### When to Use Each Method
1. **Use K-Means when**:
- You know the number of clusters
- Your data has spherical clusters
- You need fast computation
- You want clusters of similar size
2. **Use Hierarchical Clustering when**:
- You don't know the number of clusters
- You want to explore the hierarchy
- Your clusters might be non-spherical
- You need to handle outliers carefully
In our case, both methods found similar patterns, suggesting our clusters are robust!
""")
# Exercise 6: Policy Brief
st.header("Exercise 6: Policy Brief Creation")
# Code Example: Creating Final Visualizations
with st.expander("Code Example: Creating Policy Brief Visualizations"):
st.code("""
# Create a comprehensive visualization
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Create subplots
fig = make_subplots(rows=2, cols=2)
# Plot 1: Murder vs Assault by cluster
for i in range(k):
cluster_data = df_clustered[df_clustered['Cluster'] == i]
fig.add_trace(
go.Scatter(
x=cluster_data['Murder'],
y=cluster_data['Assault'],
mode='markers',
name=f'Cluster {i}'
),
row=1, col=1
)
# Plot 2: Urban Population vs Rape by cluster
for i in range(k):
cluster_data = df_clustered[df_clustered['Cluster'] == i]
fig.add_trace(
go.Scatter(
x=cluster_data['UrbanPop'],
y=cluster_data['Rape'],
mode='markers',
name=f'Cluster {i}'
),
row=1, col=2
)
# Update layout
fig.update_layout(
title_text="State Crime Profile Analysis",
showlegend=True
)
fig.show()
""", language="python")
st.write("""
Based on our analysis, here's a summary of findings and recommendations:
**Key Findings:**
- We identified distinct crime profiles among US states
- Each cluster represents a unique pattern of crime rates and urban population
- Some states show surprising similarities despite geographic distance
**Policy Recommendations:**
1. High-Priority States: Focus on states in high-crime clusters
2. Resource Allocation: Distribute federal crime prevention funds based on cluster profiles
3. Best Practice Sharing: Encourage states within the same cluster to share successful strategies
""")
# Additional Resources
st.header("Additional Resources")
st.write("""
- [Scikit-learn Clustering Documentation](https://scikit-learn.org/stable/modules/clustering.html)
- [SciPy Hierarchical Clustering Documentation](https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html)
""")