import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from statsmodels.datasets import get_rdataset
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Set up the style for all plots
plt.style.use('default')
sns.set_theme(style="whitegrid", palette="husl")


def load_arrests_data():
    """Load and return the US Arrests dataset"""
    USArrests = get_rdataset('USArrests').data
    return USArrests
def create_categorical_plot(df, column, target='Survived'):
    """Create an interactive plot for categorical variables"""
    fig = px.bar(
        df.groupby(column)[target].mean().reset_index(),
        x=column,
        y=target,
        title=f'Survival Rate by {column}',
        labels={target: 'Survival Rate', column: column},
        color=target,
        color_continuous_scale='RdBu'
    )
    fig.update_layout(
        plot_bgcolor='rgb(30, 30, 30)',
        paper_bgcolor='rgb(30, 30, 30)',
        font=dict(color='white')
    )
    return fig


def create_numeric_plot(df, column, target='Survived'):
    """Create an interactive plot for numeric variables"""
    fig = px.box(
        df,
        x=target,
        y=column,
        title=f'{column} Distribution by Survival',
        labels={target: 'Survived', column: column},
        color=target,
        color_discrete_sequence=px.colors.qualitative.Set1
    )
    fig.update_layout(
        plot_bgcolor='rgb(30, 30, 30)',
        paper_bgcolor='rgb(30, 30, 30)',
        font=dict(color='white')
    )
    return fig
def show():
    st.title("Week 7: Clustering Lab - State Crime Pattern Analysis")

    # Code Example: Loading and Basic Data Exploration
    with st.expander("Code Example: Loading and Exploring Data"):
        st.code("""
# Load the data
from statsmodels.datasets import get_rdataset
USArrests = get_rdataset('USArrests').data

# Basic data exploration
print("Dataset shape:", USArrests.shape)
print("\\nVariables:", USArrests.columns.tolist())
print("\\nFirst 5 states:")
print(USArrests.head())

# Basic statistics
print("\\nData Summary:")
print(USArrests.describe())
""", language="python")

    # Introduction Section with Learning Objectives
    st.header("Learning Objectives")
    st.markdown("""
This week you'll master:

1. **Unsupervised Learning**: Discover hidden patterns in crime data without predefined categories
2. **K-Means Clustering**: Learn to divide states into distinct safety profiles
3. **Hierarchical Clustering**: Create a "family tree" of state crime patterns
4. **Data Preprocessing**: Understand why scaling is crucial for fair comparisons
""")

    # Interactive Overview
    st.header("Lab Overview")
    st.write("""
Welcome to your hands-on clustering lab! You'll be working as a policy analyst for the Department of Justice,
analyzing crime patterns across US states. Your mission: discover hidden safety profiles that could inform
federal resource allocation and crime prevention strategies.
""")

    # Load Data
    st.header("Exercise 1: Data Detective Work")
    st.write("Let's start by understanding our dataset - the US Arrests data.")
    df = load_arrests_data()

    # Code Example: Data Visualization
    with st.expander("Code Example: Creating Visualizations"):
        st.code("""
# Create correlation heatmap
import plotly.express as px
fig = px.imshow(df.corr(),
                labels=dict(color="Correlation"),
                color_continuous_scale="RdBu")
fig.show()

# Create box plots
fig = px.box(df, title="Data Distribution")
fig.show()
""", language="python")

    # Interactive Data Exploration
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Dataset Overview")
        st.write(f"Number of states: {len(df)}")
        st.write(f"Number of variables: {len(df.columns)}")
        st.write("Variables:", df.columns.tolist())

        # Interactive data summary
        st.subheader("Data Summary")
        summary = df.describe()
        st.dataframe(summary)

    with col2:
        st.subheader("First 5 States")
        st.dataframe(df.head())

    # Interactive correlation heatmap
    st.subheader("Correlation Heatmap")
    fig = px.imshow(df.corr(),
                    labels=dict(color="Correlation"),
                    color_continuous_scale="RdBu")
    st.plotly_chart(fig)
    # Exercise 2: Scaling Challenge
    st.header("Exercise 2: The Scaling Challenge")

    # Code Example: Data Scaling
    with st.expander("Code Example: Scaling Data"):
        st.code("""
# Import StandardScaler
from sklearn.preprocessing import StandardScaler

# Create and fit the scaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Convert back to DataFrame
df_scaled = pd.DataFrame(df_scaled,
                         columns=df.columns,
                         index=df.index)

# Compare original vs scaled data
print("Original data ranges:")
print(df.describe())
print("\\nScaled data ranges:")
print(df_scaled.describe())
""", language="python")

    # Explanation of scaling
    st.markdown("""
### Why Do We Need Scaling?

In our crime data, the variables are measured on very different scales:

- Murder rates: typically 0-20 per 100,000
- Assault rates: typically 50-350 per 100,000
- Urban population: 0-100 percent
- Rape rates: typically 0-50 per 100,000

Without scaling, variables with larger numbers (like Assault) would dominate our analysis,
making smaller-scale variables (like Murder) less influential. This would be like comparing
dollars to cents - the cents would seem insignificant even if they were important!
""")

    # Show original data ranges
    st.subheader("Original Data Ranges")
    col1, col2 = st.columns(2)

    with col1:
        # Create a bar chart of variances
        fig_var = px.bar(
            x=df.columns,
            y=df.var(),
            title="Variance of Each Variable (Before Scaling)",
            labels={'x': 'Crime Variables', 'y': 'Variance'},
            color=df.var(),
            color_continuous_scale='Viridis'
        )
        st.plotly_chart(fig_var)
        st.write("""
Notice how Assault has a much larger variance (about 6,945) compared to Murder (about 19).
This means Assault would dominate our clustering if we didn't scale the data!
""")

    with col2:
        # Create box plots of original data
        fig_box = px.box(df, title="Original Data Distribution")
        fig_box.update_layout(
            xaxis_title="Crime Variables",
            yaxis_title="Rate per 100,000"
        )
        st.plotly_chart(fig_box)

    # Explain standardization
    st.markdown("""
### What is Standardization?

Standardization (also called Z-score normalization) transforms our data so that:

1. Each variable has a mean of 0
2. Each variable has a standard deviation of 1

The formula is z = (x - μ) / σ, where:

- x is the original value
- μ is the mean of the variable
- σ is the standard deviation of the variable
""")
    # Scale the data
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    df_scaled = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)

    # Show scaled data
    st.subheader("After Scaling")

    # Create box plots of scaled data
    fig_scaled = px.box(df_scaled, title="Scaled Data Distribution")
    fig_scaled.update_layout(
        xaxis_title="Crime Variables",
        yaxis_title="Standardized Values"
    )
    st.plotly_chart(fig_scaled)

    st.write("""
After scaling, all variables are on the same scale:

- Mean = 0
- Standard Deviation = 1
- Values typically range from -3 to +3
""")

    # Show before/after comparison for a few states
    st.write("### Before vs After Scaling (Sample States)")
    comparison_df = pd.DataFrame({
        'State': df.index[:5],
        'Original Murder': df['Murder'][:5],
        'Scaled Murder': df_scaled['Murder'][:5],
        'Original Assault': df['Assault'][:5],
        'Scaled Assault': df_scaled['Assault'][:5]
    })
    st.dataframe(comparison_df)

    st.write("""
Notice how the relative differences between states are preserved,
but now all variables contribute equally to our analysis!
""")

    # Why scaling matters for clustering
    st.markdown("""
### Why Scaling Matters for Clustering

In clustering, we measure distances between data points. Without scaling:

- States might be grouped together just because they have similar assault rates
- Important differences in murder rates might be ignored

With scaling:

- All variables contribute equally to the distance calculations
- We can find true patterns in the data, not just patterns in the largest numbers
""")
    # Exercise 3: Finding Optimal Clusters
    st.header("Exercise 3: Finding the Right Number of Groups")

    # Code Example: Elbow Method
    with st.expander("Code Example: Finding Optimal K"):
        st.code("""
# Calculate inertias for different K values
inertias = []
K_values = range(1, 11)
for k in K_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
    kmeans.fit(df_scaled)
    inertias.append(kmeans.inertia_)

# Create elbow plot
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=list(K_values),
    y=inertias,
    mode='lines+markers',
    name='Inertia'
))
fig.update_layout(
    title='Finding the Optimal Number of Clusters',
    xaxis_title='Number of Clusters (K)',
    yaxis_title='Within-Cluster Sum of Squares'
)
fig.show()
""", language="python")

    st.markdown("""
### The Elbow Method Explained

The elbow method helps us find the optimal number of clusters (K) by looking at how the within-cluster sum of squares
(WCSS) changes as we increase the number of clusters. Think of it like this:

- **What is WCSS?** It's a measure of how spread out the points are within each cluster
- **Lower WCSS** means points are closer to their cluster center (better clustering)
- **Higher WCSS** means points are more spread out from their cluster center

As we increase K:

1. WCSS always decreases (more clusters = tighter groups)
2. The rate of decrease slows down
3. We look for the "elbow" - the point where adding more clusters doesn't help much anymore
""")

    # Calculate inertias for different K values
    inertias = []
    K_values = range(1, 11)
    for k in K_values:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
        kmeans.fit(df_scaled)
        inertias.append(kmeans.inertia_)

    # Create interactive elbow plot
    fig_elbow = go.Figure()
    fig_elbow.add_trace(go.Scatter(
        x=list(K_values),
        y=inertias,
        mode='lines+markers',
        name='Inertia'
    ))
    fig_elbow.update_layout(
        title='Finding the Optimal Number of State Crime Profiles',
        xaxis_title='Number of Clusters (K)',
        yaxis_title='Within-Cluster Sum of Squares',
        plot_bgcolor='rgb(30, 30, 30)',
        paper_bgcolor='rgb(30, 30, 30)',
        font=dict(color='white')
    )
    st.plotly_chart(fig_elbow)

    # Interpretation guide
    st.markdown("""
### How to Interpret the Elbow Plot

Look at the plot above and ask yourself:

1. **Where is the "elbow"?**
   - The point where the line starts to level off
   - Adding more clusters doesn't give much improvement
   - In our case, it's around K=4
2. **What do the numbers mean?**
   - K=1: All states in one group (not useful)
   - K=2: Basic high/low crime split
   - K=3: More nuanced grouping
   - K=4: Our "elbow" - a good balance of detail and simplicity
   - K>4: Diminishing returns - more complexity without much benefit
3. **Why not just use more clusters?**
   - More clusters = more complexity to interpret
   - Small clusters might not be meaningful
   - The goal is to find the simplest model that captures the main patterns
""")

    # Show the actual values
    st.write("### WCSS Values for Each K")
    wcss_df = pd.DataFrame({
        'Number of Clusters (K)': K_values,
        'Within-Cluster Sum of Squares': inertias,
        'Improvement from Previous K': [0] + [inertias[i - 1] - inertias[i] for i in range(1, len(inertias))]
    })
    st.dataframe(wcss_df)

    st.markdown("""
### Making the Decision

Based on our elbow plot and the numbers above:

1. The biggest improvements happen from K=1 to K=4
2. After K=4, the improvements get much smaller
3. K=4 gives us a good balance of:
   - Capturing meaningful patterns
   - Keeping the model simple enough to interpret
   - Having enough states in each cluster to be meaningful

This is why we'll use K=4 for our clustering analysis!
""")
    # Exercise 4: K-Means Clustering
    st.header("Exercise 4: K-Means State Profiling")

    # Code Example: K-Means Clustering
    with st.expander("Code Example: K-Means Implementation"):
        st.code("""
# Perform K-means clustering
from sklearn.cluster import KMeans

# Create and fit the model
kmeans = KMeans(
    n_clusters=4,     # Number of clusters
    random_state=42,  # For reproducibility
    n_init=20         # Number of runs with different initial centroids
)
cluster_labels = kmeans.fit_predict(df_scaled)

# Add cluster labels to original data
df_clustered = df.copy()
df_clustered['Cluster'] = cluster_labels

# Visualize the clusters
import plotly.express as px
fig = px.scatter(df_clustered,
                 x='Murder',
                 y='Assault',
                 color='Cluster',
                 hover_data=['UrbanPop', 'Rape'],
                 title='State Crime Profiles')
fig.show()

# Show cluster centers (in standardized units, since the model was fit on scaled data)
centers_df = pd.DataFrame(
    kmeans.cluster_centers_,
    columns=df.columns
)
print("Cluster Centers:")
print(centers_df)
""", language="python")

    st.markdown("""
### What is K-Means Clustering?

K-means is an unsupervised learning algorithm that groups similar data points together. Think of it like organizing
students into study groups based on their interests:

1. **Initialization**:
   - We randomly place K "centers" (centroids) in our data space
   - Each center represents the "average" of its cluster
   - In our case, each center represents a typical crime profile
2. **Assignment**:
   - Each state is assigned to its nearest center
   - "Nearest" is measured by Euclidean distance
   - States with similar crime patterns end up in the same cluster
3. **Update**:
   - Centers move to the average position of their assigned states
   - This process repeats until the centers stop moving
   - The algorithm converges when the assignments stop changing (a local optimum, not necessarily the best possible grouping)
""")
    # Visualize the process
    st.subheader("K-Means in Action")
    st.write("""
Let's see how K-means works with our state crime data. We'll use K=4 clusters to find distinct crime profiles.
""")

    # Let user choose number of clusters
    k = st.slider("Choose number of clusters (K)", 2, 6, 4)

    # Perform K-means clustering
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
    cluster_labels = kmeans.fit_predict(df_scaled)

    # Add cluster labels to original data
    df_clustered = df.copy()
    df_clustered['Cluster'] = cluster_labels

    # Create interactive scatter plot
    fig = px.scatter(df_clustered,
                     x='Murder',
                     y='Assault',
                     color='Cluster',
                     hover_data=['UrbanPop', 'Rape'],
                     title='State Crime Profiles')
    st.plotly_chart(fig)

    # Explain hyperparameters
    st.markdown("""
### K-Means Hyperparameters Explained

1. **n_clusters (K)**
   - The number of groups we want to create
   - We chose K=4 based on the elbow method
   - Each cluster represents a distinct crime profile
2. **random_state**
   - Controls the random initialization of centroids
   - Setting it to 42 ensures reproducible results
   - Different values might give slightly different clusters
3. **n_init**
   - Number of times to run the algorithm with different initial centroids
   - We use 20 to find the best possible clustering
   - Higher values give more reliable results but take longer
4. **max_iter**
   - Maximum number of iterations for each run
   - The default is 300, which is usually enough
   - The algorithm stops earlier if it converges
5. **algorithm**
   - 'lloyd' (the default): the standard K-means algorithm
   - 'elkan': uses the triangle inequality and can be faster for well-separated clusters
   - Older scikit-learn versions called these options 'full' and 'auto'
""")
    # Show cluster centers
    st.subheader("Cluster Centers (Typical Crime Profiles)")
    centers_df = pd.DataFrame(
        kmeans.cluster_centers_,
        columns=df.columns
    )
    st.dataframe(centers_df)
    st.write("""
Each row is the "average" crime profile for that cluster, expressed in standardized units
(z-scores), since the model was fit on the scaled data. For example:

- High values in Murder and Assault indicate a high-crime cluster
- A high UrbanPop with low crime values might indicate urban safety
- Low values across all metrics might indicate rural safety
""")

    # Display cluster analysis
    st.subheader("State Crime Profiles Analysis")
    for cluster_num in range(k):
        cluster_states = df_clustered[df_clustered['Cluster'] == cluster_num]
        st.write(f"**CLUSTER {cluster_num}: {len(cluster_states)} states**")
        st.write("States:", ", ".join(cluster_states.index.tolist()))
        st.write("Average characteristics:")
        avg_profile = cluster_states[['Murder', 'Assault', 'UrbanPop', 'Rape']].mean()
        st.write(avg_profile)

    # Explain the results
    st.markdown("""
### Interpreting the Results

Each cluster represents a distinct crime profile:

1. **Cluster Characteristics**
   - Look at the average values for each crime type
   - Compare urban population percentages
   - Identify the defining features of each cluster
2. **State Groupings**
   - States in the same cluster have similar crime patterns
   - Geographic proximity doesn't always mean similar profiles
   - Some states might surprise you with their cluster membership
3. **Policy Implications**
   - Clusters help identify states with similar challenges
   - They can guide resource allocation and policy development
   - They enable targeted interventions based on crime profiles
""")

    # Exercise 5: Hierarchical Clustering
    st.header("Exercise 5: Hierarchical Clustering Exploration")

    # Code Example: Hierarchical Clustering
    with st.expander("Code Example: Hierarchical Clustering"):
        st.code("""
# Create hierarchical clustering
from scipy.cluster.hierarchy import linkage, dendrogram

# Create linkage matrix
linkage_matrix = linkage(df_scaled, method='complete')

# Plot dendrogram (one line trace per merge in the tree)
import plotly.graph_objects as go
dendro = dendrogram(linkage_matrix, labels=df.index.tolist(), no_plot=True)
fig = go.Figure()
for xs, ys in zip(dendro['icoord'], dendro['dcoord']):
    fig.add_trace(go.Scatter(x=xs, y=ys, mode='lines',
                             line=dict(color='white'), showlegend=False))
fig.update_layout(
    title='State Crime Pattern Family Tree',
    xaxis_title='States',
    yaxis_title='Distance Between Groups'
)
fig.show()

# Cut the tree to get a chosen number of clusters (here 4)
from scipy.cluster.hierarchy import fcluster
hierarchical_labels = fcluster(linkage_matrix, 4, criterion='maxclust') - 1
""", language="python")

    st.markdown("""
### What is Hierarchical Clustering?

Hierarchical clustering creates a tree-like structure (dendrogram) that shows how data points are related at different levels.
Think of it like building a family tree for states based on their crime patterns:

1. **Bottom-Up Approach (Agglomerative)**:
   - Start with each state as its own cluster
   - Find the two closest clusters and merge them
   - Continue merging until all states are in one cluster
   - This creates a complete hierarchy of relationships
2. **Distance Measurement (Linkage)**:
   - Complete linkage: uses the maximum distance between members of two clusters
   - Average linkage: uses the average distance between members of two clusters
   - Single linkage: uses the minimum distance between members of two clusters
   - We use complete linkage for more compact, distinct clusters
""")
    # Create hierarchical clustering
    linkage_matrix = linkage(df_scaled, method='complete')

    # Create interactive dendrogram (one line trace per merge, with state names on the x-axis)
    dendro = dendrogram(linkage_matrix, labels=df.index.tolist(), no_plot=True)
    fig_dendro = go.Figure()
    for xs, ys in zip(dendro['icoord'], dendro['dcoord']):
        fig_dendro.add_trace(go.Scatter(x=xs, y=ys, mode='lines',
                                        line=dict(color='white'), showlegend=False))
    fig_dendro.update_layout(
        title='State Crime Pattern Family Tree',
        xaxis=dict(
            title='States',
            tickmode='array',
            tickvals=[5 + 10 * i for i in range(len(dendro['ivl']))],
            ticktext=dendro['ivl']
        ),
        yaxis_title='Distance Between Groups',
        plot_bgcolor='rgb(30, 30, 30)',
        paper_bgcolor='rgb(30, 30, 30)',
        font=dict(color='white')
    )
    st.plotly_chart(fig_dendro)

    # Explain how to read the dendrogram
    st.markdown("""
### How to Read the Dendrogram

1. **Height of Connections**:
   - Higher connections = more different groups
   - Lower connections = more similar groups
   - The height shows how different two groups are when they merge
2. **Cutting the Tree**:
   - Draw a horizontal line across the tree to create clusters
   - Where you cut determines the number of clusters
   - We'll cut the tree so that we get the same number of clusters as K-means (the K you chose above)
""")
    # Cut the tree to get the same number of clusters as K-means
    hierarchical_labels = fcluster(linkage_matrix, k, criterion='maxclust') - 1

    # Compare K-means and Hierarchical Clustering
    st.header("Comparing K-Means and Hierarchical Clustering")

    # Create side-by-side comparison
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("K-Means Clustering")
        fig_kmeans = px.scatter(df_clustered,
                                x='Murder',
                                y='Assault',
                                color='Cluster',
                                title=f'K-Means Clustering (K={k})',
                                hover_data=['UrbanPop', 'Rape'])
        st.plotly_chart(fig_kmeans)
        st.markdown("""
**K-Means Characteristics**:
- Requires specifying the number of clusters upfront
- Tends to create clusters of similar size
- Works well with spherical clusters
- Faster for large datasets
- Can be sensitive to outliers
""")

    with col2:
        st.subheader("Hierarchical Clustering")
        df_hierarchical = df.copy()
        df_hierarchical['Cluster'] = hierarchical_labels
        fig_hierarchical = px.scatter(df_hierarchical,
                                      x='Murder',
                                      y='Assault',
                                      color='Cluster',
                                      title=f'Hierarchical Clustering ({k} clusters)',
                                      hover_data=['UrbanPop', 'Rape'])
        st.plotly_chart(fig_hierarchical)
        st.markdown("""
**Hierarchical Clustering Characteristics**:
- Creates a complete hierarchy of clusters
- Can handle non-spherical clusters
- More flexible in cluster shapes
- Slower for large datasets
- Less sensitive to outliers
""")

    # Show agreement between methods
    st.subheader("Comparing the Results")

    # Create comparison dataframe
    comparison_df = pd.DataFrame({
        'State': df.index,
        'K-Means Cluster': cluster_labels,
        'Hierarchical Cluster': hierarchical_labels
    })

    # Count agreements (note: cluster numbers are arbitrary labels, so this simple
    # count can understate how similar the two groupings really are)
    agreements = sum(comparison_df['K-Means Cluster'] == comparison_df['Hierarchical Cluster'])
    agreement_percentage = (agreements / len(comparison_df)) * 100
    st.write(f"Methods agreed on {agreements} out of {len(comparison_df)} states ({agreement_percentage:.1f}%)")

    # Show states where methods disagree
    disagreements = comparison_df[comparison_df['K-Means Cluster'] != comparison_df['Hierarchical Cluster']]
    if not disagreements.empty:
        st.write("States where the methods disagreed:")
        st.dataframe(disagreements)
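    # Optional sketch: because cluster numbers are arbitrary labels (K-means "Cluster 2" may
    # correspond to hierarchical "Cluster 0"), a label-by-label match can understate agreement.
    # The adjusted Rand index compares the groupings themselves, ignoring the label names.
    with st.expander("Code Example: Label-Free Agreement with the Adjusted Rand Index (optional sketch)"):
        st.code("""
from sklearn.metrics import adjusted_rand_score

# 1.0 = identical groupings, values near 0 = no better than random labelling
ari = adjusted_rand_score(cluster_labels, hierarchical_labels)
print(f"Adjusted Rand index between K-means and hierarchical clusters: {ari:.2f}")
""", language="python")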
st.markdown(""" | |
### When to Use Each Method | |
1. **Use K-Means when**: | |
- You know the number of clusters | |
- Your data has spherical clusters | |
- You need fast computation | |
- You want clusters of similar size | |
2. **Use Hierarchical Clustering when**: | |
- You don't know the number of clusters | |
- You want to explore the hierarchy | |
- Your clusters might be non-spherical | |
- You need to handle outliers carefully | |
In our case, both methods found similar patterns, suggesting our clusters are robust! | |
""") | |
# Exercise 6: Policy Brief | |
st.header("Exercise 6: Policy Brief Creation") | |
# Code Example: Creating Final Visualizations | |
with st.expander("Code Example: Creating Policy Brief Visualizations"): | |
st.code(""" | |
# Create a comprehensive visualization | |
import plotly.graph_objects as go | |
from plotly.subplots import make_subplots | |
# Create subplots | |
fig = make_subplots(rows=2, cols=2) | |
# Plot 1: Murder vs Assault by cluster | |
for i in range(k): | |
cluster_data = df_clustered[df_clustered['Cluster'] == i] | |
fig.add_trace( | |
go.Scatter( | |
x=cluster_data['Murder'], | |
y=cluster_data['Assault'], | |
mode='markers', | |
name=f'Cluster {i}' | |
), | |
row=1, col=1 | |
) | |
# Plot 2: Urban Population vs Rape by cluster | |
for i in range(k): | |
cluster_data = df_clustered[df_clustered['Cluster'] == i] | |
fig.add_trace( | |
go.Scatter( | |
x=cluster_data['UrbanPop'], | |
y=cluster_data['Rape'], | |
mode='markers', | |
name=f'Cluster {i}' | |
), | |
row=1, col=2 | |
) | |
# Update layout | |
fig.update_layout( | |
title_text="State Crime Profile Analysis", | |
showlegend=True | |
) | |
fig.show() | |
""", language="python") | |
st.write(""" | |
Based on our analysis, here's a summary of findings and recommendations: | |
**Key Findings:** | |
- We identified distinct crime profiles among US states | |
- Each cluster represents a unique pattern of crime rates and urban population | |
- Some states show surprising similarities despite geographic distance | |
**Policy Recommendations:** | |
1. High-Priority States: Focus on states in high-crime clusters | |
2. Resource Allocation: Distribute federal crime prevention funds based on cluster profiles | |
3. Best Practice Sharing: Encourage states within the same cluster to share successful strategies | |
""") | |
# Additional Resources | |
st.header("Additional Resources") | |
st.write(""" | |
- [Scikit-learn Clustering Documentation](https://scikit-learn.org/stable/modules/clustering.html) | |
- [KNN Documentation](https://scikit-learn.org/stable/modules/neighbors.html) | |
""") |