import streamlit as st import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import plotly.express as px import plotly.graph_objects as go from plotly.subplots import make_subplots from sklearn.cluster import KMeans from sklearn.neighbors import KNeighborsClassifier from sklearn.preprocessing import StandardScaler from sklearn.metrics import silhouette_score from statsmodels.datasets import get_rdataset from scipy.cluster.hierarchy import dendrogram, linkage, fcluster # Set up the style for all plots plt.style.use('default') sns.set_theme(style="whitegrid", palette="husl") def load_arrests_data(): """Load and return the US Arrests dataset""" USArrests = get_rdataset('USArrests').data return USArrests def create_categorical_plot(df, column, target='Survived'): """Create an interactive plot for categorical variables""" fig = px.bar( df.groupby(column)[target].mean().reset_index(), x=column, y=target, title=f'Survival Rate by {column}', labels={target: 'Survival Rate', column: column}, color=target, color_continuous_scale='RdBu' ) fig.update_layout( plot_bgcolor='rgb(30, 30, 30)', paper_bgcolor='rgb(30, 30, 30)', font=dict(color='white') ) return fig def create_numeric_plot(df, column, target='Survived'): """Create an interactive plot for numeric variables""" fig = px.box( df, x=target, y=column, title=f'{column} Distribution by Survival', labels={target: 'Survived', column: column}, color=target, color_discrete_sequence=px.colors.qualitative.Set1 ) fig.update_layout( plot_bgcolor='rgb(30, 30, 30)', paper_bgcolor='rgb(30, 30, 30)', font=dict(color='white') ) return fig def show(): st.title("Week 7: Clustering Lab - State Crime Pattern Analysis") # Code Example: Loading and Basic Data Exploration with st.expander("Code Example: Loading and Exploring Data"): st.code(""" # Load the data from statsmodels.datasets import get_rdataset USArrests = get_rdataset('USArrests').data # Basic data exploration print("Dataset shape:", USArrests.shape) print("\\nVariables:", USArrests.columns.tolist()) print("\\nFirst 5 states:") print(USArrests.head()) # Basic statistics print("\\nData Summary:") print(USArrests.describe()) """, language="python") # Introduction Section with Learning Objectives st.header("Learning Objectives") st.markdown(""" In this week, you'll master: 1. **Unsupervised Learning**: Discover hidden patterns in crime data without predefined categories 2. **K-Means Clustering**: Learn to divide states into distinct safety profiles 3. **Hierarchical Clustering**: Create a "family tree" of state crime patterns 4. **Data Preprocessing**: Understand why scaling is crucial for fair comparisons """) # Interactive Overview st.header("Lab Overview") st.write(""" Welcome to your hands-on clustering lab! You'll be working as a policy analyst for the Department of Justice, analyzing crime patterns across US states. Your mission: discover hidden safety profiles that could inform federal resource allocation and crime prevention strategies. """) # Load Data st.header("Exercise 1: Data Detective Work") st.write("Let's start by understanding our dataset - the US Arrests data.") df = load_arrests_data() # Code Example: Data Visualization with st.expander("Code Example: Creating Visualizations"): st.code(""" # Create correlation heatmap import plotly.express as px fig = px.imshow(df.corr(), labels=dict(color="Correlation"), color_continuous_scale="RdBu") fig.show() # Create box plots fig = px.box(df, title="Data Distribution") fig.show() """, language="python") # Interactive Data Exploration col1, col2 = st.columns(2) with col1: st.subheader("Dataset Overview") st.write(f"Number of states: {len(df)}") st.write(f"Number of variables: {len(df.columns)}") st.write("\nVariables:", df.columns.tolist()) # Interactive data summary st.subheader("Data Summary") summary = df.describe() st.dataframe(summary) with col2: st.subheader("First 5 States") st.dataframe(df.head()) # Interactive correlation heatmap st.subheader("Correlation Heatmap") fig = px.imshow(df.corr(), labels=dict(color="Correlation"), color_continuous_scale="RdBu") st.plotly_chart(fig) # Exercise 2: Scaling Challenge st.header("Exercise 2: The Scaling Challenge") # Code Example: Data Scaling with st.expander("Code Example: Scaling Data"): st.code(""" # Import StandardScaler from sklearn.preprocessing import StandardScaler # Create and fit the scaler scaler = StandardScaler() df_scaled = scaler.fit_transform(df) # Convert back to DataFrame df_scaled = pd.DataFrame(df_scaled, columns=df.columns, index=df.index) # Compare original vs scaled data print("Original data ranges:") print(df.describe()) print("\\nScaled data ranges:") print(df_scaled.describe()) """, language="python") # Explanation of scaling st.markdown(""" ### Why Do We Need Scaling? In our crime data, we have variables measured in very different scales: - Murder rates: typically 0-20 per 100,000 - Assault rates: typically 50-350 per 100,000 - Urban population: 0-100 percentage - Rape rates: typically 0-50 per 100,000 Without scaling, variables with larger numbers (like Assault) would dominate our analysis, making smaller-scale variables (like Murder) less influential. This would be like comparing dollars to cents - the cents would seem insignificant even if they were important! """) # Show original data ranges st.subheader("Original Data Ranges") col1, col2 = st.columns(2) with col1: # Create a bar chart of variances fig_var = px.bar( x=df.columns, y=df.var(), title="Variance of Each Variable (Before Scaling)", labels={'x': 'Crime Variables', 'y': 'Variance'}, color=df.var(), color_continuous_scale='Viridis' ) st.plotly_chart(fig_var) st.write(""" Notice how Assault has a much larger variance (6,945) compared to Murder (19). This means Assault would dominate our clustering if we didn't scale the data! """) with col2: # Create box plots of original data fig_box = px.box(df, title="Original Data Distribution") fig_box.update_layout( xaxis_title="Crime Variables", yaxis_title="Rate per 100,000" ) st.plotly_chart(fig_box) # Explain standardization st.markdown(""" ### What is Standardization? Standardization (also called Z-score normalization) transforms our data so that: 1. Each variable has a mean of 0 2. Each variable has a standard deviation of 1 The formula is: z = (x - μ) / σ - x is the original value - μ is the mean of the variable - σ is the standard deviation of the variable """) # Scale the data scaler = StandardScaler() df_scaled = scaler.fit_transform(df) df_scaled = pd.DataFrame(df_scaled, columns=df.columns, index=df.index) # Show scaled data st.subheader("After Scaling") # Create box plots of scaled data fig_scaled = px.box(df_scaled, title="Scaled Data Distribution") fig_scaled.update_layout( xaxis_title="Crime Variables", yaxis_title="Standardized Values" ) st.plotly_chart(fig_scaled) st.write(""" After scaling, all variables are on the same scale: - Mean = 0 - Standard Deviation = 1 - Values typically range from -3 to +3 """) # Show before/after comparison for a few states st.write("### Before vs After Scaling (Sample States)") comparison_df = pd.DataFrame({ 'State': df.index[:5], 'Original Murder': df['Murder'][:5], 'Scaled Murder': df_scaled['Murder'][:5], 'Original Assault': df['Assault'][:5], 'Scaled Assault': df_scaled['Assault'][:5] }) st.dataframe(comparison_df) st.write(""" Notice how the relative differences between states are preserved, but now all variables contribute equally to our analysis! """) # Why scaling matters for clustering st.markdown(""" ### Why Scaling Matters for Clustering In clustering, we measure distances between data points. Without scaling: - States might be grouped together just because they have similar assault rates - Important differences in murder rates might be ignored With scaling: - All variables contribute equally to the distance calculations - We can find true patterns in the data, not just patterns in the largest numbers """) # Exercise 3: Finding Optimal Clusters st.header("Exercise 3: Finding the Right Number of Groups") # Code Example: Elbow Method with st.expander("Code Example: Finding Optimal K"): st.code(""" # Calculate inertias for different K values inertias = [] K_values = range(1, 11) for k in K_values: kmeans = KMeans(n_clusters=k, random_state=42, n_init=20) kmeans.fit(df_scaled) inertias.append(kmeans.inertia_) # Create elbow plot import plotly.graph_objects as go fig = go.Figure() fig.add_trace(go.Scatter( x=list(K_values), y=inertias, mode='lines+markers', name='Inertia' )) fig.update_layout( title='Finding the Optimal Number of Clusters', xaxis_title='Number of Clusters (K)', yaxis_title='Within-Cluster Sum of Squares' ) fig.show() """, language="python") st.markdown(""" ### The Elbow Method Explained The elbow method helps us find the optimal number of clusters (K) by looking at how the "within-cluster sum of squares" (WCSS) changes as we increase the number of clusters. Think of it like this: - **What is WCSS?** It's a measure of how spread out the points are within each cluster - **Lower WCSS** means points are closer to their cluster center (better clustering) - **Higher WCSS** means points are more spread out from their cluster center As we increase K: 1. WCSS always decreases (more clusters = tighter groups) 2. The rate of decrease slows down 3. We look for the "elbow" - where adding more clusters doesn't help much anymore """) # Calculate inertias for different K values inertias = [] K_values = range(1, 11) for k in K_values: kmeans = KMeans(n_clusters=k, random_state=42, n_init=20) kmeans.fit(df_scaled) inertias.append(kmeans.inertia_) # Create interactive elbow plot fig_elbow = go.Figure() fig_elbow.add_trace(go.Scatter( x=list(K_values), y=inertias, mode='lines+markers', name='Inertia' )) fig_elbow.update_layout( title='Finding the Optimal Number of State Crime Profiles', xaxis_title='Number of Clusters (K)', yaxis_title='Within-Cluster Sum of Squares', plot_bgcolor='rgb(30, 30, 30)', paper_bgcolor='rgb(30, 30, 30)', font=dict(color='white') ) st.plotly_chart(fig_elbow) # Interpretation guide st.markdown(""" ### How to Interpret the Elbow Plot Look at the plot above and ask yourself: 1. **Where is the "elbow"?** - The point where the line starts to level off - Adding more clusters doesn't give much improvement - In our case, it's around K=4 2. **What do the numbers mean?** - K=1: All states in one group (not useful) - K=2: Basic high/low crime split - K=3: More nuanced grouping - K=4: Our "elbow" - good balance of detail and simplicity - K>4: Diminishing returns - more complexity without much benefit 3. **Why not just use more clusters?** - More clusters = more complex to interpret - Small clusters might not be meaningful - Goal is to find the simplest model that captures the main patterns """) # Show the actual values st.write("### WCSS Values for Each K") wcss_df = pd.DataFrame({ 'Number of Clusters (K)': K_values, 'Within-Cluster Sum of Squares': inertias, 'Improvement from Previous K': [0] + [inertias[i-1] - inertias[i] for i in range(1, len(inertias))] }) st.dataframe(wcss_df) st.markdown(""" ### Making the Decision Based on our elbow plot and the numbers above: 1. The biggest improvements happen from K=1 to K=4 2. After K=4, the improvements get much smaller 3. K=4 gives us a good balance of: - Capturing meaningful patterns - Keeping the model simple enough to interpret - Having enough states in each cluster to be meaningful This is why we'll use K=4 for our clustering analysis! """) # Exercise 4: K-Means Clustering st.header("Exercise 4: K-Means State Profiling") # Code Example: K-Means Clustering with st.expander("Code Example: K-Means Implementation"): st.code(""" # Perform K-means clustering from sklearn.cluster import KMeans # Create and fit the model kmeans = KMeans( n_clusters=4, # Number of clusters random_state=42, # For reproducibility n_init=20 # Number of times to run with different centroids ) cluster_labels = kmeans.fit_predict(df_scaled) # Add cluster labels to original data df_clustered = df.copy() df_clustered['Cluster'] = cluster_labels # Visualize the clusters import plotly.express as px fig = px.scatter(df_clustered, x='Murder', y='Assault', color='Cluster', hover_data=['UrbanPop', 'Rape'], title='State Crime Profiles') fig.show() # Show cluster centers centers_df = pd.DataFrame( kmeans.cluster_centers_, columns=df.columns ) print("Cluster Centers:") print(centers_df) """, language="python") st.markdown(""" ### What is K-Means Clustering? K-means is an unsupervised learning algorithm that groups similar data points together. Think of it like organizing students into study groups based on their interests: 1. **Initialization**: - We randomly place K "centers" (centroids) in our data space - Each center represents the "average" of its cluster - In our case, each center represents a typical crime profile 2. **Assignment**: - Each state is assigned to its nearest center - "Nearest" is measured by Euclidean distance - States with similar crime patterns end up in the same cluster 3. **Update**: - Centers move to the average position of their assigned states - This process repeats until centers stop moving - The algorithm converges when states are optimally grouped """) # Visualize the process st.subheader("K-Means in Action") st.write(""" Let's see how K-means works with our state crime data. We'll use K=4 clusters to find distinct crime profiles. """) # Let user choose number of clusters k = st.slider("Choose number of clusters (K)", 2, 6, 4) # Perform K-means clustering kmeans = KMeans(n_clusters=k, random_state=42, n_init=20) cluster_labels = kmeans.fit_predict(df_scaled) # Add cluster labels to original data df_clustered = df.copy() df_clustered['Cluster'] = cluster_labels # Create interactive scatter plot fig = px.scatter(df_clustered, x='Murder', y='Assault', color='Cluster', hover_data=['UrbanPop', 'Rape'], title='State Crime Profiles') st.plotly_chart(fig) # Explain hyperparameters st.markdown(""" ### K-Means Hyperparameters Explained 1. **n_clusters (K)** - The number of groups we want to create - We chose K=4 based on the elbow method - Each cluster represents a distinct crime profile 2. **random_state** - Controls the random initialization of centroids - Setting it to 42 ensures reproducible results - Different values might give slightly different clusters 3. **n_init** - Number of times to run the algorithm with different initial centroids - We use 20 to find the best possible clustering - Higher values give more reliable results but take longer 4. **max_iter** - Maximum number of iterations for each run - Default is 300, which is usually enough - Algorithm stops earlier if it converges 5. **algorithm** - 'auto': Automatically chooses the best algorithm - 'full': Traditional K-means - 'elkan': More efficient for well-separated clusters """) # Show cluster centers st.subheader("Cluster Centers (Typical Crime Profiles)") centers_df = pd.DataFrame( kmeans.cluster_centers_, columns=df.columns ) st.dataframe(centers_df) st.write(""" Each row represents the "average" crime profile for that cluster. For example: - High values in Murder and Assault indicate a high-crime cluster - High UrbanPop with low crime rates might indicate urban safety - Low values across all metrics might indicate rural safety """) # Display cluster analysis st.subheader("State Crime Profiles Analysis") for cluster_num in range(k): cluster_states = df_clustered[df_clustered['Cluster'] == cluster_num] st.write(f"\n**CLUSTER {cluster_num}: {len(cluster_states)} states**") st.write("States:", ", ".join(cluster_states.index.tolist())) st.write("Average characteristics:") avg_profile = cluster_states[['Murder', 'Assault', 'UrbanPop', 'Rape']].mean() st.write(avg_profile) # Explain the results st.markdown(""" ### Interpreting the Results Each cluster represents a distinct crime profile: 1. **Cluster Characteristics** - Look at the average values for each crime type - Compare urban population percentages - Identify the defining features of each cluster 2. **State Groupings** - States in the same cluster have similar crime patterns - Geographic proximity doesn't always mean similar profiles - Some states might surprise you with their cluster membership 3. **Policy Implications** - Clusters help identify states with similar challenges - Can guide resource allocation and policy development - Enables targeted interventions based on crime profiles """) # Exercise 5: Hierarchical Clustering st.header("Exercise 5: Hierarchical Clustering Exploration") # Code Example: Hierarchical Clustering with st.expander("Code Example: Hierarchical Clustering"): st.code(""" # Create hierarchical clustering from scipy.cluster.hierarchy import linkage, dendrogram # Create linkage matrix linkage_matrix = linkage(df_scaled, method='complete') # Plot dendrogram import plotly.graph_objects as go dendro = dendrogram(linkage_matrix, labels=df.index.tolist(), no_plot=True) fig = go.Figure() fig.add_trace(go.Scatter( x=dendro['icoord'], y=dendro['dcoord'], mode='lines', line=dict(color='white') )) fig.update_layout( title='State Crime Pattern Family Tree', xaxis_title='States', yaxis_title='Distance Between Groups' ) fig.show() # Cut the tree to get clusters from scipy.cluster.hierarchy import fcluster hierarchical_labels = fcluster(linkage_matrix, k, criterion='maxclust') - 1 """, language="python") st.markdown(""" ### What is Hierarchical Clustering? Hierarchical clustering creates a tree-like structure (dendrogram) that shows how data points are related at different levels. Think of it like building a family tree for states based on their crime patterns: 1. **Bottom-Up Approach (Agglomerative)**: - Start with each state as its own cluster - Find the two closest states and merge them - Continue merging until all states are in one cluster - Creates a complete hierarchy of relationships 2. **Distance Measurement**: - Complete Linkage: Uses the maximum distance between states - Average Linkage: Uses the average distance between states - Single Linkage: Uses the minimum distance between states - We use complete linkage for more distinct clusters """) # Create hierarchical clustering linkage_matrix = linkage(df_scaled, method='complete') # Create interactive dendrogram fig_dendro = go.Figure() dendro = dendrogram(linkage_matrix, labels=df.index.tolist(), no_plot=True) fig_dendro.add_trace(go.Scatter( x=dendro['icoord'], y=dendro['dcoord'], mode='lines', line=dict(color='white') )) fig_dendro.update_layout( title='State Crime Pattern Family Tree', xaxis_title='States', yaxis_title='Distance Between Groups', plot_bgcolor='rgb(30, 30, 30)', paper_bgcolor='rgb(30, 30, 30)', font=dict(color='white') ) st.plotly_chart(fig_dendro) # Explain how to read the dendrogram st.markdown(""" ### How to Read the Dendrogram 1. **Height of Connections**: - Higher connections = more different groups - Lower connections = more similar groups - The height shows how different two groups are 2. **Cutting the Tree**: - Draw a horizontal line to create clusters - Where you cut determines the number of clusters - We'll cut at a height that gives us 4 clusters (like K-means) """) # Cut the tree to get clusters hierarchical_labels = fcluster(linkage_matrix, k, criterion='maxclust') - 1 # Compare K-means and Hierarchical Clustering st.header("Comparing K-Means and Hierarchical Clustering") # Create side-by-side comparison col1, col2 = st.columns(2) with col1: st.subheader("K-Means Clustering") fig_kmeans = px.scatter(df_clustered, x='Murder', y='Assault', color='Cluster', title='K-Means Clustering (K=4)', hover_data=['UrbanPop', 'Rape']) st.plotly_chart(fig_kmeans) st.markdown(""" **K-Means Characteristics**: - Requires specifying number of clusters upfront - Creates clusters of similar size - Works well with spherical clusters - Faster for large datasets - Can be sensitive to outliers """) with col2: st.subheader("Hierarchical Clustering") df_hierarchical = df.copy() df_hierarchical['Cluster'] = hierarchical_labels fig_hierarchical = px.scatter(df_hierarchical, x='Murder', y='Assault', color='Cluster', title='Hierarchical Clustering (4 clusters)', hover_data=['UrbanPop', 'Rape']) st.plotly_chart(fig_hierarchical) st.markdown(""" **Hierarchical Clustering Characteristics**: - Creates a complete hierarchy of clusters - Can handle non-spherical clusters - More flexible in cluster shapes - Slower for large datasets - Less sensitive to outliers """) # Show agreement between methods st.subheader("Comparing the Results") # Create comparison dataframe comparison_df = pd.DataFrame({ 'State': df.index, 'K-Means Cluster': cluster_labels, 'Hierarchical Cluster': hierarchical_labels }) # Count agreements agreements = sum(comparison_df['K-Means Cluster'] == comparison_df['Hierarchical Cluster']) agreement_percentage = (agreements / len(comparison_df)) * 100 st.write(f"Methods agreed on {agreements} out of {len(comparison_df)} states ({agreement_percentage:.1f}%)") # Show states where methods disagree disagreements = comparison_df[comparison_df['K-Means Cluster'] != comparison_df['Hierarchical Cluster']] if not disagreements.empty: st.write("States where the methods disagreed:") st.dataframe(disagreements) st.markdown(""" ### When to Use Each Method 1. **Use K-Means when**: - You know the number of clusters - Your data has spherical clusters - You need fast computation - You want clusters of similar size 2. **Use Hierarchical Clustering when**: - You don't know the number of clusters - You want to explore the hierarchy - Your clusters might be non-spherical - You need to handle outliers carefully In our case, both methods found similar patterns, suggesting our clusters are robust! """) # Exercise 6: Policy Brief st.header("Exercise 6: Policy Brief Creation") # Code Example: Creating Final Visualizations with st.expander("Code Example: Creating Policy Brief Visualizations"): st.code(""" # Create a comprehensive visualization import plotly.graph_objects as go from plotly.subplots import make_subplots # Create subplots fig = make_subplots(rows=2, cols=2) # Plot 1: Murder vs Assault by cluster for i in range(k): cluster_data = df_clustered[df_clustered['Cluster'] == i] fig.add_trace( go.Scatter( x=cluster_data['Murder'], y=cluster_data['Assault'], mode='markers', name=f'Cluster {i}' ), row=1, col=1 ) # Plot 2: Urban Population vs Rape by cluster for i in range(k): cluster_data = df_clustered[df_clustered['Cluster'] == i] fig.add_trace( go.Scatter( x=cluster_data['UrbanPop'], y=cluster_data['Rape'], mode='markers', name=f'Cluster {i}' ), row=1, col=2 ) # Update layout fig.update_layout( title_text="State Crime Profile Analysis", showlegend=True ) fig.show() """, language="python") st.write(""" Based on our analysis, here's a summary of findings and recommendations: **Key Findings:** - We identified distinct crime profiles among US states - Each cluster represents a unique pattern of crime rates and urban population - Some states show surprising similarities despite geographic distance **Policy Recommendations:** 1. High-Priority States: Focus on states in high-crime clusters 2. Resource Allocation: Distribute federal crime prevention funds based on cluster profiles 3. Best Practice Sharing: Encourage states within the same cluster to share successful strategies """) # Additional Resources st.header("Additional Resources") st.write(""" - [Scikit-learn Clustering Documentation](https://scikit-learn.org/stable/modules/clustering.html) - [KNN Documentation](https://scikit-learn.org/stable/modules/neighbors.html) """)