Spaces:

raymondEDS
/

DS_webclass

Running

raymondEDS

fixing navigation

1ecc668 2 months ago

27.9 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	import plotly.express as px
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	from sklearn.cluster import KMeans
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.preprocessing import StandardScaler
	from sklearn.metrics import silhouette_score
	from statsmodels.datasets import get_rdataset
	from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

	# Module-level styling for matplotlib/seaborn figures: select matplotlib's
	# 'default' style sheet first, then apply seaborn's "whitegrid" theme with
	# the "husl" palette. The order matters because both calls set global
	# plotting defaults and the later call wins where they overlap.
	plt.style.use('default')
	sns.set_theme(style="whitegrid", palette="husl")

	def load_arrests_data():
	    """Fetch the 'USArrests' R dataset and return its data table."""
	    # get_rdataset returns a dataset wrapper; only its `.data` attribute
	    # is handed back to the caller.
	    return get_rdataset('USArrests').data

	def create_categorical_plot(df, column, target='Survived'):
	    """Build a bar chart of the mean of `target` for each value of `column`.

	    Args:
	        df: Table containing both `column` and `target`.
	        column: Name of the grouping column (shown on the x axis).
	        target: Name of the column that is averaged per group.

	    Returns:
	        The Plotly figure, styled with a dark background and white text.
	    """
	    # One row per category, holding the average of the target column.
	    mean_by_group = df.groupby(column)[target].mean().reset_index()

	    axis_labels = {target: 'Survival Rate', column: column}
	    chart = px.bar(
	        mean_by_group,
	        x=column,
	        y=target,
	        title=f'Survival Rate by {column}',
	        labels=axis_labels,
	        color=target,
	        color_continuous_scale='RdBu',
	    )

	    # Dark theme: same colour for the plot area and the surrounding paper.
	    background = 'rgb(30, 30, 30)'
	    chart.update_layout(
	        plot_bgcolor=background,
	        paper_bgcolor=background,
	        font={'color': 'white'},
	    )
	    return chart

	def create_numeric_plot(df, column, target='Survived'):
	    """Build a box plot of `column`, split and coloured by `target`.

	    Args:
	        df: Table containing both `column` and `target`.
	        column: Name of the numeric column (shown on the y axis).
	        target: Name of the column used for the x axis and the colours.

	    Returns:
	        The Plotly figure, styled with a dark background and white text.
	    """
	    axis_labels = {target: 'Survived', column: column}
	    palette = px.colors.qualitative.Set1
	    chart = px.box(
	        df,
	        x=target,
	        y=column,
	        title=f'{column} Distribution by Survival',
	        labels=axis_labels,
	        color=target,
	        color_discrete_sequence=palette,
	    )

	    # Dark theme: same colour for the plot area and the surrounding paper.
	    background = 'rgb(30, 30, 30)'
	    chart.update_layout(
	        plot_bgcolor=background,
	        paper_bgcolor=background,
	        font={'color': 'white'},
	    )
	    return chart

	def _dendrogram_line_coords(dendro):
	    """Flatten SciPy dendrogram links into flat x/y sequences for one trace.

	    `dendro['icoord']` and `dendro['dcoord']` each hold one list of
	    coordinates per link, so they cannot be passed to a scatter trace
	    directly. The links are concatenated with a `None` after each one,
	    which leaves a gap in the line so separate links are not joined.

	    Args:
	        dendro: Dictionary returned by `dendrogram(..., no_plot=True)`.

	    Returns:
	        A `(xs, ys)` tuple of flat lists.
	    """
	    xs, ys = [], []
	    for link_x, link_y in zip(dendro['icoord'], dendro['dcoord']):
	        xs.extend(link_x)
	        xs.append(None)
	        ys.extend(link_y)
	        ys.append(None)
	    return xs, ys


	def _align_cluster_labels(reference_labels, labels):
	    """Renumber `labels` so they overlap `reference_labels` as much as possible.

	    Cluster ids are arbitrary: two methods can produce the same grouping
	    under different numbers. This picks the one-to-one renumbering that
	    maximises the number of points carrying the same id in both labelings,
	    so that a direct `==` comparison afterwards is meaningful.

	    Args:
	        reference_labels: Cluster ids to align to (one per observation).
	        labels: Cluster ids to renumber (same length and order).

	    Returns:
	        A NumPy array with the renumbered ids. Ids in `labels` that cannot
	        be paired with a reference id receive fresh ids above the largest
	        reference id, so they never count as an agreement by accident.
	    """
	    # Function-scope import keeps this block self-contained; scipy is
	    # already a dependency of this module.
	    from scipy.optimize import linear_sum_assignment

	    reference_labels = np.asarray(reference_labels)
	    labels = np.asarray(labels)

	    # Rows: ids in `labels`; columns: ids in `reference_labels`.
	    overlap = pd.crosstab(labels, reference_labels)
	    # The solver minimises cost, so negate the counts to maximise overlap.
	    rows, cols = linear_sum_assignment(-overlap.to_numpy())
	    mapping = {
	        int(overlap.index[r]): int(overlap.columns[c])
	        for r, c in zip(rows, cols)
	    }

	    next_free = int(reference_labels.max()) + 1
	    for label in overlap.index:
	        if int(label) not in mapping:
	            mapping[int(label)] = next_free
	            next_free += 1

	    return np.array([mapping[int(label)] for label in labels])


	def show():
	    """Render the "Week 7: Clustering Lab" Streamlit page.

	    The page loads the US Arrests data, standardizes it, draws an elbow
	    plot, runs K-Means with a user-selected K, builds a complete-linkage
	    dendrogram, and compares the two clusterings. All output is written
	    through Streamlit calls; nothing is returned.
	    """
	    st.title("Week 7: Clustering Lab - State Crime Pattern Analysis")

	    # Code Example: Loading and Basic Data Exploration
	    with st.expander("Code Example: Loading and Exploring Data"):
	        st.code("""
	        # Load the data
	        from statsmodels.datasets import get_rdataset
	        USArrests = get_rdataset('USArrests').data

	        # Basic data exploration
	        print("Dataset shape:", USArrests.shape)
	        print("\\nVariables:", USArrests.columns.tolist())
	        print("\\nFirst 5 states:")
	        print(USArrests.head())

	        # Basic statistics
	        print("\\nData Summary:")
	        print(USArrests.describe())
	        """, language="python")

	    # Introduction Section with Learning Objectives
	    st.header("Learning Objectives")
	    st.markdown("""
	    In this week, you'll master:
	    1. Unsupervised Learning: Discover hidden patterns in crime data without predefined categories
	    2. K-Means Clustering: Learn to divide states into distinct safety profiles
	    3. Hierarchical Clustering: Create a "family tree" of state crime patterns
	    4. Data Preprocessing: Understand why scaling is crucial for fair comparisons
	    """)

	    # Interactive Overview
	    st.header("Lab Overview")
	    st.write("""
	    Welcome to your hands-on clustering lab! You'll be working as a policy analyst for the Department of Justice,
	    analyzing crime patterns across US states. Your mission: discover hidden safety profiles that could inform
	    federal resource allocation and crime prevention strategies.
	    """)

	    # Load Data
	    st.header("Exercise 1: Data Detective Work")
	    st.write("Let's start by understanding our dataset - the US Arrests data.")

	    df = load_arrests_data()

	    # Code Example: Data Visualization
	    with st.expander("Code Example: Creating Visualizations"):
	        st.code("""
	        # Create correlation heatmap
	        import plotly.express as px
	        fig = px.imshow(df.corr(),
	                        labels=dict(color="Correlation"),
	                        color_continuous_scale="RdBu")
	        fig.show()

	        # Create box plots
	        fig = px.box(df, title="Data Distribution")
	        fig.show()
	        """, language="python")

	    # Interactive Data Exploration
	    col1, col2 = st.columns(2)

	    with col1:
	        st.subheader("Dataset Overview")
	        st.write(f"Number of states: {len(df)}")
	        st.write(f"Number of variables: {len(df.columns)}")
	        st.write("\nVariables:", df.columns.tolist())

	        # Interactive data summary
	        st.subheader("Data Summary")
	        summary = df.describe()
	        st.dataframe(summary)

	    with col2:
	        st.subheader("First 5 States")
	        st.dataframe(df.head())

	        # Interactive correlation heatmap
	        st.subheader("Correlation Heatmap")
	        fig = px.imshow(df.corr(),
	                        labels=dict(color="Correlation"),
	                        color_continuous_scale="RdBu")
	        st.plotly_chart(fig)

	    # Exercise 2: Scaling Challenge
	    st.header("Exercise 2: The Scaling Challenge")

	    # Code Example: Data Scaling
	    with st.expander("Code Example: Scaling Data"):
	        st.code("""
	        # Import StandardScaler
	        from sklearn.preprocessing import StandardScaler

	        # Create and fit the scaler
	        scaler = StandardScaler()
	        df_scaled = scaler.fit_transform(df)

	        # Convert back to DataFrame
	        df_scaled = pd.DataFrame(df_scaled,
	                                 columns=df.columns,
	                                 index=df.index)

	        # Compare original vs scaled data
	        print("Original data ranges:")
	        print(df.describe())
	        print("\\nScaled data ranges:")
	        print(df_scaled.describe())
	        """, language="python")

	    # Explanation of scaling
	    st.markdown("""
	    ### Why Do We Need Scaling?

	    In our crime data, we have variables measured in very different scales:
	    - Murder rates: typically 0-20 per 100,000
	    - Assault rates: typically 50-350 per 100,000
	    - Urban population: 0-100 percentage
	    - Rape rates: typically 0-50 per 100,000

	    Without scaling, variables with larger numbers (like Assault) would dominate our analysis,
	    making smaller-scale variables (like Murder) less influential. This would be like comparing
	    dollars to cents - the cents would seem insignificant even if they were important!
	    """)

	    # Show original data ranges
	    st.subheader("Original Data Ranges")
	    col1, col2 = st.columns(2)

	    with col1:
	        # Create a bar chart of variances
	        fig_var = px.bar(
	            x=df.columns,
	            y=df.var(),
	            title="Variance of Each Variable (Before Scaling)",
	            labels={'x': 'Crime Variables', 'y': 'Variance'},
	            color=df.var(),
	            color_continuous_scale='Viridis'
	        )
	        st.plotly_chart(fig_var)

	        st.write("""
	        Notice how Assault has a much larger variance (6,945) compared to Murder (19).
	        This means Assault would dominate our clustering if we didn't scale the data!
	        """)

	    with col2:
	        # Create box plots of original data
	        fig_box = px.box(df, title="Original Data Distribution")
	        fig_box.update_layout(
	            xaxis_title="Crime Variables",
	            yaxis_title="Rate per 100,000"
	        )
	        st.plotly_chart(fig_box)

	    # Explain standardization
	    st.markdown("""
	    ### What is Standardization?

	    Standardization (also called Z-score normalization) transforms our data so that:
	    1. Each variable has a mean of 0
	    2. Each variable has a standard deviation of 1

	    The formula is: z = (x - μ) / σ
	    - x is the original value
	    - μ is the mean of the variable
	    - σ is the standard deviation of the variable
	    """)

	    # Scale the data
	    scaler = StandardScaler()
	    df_scaled = scaler.fit_transform(df)
	    df_scaled = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)

	    # Show scaled data
	    st.subheader("After Scaling")

	    # Create box plots of scaled data
	    fig_scaled = px.box(df_scaled, title="Scaled Data Distribution")
	    fig_scaled.update_layout(
	        xaxis_title="Crime Variables",
	        yaxis_title="Standardized Values"
	    )
	    st.plotly_chart(fig_scaled)

	    st.write("""
	    After scaling, all variables are on the same scale:
	    - Mean = 0
	    - Standard Deviation = 1
	    - Values typically range from -3 to +3
	    """)

	    # Show before/after comparison for a few states
	    st.write("### Before vs After Scaling (Sample States)")
	    comparison_df = pd.DataFrame({
	        'State': df.index[:5],
	        'Original Murder': df['Murder'][:5],
	        'Scaled Murder': df_scaled['Murder'][:5],
	        'Original Assault': df['Assault'][:5],
	        'Scaled Assault': df_scaled['Assault'][:5]
	    })
	    st.dataframe(comparison_df)

	    st.write("""
	    Notice how the relative differences between states are preserved,
	    but now all variables contribute equally to our analysis!
	    """)

	    # Why scaling matters for clustering
	    st.markdown("""
	    ### Why Scaling Matters for Clustering

	    In clustering, we measure distances between data points. Without scaling:
	    - States might be grouped together just because they have similar assault rates
	    - Important differences in murder rates might be ignored

	    With scaling:
	    - All variables contribute equally to the distance calculations
	    - We can find true patterns in the data, not just patterns in the largest numbers
	    """)

	    # Exercise 3: Finding Optimal Clusters
	    st.header("Exercise 3: Finding the Right Number of Groups")

	    # Code Example: Elbow Method
	    with st.expander("Code Example: Finding Optimal K"):
	        st.code("""
	        # Calculate inertias for different K values
	        inertias = []
	        K_values = range(1, 11)

	        for k in K_values:
	            kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
	            kmeans.fit(df_scaled)
	            inertias.append(kmeans.inertia_)

	        # Create elbow plot
	        import plotly.graph_objects as go
	        fig = go.Figure()
	        fig.add_trace(go.Scatter(
	            x=list(K_values),
	            y=inertias,
	            mode='lines+markers',
	            name='Inertia'
	        ))
	        fig.update_layout(
	            title='Finding the Optimal Number of Clusters',
	            xaxis_title='Number of Clusters (K)',
	            yaxis_title='Within-Cluster Sum of Squares'
	        )
	        fig.show()
	        """, language="python")

	    st.markdown("""
	    ### The Elbow Method Explained

	    The elbow method helps us find the optimal number of clusters (K) by looking at how the "within-cluster sum of squares"
	    (WCSS) changes as we increase the number of clusters. Think of it like this:

	    - What is WCSS? It's a measure of how spread out the points are within each cluster
	    - Lower WCSS means points are closer to their cluster center (better clustering)
	    - Higher WCSS means points are more spread out from their cluster center

	    As we increase K:
	    1. WCSS always decreases (more clusters = tighter groups)
	    2. The rate of decrease slows down
	    3. We look for the "elbow" - where adding more clusters doesn't help much anymore
	    """)

	    # Calculate inertias for different K values
	    inertias = []
	    K_values = range(1, 11)

	    # Named n_clusters so the slider-selected `k` further down is the only `k`.
	    for n_clusters in K_values:
	        elbow_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=20)
	        elbow_model.fit(df_scaled)
	        inertias.append(elbow_model.inertia_)

	    # Create interactive elbow plot
	    fig_elbow = go.Figure()
	    fig_elbow.add_trace(go.Scatter(
	        x=list(K_values),
	        y=inertias,
	        mode='lines+markers',
	        name='Inertia'
	    ))
	    fig_elbow.update_layout(
	        title='Finding the Optimal Number of State Crime Profiles',
	        xaxis_title='Number of Clusters (K)',
	        yaxis_title='Within-Cluster Sum of Squares',
	        plot_bgcolor='rgb(30, 30, 30)',
	        paper_bgcolor='rgb(30, 30, 30)',
	        font=dict(color='white')
	    )
	    st.plotly_chart(fig_elbow)

	    # Interpretation guide
	    st.markdown("""
	    ### How to Interpret the Elbow Plot

	    Look at the plot above and ask yourself:
	    1. Where is the "elbow"?
	        - The point where the line starts to level off
	        - Adding more clusters doesn't give much improvement
	        - In our case, it's around K=4

	    2. What do the numbers mean?
	        - K=1: All states in one group (not useful)
	        - K=2: Basic high/low crime split
	        - K=3: More nuanced grouping
	        - K=4: Our "elbow" - good balance of detail and simplicity
	        - K>4: Diminishing returns - more complexity without much benefit

	    3. Why not just use more clusters?
	        - More clusters = more complex to interpret
	        - Small clusters might not be meaningful
	        - Goal is to find the simplest model that captures the main patterns
	    """)

	    # Show the actual values
	    st.write("### WCSS Values for Each K")
	    # Drop in WCSS relative to the previous K; the first K has no predecessor.
	    improvements = [0] + [
	        previous - current
	        for previous, current in zip(inertias, inertias[1:])
	    ]
	    wcss_df = pd.DataFrame({
	        'Number of Clusters (K)': K_values,
	        'Within-Cluster Sum of Squares': inertias,
	        'Improvement from Previous K': improvements
	    })
	    st.dataframe(wcss_df)

	    st.markdown("""
	    ### Making the Decision

	    Based on our elbow plot and the numbers above:
	    1. The biggest improvements happen from K=1 to K=4
	    2. After K=4, the improvements get much smaller
	    3. K=4 gives us a good balance of:
	        - Capturing meaningful patterns
	        - Keeping the model simple enough to interpret
	        - Having enough states in each cluster to be meaningful

	    This is why we'll use K=4 for our clustering analysis!
	    """)

	    # Exercise 4: K-Means Clustering
	    st.header("Exercise 4: K-Means State Profiling")

	    # Code Example: K-Means Clustering
	    with st.expander("Code Example: K-Means Implementation"):
	        st.code("""
	        # Perform K-means clustering
	        from sklearn.cluster import KMeans

	        # Create and fit the model
	        kmeans = KMeans(
	            n_clusters=4,     # Number of clusters
	            random_state=42,  # For reproducibility
	            n_init=20         # Number of times to run with different centroids
	        )
	        cluster_labels = kmeans.fit_predict(df_scaled)

	        # Add cluster labels to original data
	        df_clustered = df.copy()
	        df_clustered['Cluster'] = cluster_labels

	        # Visualize the clusters
	        import plotly.express as px
	        fig = px.scatter(df_clustered,
	                         x='Murder',
	                         y='Assault',
	                         color='Cluster',
	                         hover_data=['UrbanPop', 'Rape'],
	                         title='State Crime Profiles')
	        fig.show()

	        # Show cluster centers
	        centers_df = pd.DataFrame(
	            kmeans.cluster_centers_,
	            columns=df.columns
	        )
	        print("Cluster Centers:")
	        print(centers_df)
	        """, language="python")

	    st.markdown("""
	    ### What is K-Means Clustering?

	    K-means is an unsupervised learning algorithm that groups similar data points together. Think of it like organizing
	    students into study groups based on their interests:

	    1. Initialization:
	        - We randomly place K "centers" (centroids) in our data space
	        - Each center represents the "average" of its cluster
	        - In our case, each center represents a typical crime profile

	    2. Assignment:
	        - Each state is assigned to its nearest center
	        - "Nearest" is measured by Euclidean distance
	        - States with similar crime patterns end up in the same cluster

	    3. Update:
	        - Centers move to the average position of their assigned states
	        - This process repeats until centers stop moving
	        - The algorithm converges when states are optimally grouped
	    """)

	    # Visualize the process
	    st.subheader("K-Means in Action")
	    st.write("""
	    Let's see how K-means works with our state crime data. We'll use K=4 clusters to find distinct crime profiles.
	    """)

	    # Let user choose number of clusters
	    k = st.slider("Choose number of clusters (K)", 2, 6, 4)

	    # Perform K-means clustering
	    kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
	    cluster_labels = kmeans.fit_predict(df_scaled)

	    # Add cluster labels to original data
	    df_clustered = df.copy()
	    df_clustered['Cluster'] = cluster_labels

	    # Create interactive scatter plot
	    fig = px.scatter(df_clustered,
	                     x='Murder',
	                     y='Assault',
	                     color='Cluster',
	                     hover_data=['UrbanPop', 'Rape'],
	                     title='State Crime Profiles')
	    st.plotly_chart(fig)

	    # Explain hyperparameters
	    st.markdown("""
	    ### K-Means Hyperparameters Explained

	    1. n_clusters (K)
	        - The number of groups we want to create
	        - We chose K=4 based on the elbow method
	        - Each cluster represents a distinct crime profile

	    2. random_state
	        - Controls the random initialization of centroids
	        - Setting it to 42 ensures reproducible results
	        - Different values might give slightly different clusters

	    3. n_init
	        - Number of times to run the algorithm with different initial centroids
	        - We use 20 to find the best possible clustering
	        - Higher values give more reliable results but take longer

	    4. max_iter
	        - Maximum number of iterations for each run
	        - Default is 300, which is usually enough
	        - Algorithm stops earlier if it converges

	    5. algorithm
	        - 'lloyd': Traditional K-means and the default (older scikit-learn releases called this 'full' or 'auto')
	        - 'elkan': More efficient for well-separated clusters
	    """)

	    # Show cluster centers
	    st.subheader("Cluster Centers (Typical Crime Profiles)")
	    centers_df = pd.DataFrame(
	        kmeans.cluster_centers_,
	        columns=df.columns
	    )
	    st.dataframe(centers_df)

	    st.write("""
	    Each row represents the "average" crime profile for that cluster. For example:
	    - High values in Murder and Assault indicate a high-crime cluster
	    - High UrbanPop with low crime rates might indicate urban safety
	    - Low values across all metrics might indicate rural safety
	    """)

	    # Display cluster analysis
	    st.subheader("State Crime Profiles Analysis")

	    for cluster_num in range(k):
	        cluster_states = df_clustered[df_clustered['Cluster'] == cluster_num]
	        st.write(f"\nCLUSTER {cluster_num}: {len(cluster_states)} states")
	        st.write("States:", ", ".join(cluster_states.index.tolist()))
	        st.write("Average characteristics:")
	        avg_profile = cluster_states[['Murder', 'Assault', 'UrbanPop', 'Rape']].mean()
	        st.write(avg_profile)

	    # Explain the results
	    st.markdown("""
	    ### Interpreting the Results

	    Each cluster represents a distinct crime profile:
	    1. Cluster Characteristics
	        - Look at the average values for each crime type
	        - Compare urban population percentages
	        - Identify the defining features of each cluster

	    2. State Groupings
	        - States in the same cluster have similar crime patterns
	        - Geographic proximity doesn't always mean similar profiles
	        - Some states might surprise you with their cluster membership

	    3. Policy Implications
	        - Clusters help identify states with similar challenges
	        - Can guide resource allocation and policy development
	        - Enables targeted interventions based on crime profiles
	    """)

	    # Exercise 5: Hierarchical Clustering
	    st.header("Exercise 5: Hierarchical Clustering Exploration")

	    # Code Example: Hierarchical Clustering
	    with st.expander("Code Example: Hierarchical Clustering"):
	        st.code("""
	        # Create hierarchical clustering
	        from scipy.cluster.hierarchy import linkage, dendrogram

	        # Create linkage matrix
	        linkage_matrix = linkage(df_scaled, method='complete')

	        # Plot dendrogram
	        import plotly.graph_objects as go
	        dendro = dendrogram(linkage_matrix, labels=df.index.tolist(), no_plot=True)

	        # icoord/dcoord hold one list of points per link, so flatten them
	        # and put None between links to keep the links separate
	        xs, ys = [], []
	        for link_x, link_y in zip(dendro['icoord'], dendro['dcoord']):
	            xs += list(link_x) + [None]
	            ys += list(link_y) + [None]

	        fig = go.Figure()
	        fig.add_trace(go.Scatter(
	            x=xs,
	            y=ys,
	            mode='lines',
	            line=dict(color='white')
	        ))
	        fig.update_layout(
	            title='State Crime Pattern Family Tree',
	            xaxis_title='States',
	            yaxis_title='Distance Between Groups'
	        )
	        fig.show()

	        # Cut the tree to get clusters
	        from scipy.cluster.hierarchy import fcluster
	        hierarchical_labels = fcluster(linkage_matrix, k, criterion='maxclust') - 1
	        """, language="python")

	    st.markdown("""
	    ### What is Hierarchical Clustering?

	    Hierarchical clustering creates a tree-like structure (dendrogram) that shows how data points are related at different levels.
	    Think of it like building a family tree for states based on their crime patterns:

	    1. Bottom-Up Approach (Agglomerative):
	        - Start with each state as its own cluster
	        - Find the two closest states and merge them
	        - Continue merging until all states are in one cluster
	        - Creates a complete hierarchy of relationships

	    2. Distance Measurement:
	        - Complete Linkage: Uses the maximum distance between states
	        - Average Linkage: Uses the average distance between states
	        - Single Linkage: Uses the minimum distance between states
	        - We use complete linkage for more distinct clusters
	    """)

	    # Create hierarchical clustering
	    linkage_matrix = linkage(df_scaled, method='complete')

	    # Create interactive dendrogram
	    dendro = dendrogram(linkage_matrix, labels=df.index.tolist(), no_plot=True)
	    # A scatter trace needs flat x/y sequences, not one list per link.
	    dendro_x, dendro_y = _dendrogram_line_coords(dendro)

	    fig_dendro = go.Figure()
	    fig_dendro.add_trace(go.Scatter(
	        x=dendro_x,
	        y=dendro_y,
	        mode='lines',
	        line=dict(color='white')
	    ))

	    fig_dendro.update_layout(
	        title='State Crime Pattern Family Tree',
	        xaxis_title='States',
	        yaxis_title='Distance Between Groups',
	        plot_bgcolor='rgb(30, 30, 30)',
	        paper_bgcolor='rgb(30, 30, 30)',
	        font=dict(color='white')
	    )
	    # Label the leaves with the state names in dendrogram order.
	    # NOTE(review): relies on SciPy placing leaves at x = 5, 15, 25, ...
	    # (the spacing its own plotting code uses) - confirm if SciPy is upgraded.
	    leaf_names = dendro['ivl']
	    fig_dendro.update_xaxes(
	        tickmode='array',
	        tickvals=[5 + 10 * position for position in range(len(leaf_names))],
	        ticktext=leaf_names,
	        tickangle=-90
	    )
	    st.plotly_chart(fig_dendro)

	    # Explain how to read the dendrogram
	    st.markdown("""
	    ### How to Read the Dendrogram

	    1. Height of Connections:
	        - Higher connections = more different groups
	        - Lower connections = more similar groups
	        - The height shows how different two groups are

	    2. Cutting the Tree:
	        - Draw a horizontal line to create clusters
	        - Where you cut determines the number of clusters
	        - We'll cut at a height that gives us 4 clusters (like K-means)
	    """)

	    # Cut the tree to get clusters (fcluster ids start at 1, hence the -1),
	    # then renumber them to line up with the K-Means ids. Without this the
	    # two methods' ids are unrelated and comparing them with == is meaningless.
	    hierarchical_labels = _align_cluster_labels(
	        cluster_labels,
	        fcluster(linkage_matrix, k, criterion='maxclust') - 1
	    )

	    # Compare K-means and Hierarchical Clustering
	    st.header("Comparing K-Means and Hierarchical Clustering")

	    # Create side-by-side comparison
	    col1, col2 = st.columns(2)

	    with col1:
	        st.subheader("K-Means Clustering")
	        # Title follows the slider value instead of a fixed "K=4".
	        fig_kmeans = px.scatter(df_clustered,
	                                x='Murder',
	                                y='Assault',
	                                color='Cluster',
	                                title=f'K-Means Clustering (K={k})',
	                                hover_data=['UrbanPop', 'Rape'])
	        st.plotly_chart(fig_kmeans)

	        st.markdown("""
	        K-Means Characteristics:
	        - Requires specifying number of clusters upfront
	        - Creates clusters of similar size
	        - Works well with spherical clusters
	        - Faster for large datasets
	        - Can be sensitive to outliers
	        """)

	    with col2:
	        st.subheader("Hierarchical Clustering")
	        df_hierarchical = df.copy()
	        df_hierarchical['Cluster'] = hierarchical_labels
	        fig_hierarchical = px.scatter(df_hierarchical,
	                                      x='Murder',
	                                      y='Assault',
	                                      color='Cluster',
	                                      title=f'Hierarchical Clustering ({k} clusters)',
	                                      hover_data=['UrbanPop', 'Rape'])
	        st.plotly_chart(fig_hierarchical)

	        st.markdown("""
	        Hierarchical Clustering Characteristics:
	        - Creates a complete hierarchy of clusters
	        - Can handle non-spherical clusters
	        - More flexible in cluster shapes
	        - Slower for large datasets
	        - Less sensitive to outliers
	        """)

	    # Show agreement between methods
	    st.subheader("Comparing the Results")

	    # Create comparison dataframe
	    comparison_df = pd.DataFrame({
	        'State': df.index,
	        'K-Means Cluster': cluster_labels,
	        'Hierarchical Cluster': hierarchical_labels
	    })

	    # Count agreements (ids were aligned above, so equality is meaningful)
	    same_cluster = comparison_df['K-Means Cluster'] == comparison_df['Hierarchical Cluster']
	    agreements = int(same_cluster.sum())
	    agreement_percentage = (agreements / len(comparison_df)) * 100

	    st.write(f"Methods agreed on {agreements} out of {len(comparison_df)} states ({agreement_percentage:.1f}%)")

	    # Show states where methods disagree
	    disagreements = comparison_df[~same_cluster]
	    if not disagreements.empty:
	        st.write("States where the methods disagreed:")
	        st.dataframe(disagreements)

	    st.markdown("""
	    ### When to Use Each Method

	    1. Use K-Means when:
	        - You know the number of clusters
	        - Your data has spherical clusters
	        - You need fast computation
	        - You want clusters of similar size

	    2. Use Hierarchical Clustering when:
	        - You don't know the number of clusters
	        - You want to explore the hierarchy
	        - Your clusters might be non-spherical
	        - You need to handle outliers carefully

	    In our case, both methods found similar patterns, suggesting our clusters are robust!
	    """)

	    # Exercise 6: Policy Brief
	    st.header("Exercise 6: Policy Brief Creation")

	    # Code Example: Creating Final Visualizations
	    with st.expander("Code Example: Creating Policy Brief Visualizations"):
	        st.code("""
	        # Create a comprehensive visualization
	        import plotly.graph_objects as go
	        from plotly.subplots import make_subplots

	        # Create subplots
	        fig = make_subplots(rows=2, cols=2)

	        # Plot 1: Murder vs Assault by cluster
	        for i in range(k):
	            cluster_data = df_clustered[df_clustered['Cluster'] == i]
	            fig.add_trace(
	                go.Scatter(
	                    x=cluster_data['Murder'],
	                    y=cluster_data['Assault'],
	                    mode='markers',
	                    name=f'Cluster {i}'
	                ),
	                row=1, col=1
	            )

	        # Plot 2: Urban Population vs Rape by cluster
	        for i in range(k):
	            cluster_data = df_clustered[df_clustered['Cluster'] == i]
	            fig.add_trace(
	                go.Scatter(
	                    x=cluster_data['UrbanPop'],
	                    y=cluster_data['Rape'],
	                    mode='markers',
	                    name=f'Cluster {i}'
	                ),
	                row=1, col=2
	            )

	        # Update layout
	        fig.update_layout(
	            title_text="State Crime Profile Analysis",
	            showlegend=True
	        )
	        fig.show()
	        """, language="python")

	    st.write("""
	    Based on our analysis, here's a summary of findings and recommendations:

	    Key Findings:
	    - We identified distinct crime profiles among US states
	    - Each cluster represents a unique pattern of crime rates and urban population
	    - Some states show surprising similarities despite geographic distance

	    Policy Recommendations:
	    1. High-Priority States: Focus on states in high-crime clusters
	    2. Resource Allocation: Distribute federal crime prevention funds based on cluster profiles
	    3. Best Practice Sharing: Encourage states within the same cluster to share successful strategies
	    """)

	    # Additional Resources
	    st.header("Additional Resources")
	    st.write("""
	    - [Scikit-learn Clustering Documentation](https://scikit-learn.org/stable/modules/clustering.html)
	    - [KNN Documentation](https://scikit-learn.org/stable/modules/neighbors.html)
	    """)