Spaces:
Sleeping
Sleeping
File size: 3,310 Bytes
41ceb78 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
# modules/profiling.py
# -*- coding: utf-8 -*-
#
# PROJECT: CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
#
# DESCRIPTION: A dedicated module for profiling and characterizing customer
# segments identified through clustering.
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import logging
def _most_frequent(values: pd.Series):
    """Return the most frequent non-null value of *values*, or NaN if there is none."""
    modes = values.mode()
    # Series.mode() drops nulls by default, so an all-null group yields an
    # empty result; indexing it with .iloc[0] would raise IndexError.
    return modes.iloc[0] if not modes.empty else float("nan")


def _display_value(value):
    """Return *value* unchanged, or the placeholder 'N/A' when it is null."""
    return "N/A" if pd.isna(value) else value


def profile_clusters(df: pd.DataFrame, cluster_labels: pd.Series, numeric_cols: list, cat_cols: list) -> tuple:
    """
    Profile clusters and summarize them as a markdown report plus a bar chart.

    The data is grouped by cluster label. For every cluster the mean of each
    numeric column (rounded to 2 decimals) and the most frequent value of each
    categorical column are reported.

    Args:
        df (pd.DataFrame): The data to profile. It is copied, not modified.
        cluster_labels (pd.Series): Cluster label per row. It is assigned as a
            new 'Cluster' column, so pandas aligns it to ``df`` by index; rows
            without a matching label are left out of the profile.
        numeric_cols (list): Numeric columns to average (e.g., ['Total_Revenue']).
            May be empty or None.
        cat_cols (list): Categorical columns whose dominant value is reported
            (e.g., ['City', 'Product']). May be empty or None.

    Returns:
        A tuple containing:
        - A markdown string with the profile of each cluster. Each cluster
          heading is named after its dominant 'City' when that column is
          profiled, otherwise after the first categorical column; with no
          categorical column the heading carries only the cluster id.
        - A Plotly Figure of the per-cluster average of 'Total_Revenue'
          (or, if absent, the first numeric column) split by 'City' (or, if
          absent, the first categorical column). An empty Figure is returned
          when there is no column to plot.

    Raises:
        KeyError: If a requested column does not exist in ``df``.
    """
    if df.empty or cluster_labels.empty:
        return "No data to profile.", go.Figure()

    # Treat None like "no columns" and take copies so the caller's lists are untouched.
    numeric_cols = list(numeric_cols or [])
    cat_cols = list(cat_cols or [])

    logging.info("Profiling %s clusters...", cluster_labels.nunique())

    profile_df = df.copy()
    profile_df['Cluster'] = cluster_labels

    # Fail early with one readable message instead of a pandas-internal KeyError.
    missing = [col for col in numeric_cols + cat_cols if col not in profile_df.columns]
    if missing:
        raise KeyError(f"Columns not found in DataFrame: {missing}")

    grouped = profile_df.groupby('Cluster')

    # --- Build the per-cluster profile table ---
    profile_frames = []
    if numeric_cols:
        profile_frames.append(grouped[numeric_cols].mean().round(2))
    for col in cat_cols:
        profile_frames.append(grouped[col].apply(_most_frequent).to_frame())
    # pd.concat rejects an empty list, so fall back to an empty frame.
    full_profile = pd.concat(profile_frames, axis=1) if profile_frames else pd.DataFrame()

    # Column that names each persona: 'City' when profiled (original behavior),
    # otherwise the first categorical column, otherwise none.
    if 'City' in full_profile.columns:
        persona_col = 'City'
    elif cat_cols:
        persona_col = cat_cols[0]
    else:
        persona_col = None

    # --- Generate Markdown Report ---
    report_parts = ["### Cluster Persona Analysis\n\n"]
    # NaN labels (rows that did not align with cluster_labels) are not groups,
    # so they must not be looked up in the profile table.
    for cluster_id in sorted(profile_df['Cluster'].dropna().unique()):
        if persona_col is None:
            report_parts.append(f"#### Cluster {cluster_id}\n")
        else:
            persona = _display_value(full_profile.loc[cluster_id, persona_col])
            report_parts.append(f"#### Cluster {cluster_id}: The '{persona}' Persona\n")

        # Numeric Summary
        for col in numeric_cols:
            val = full_profile.loc[cluster_id, col]
            report_parts.append(f"- **Avg. {col.replace('_', ' ')}:** `{val:,.2f}`\n")

        # Categorical Summary
        for col in cat_cols:
            val = _display_value(full_profile.loc[cluster_id, col])
            report_parts.append(f"- **Dominant {col}:** `{val}`\n")

        report_parts.append("\n")
    report_md = "".join(report_parts)

    # --- Generate Visualization ---
    # Prefer the original 'Total_Revenue' by 'City' view when those columns
    # exist; otherwise fall back to the first requested column of each kind.
    if 'Total_Revenue' in profile_df.columns:
        value_col = 'Total_Revenue'
    else:
        value_col = numeric_cols[0] if numeric_cols else None
    if 'City' in profile_df.columns:
        group_col = 'City'
    else:
        group_col = cat_cols[0] if cat_cols else None

    if value_col is None:
        # Nothing numeric to plot.
        return report_md, go.Figure()

    group_keys = ['Cluster'] if group_col is None else ['Cluster', group_col]
    vis_df = profile_df.groupby(group_keys)[value_col].mean().reset_index()

    pretty_value = value_col.replace('_', ' ')
    if group_col is None:
        title = f"<b>Cluster Profile: Avg. {pretty_value}</b>"
    else:
        title = f"<b>Cluster Profile: Avg. {pretty_value} by {group_col}</b>"
    # Only the revenue column is known to be a currency amount.
    if value_col == 'Total_Revenue':
        value_label = 'Average Total Revenue ($)'
    else:
        value_label = f"Average {pretty_value}"

    fig = px.bar(
        vis_df,
        x='Cluster',
        y=value_col,
        color=group_col,  # None simply disables the color split
        barmode='group',
        title=title,
        labels={value_col: value_label, 'Cluster': 'Customer Segment'},
    )
    return report_md, fig