Spaces:

mgbam
/

CognitiveEDA

Sleeping

File size: 3,310 Bytes

41ceb78

# modules/profiling.py

# -*- coding: utf-8 -*-
#
# PROJECT:      CognitiveEDA v5.7 - The QuantumLeap Intelligence Platform
#
# DESCRIPTION:  A dedicated module for profiling and characterizing customer
#               segments identified through clustering.

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import logging

def _dominant_value(values: pd.Series):
    """Return the most frequent non-null value in *values*, or NaN if none exists."""
    modes = values.mode()
    # Series.mode() ignores nulls by default, so an all-null group yields an
    # empty result; indexing it with .iloc[0] would raise IndexError.
    return modes.iloc[0] if not modes.empty else float("nan")


def _choose_column(preferred: str, fallbacks: list, available):
    """Return *preferred* if it is in *available*, else the first usable fallback, else None."""
    if preferred in available:
        return preferred
    return next((col for col in fallbacks if col in available), None)


def profile_clusters(df: pd.DataFrame, cluster_labels: pd.Series, numeric_cols: list, cat_cols: list) -> tuple:
    """
    Analyzes and profiles clusters to create meaningful business personas.

    The data is grouped by cluster label; the mean (rounded to 2 decimals) is
    computed for every numeric column and the most frequent value (mode) for
    every categorical column. The result is rendered as a markdown report and
    a grouped bar chart.

    Column selection for the persona headline and the chart:
        - 'City' names the persona and colours the chart when it is available;
          otherwise the first usable entry of ``cat_cols`` is used. With no
          categorical column, the headline omits the persona name and the
          chart is not split by colour.
        - 'Total_Revenue' is charted when it is present in ``df``; otherwise
          the first usable entry of ``numeric_cols`` is used. With no numeric
          column an empty figure is returned.

    Args:
        df (pd.DataFrame): The feature-engineered DataFrame.
        cluster_labels (pd.Series): The series of cluster labels. It is assigned
            as a column, so pandas aligns it with ``df`` by index; rows without
            a matching label are left out of the profile.
        numeric_cols (list): List of numeric columns to profile (e.g., ['Total_Revenue']).
        cat_cols (list): List of categorical columns to profile (e.g., ['City', 'Product']).

    Returns:
        A tuple containing:
        - A markdown string with the detailed profile of each cluster.
        - A Plotly Figure visualizing the differences between clusters.
    """
    if df.empty or cluster_labels.empty:
        return "No data to profile.", go.Figure()

    logging.info("Profiling %s clusters...", cluster_labels.nunique())

    profile_df = df.copy()
    profile_df['Cluster'] = cluster_labels
    by_cluster = profile_df.groupby('Cluster')

    # --- Build the per-cluster profile table ---
    frames = []
    if numeric_cols:
        frames.append(by_cluster[numeric_cols].mean().round(2))
    # Most frequent value (mode) of each categorical feature per cluster.
    frames.extend(by_cluster[col].apply(_dominant_value).to_frame() for col in cat_cols)

    if frames:
        full_profile = pd.concat(frames, axis=1)
    else:
        # Nothing to aggregate: keep one (empty) row per cluster so the report
        # still lists every segment.
        cluster_ids = sorted(profile_df['Cluster'].dropna().unique())
        full_profile = pd.DataFrame(index=pd.Index(cluster_ids, name='Cluster'))

    # --- Generate Markdown Report ---
    persona_col = _choose_column('City', cat_cols, full_profile.columns)
    lines = ["### Cluster Persona Analysis\n\n"]

    # Iterate the profile index (sorted by groupby) rather than the raw label
    # column: groupby drops null labels, so every id here has a profile row.
    for cluster_id in full_profile.index:
        if persona_col is not None:
            persona = full_profile.loc[cluster_id, persona_col]
            lines.append(f"#### Cluster {cluster_id}: The '{persona}' Persona\n")
        else:
            lines.append(f"#### Cluster {cluster_id}\n")

        # Numeric Summary
        for col in numeric_cols:
            val = full_profile.loc[cluster_id, col]
            lines.append(f"- **Avg. {col.replace('_', ' ')}:** `{val:,.2f}`\n")

        # Categorical Summary
        for col in cat_cols:
            val = full_profile.loc[cluster_id, col]
            lines.append(f"- **Dominant {col}:** `{val}`\n")
        lines.append("\n")

    report_md = "".join(lines)

    # --- Generate Visualization ---
    # Average of one metric per cluster, split by one categorical feature.
    metric_col = _choose_column('Total_Revenue', numeric_cols, profile_df.columns)
    segment_col = _choose_column('City', cat_cols, profile_df.columns)

    if metric_col is None:
        # No numeric feature to plot.
        return report_md, go.Figure()

    metric_name = metric_col.replace('_', ' ')
    metric_label = f"Average {metric_name}"
    if metric_col == 'Total_Revenue':
        # Only the revenue metric is known to be a currency amount.
        metric_label += " ($)"
    axis_labels = {metric_col: metric_label, 'Cluster': 'Customer Segment'}

    if segment_col is None:
        vis_df = by_cluster[metric_col].mean().reset_index()
        fig = px.bar(
            vis_df,
            x='Cluster',
            y=metric_col,
            title=f'<b>Cluster Profile: Avg. {metric_name}</b>',
            labels=axis_labels
        )
    else:
        vis_df = profile_df.groupby(['Cluster', segment_col])[metric_col].mean().reset_index()
        fig = px.bar(
            vis_df,
            x='Cluster',
            y=metric_col,
            color=segment_col,
            barmode='group',
            title=f'<b>Cluster Profile: Avg. {metric_name} by {segment_col}</b>',
            labels=axis_labels
        )

    return report_md, fig