import pandas as pd
from datetime import datetime, timedelta, time
import logging
import numpy as np
# Configure logging for this module
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(module)s - %(message)s')
def filter_dataframe_by_date(df, date_column, start_date, end_date):
"""Filters a DataFrame by a date column within a given date range."""
if df is None or df.empty or not date_column:
logging.warning(f"Filter by date: DataFrame is None, empty, or no date_column provided. DF: {df is not None}, empty: {df.empty if df is not None else 'N/A'}, date_column: {date_column}")
return pd.DataFrame()
if date_column not in df.columns:
logging.warning(f"Filter by date: Date column '{date_column}' not found in DataFrame columns: {df.columns.tolist()}.")
return pd.DataFrame()
df_copy = df.copy() # Work on a copy to avoid SettingWithCopyWarning
try:
# Ensure the date column is pandas datetime objects
if not pd.api.types.is_datetime64_any_dtype(df_copy[date_column]):
df_copy[date_column] = pd.to_datetime(df_copy[date_column], errors='coerce')
# Drop rows where date conversion might have failed (NaT) or was originally NaT
df_copy.dropna(subset=[date_column], inplace=True)
if df_copy.empty:
logging.info(f"Filter by date: DataFrame empty after to_datetime and dropna for column '{date_column}'.")
return pd.DataFrame()
# Normalize to midnight. This preserves timezone information if present.
df_copy[date_column] = df_copy[date_column].dt.normalize()
# If the column is timezone-aware, convert its values to naive UTC equivalent.
# This allows comparison with naive filter dates.
if hasattr(df_copy[date_column].dt, 'tz') and df_copy[date_column].dt.tz is not None:
logging.info(f"Column '{date_column}' is timezone-aware ({df_copy[date_column].dt.tz}). Converting to naive (from UTC) for comparison.")
df_copy[date_column] = df_copy[date_column].dt.tz_convert('UTC').dt.tz_localize(None)
except Exception as e:
logging.error(f"Error processing date column '{date_column}': {e}", exc_info=True)
return pd.DataFrame()
# Convert start_date and end_date (which are naive Python datetime or naive Pandas Timestamp)
# to naive pandas Timestamps and normalize them.
start_dt_obj = pd.to_datetime(start_date, errors='coerce').normalize() if start_date else None
end_dt_obj = pd.to_datetime(end_date, errors='coerce').normalize() if end_date else None
# Perform the filtering on df_copy, which already has NaT rows dropped and dates normalized
if start_dt_obj and end_dt_obj:
df_filtered_final = df_copy[(df_copy[date_column] >= start_dt_obj) & (df_copy[date_column] <= end_dt_obj)]
elif start_dt_obj:
df_filtered_final = df_copy[df_copy[date_column] >= start_dt_obj]
elif end_dt_obj:
df_filtered_final = df_copy[df_copy[date_column] <= end_dt_obj]
else:
df_filtered_final = df_copy # No date filtering if neither start_date nor end_date is provided
if df_filtered_final.empty:
logging.info(f"Filter by date: DataFrame became empty after applying date range to column '{date_column}'.")
return df_filtered_final
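# Illustrative usage (a minimal sketch; the sample DataFrame and its column values are hypothetical):
#   sample_df = pd.DataFrame({"published_at": ["2024-03-01", "2024-03-20"], "id": [1, 2]})
#   recent = filter_dataframe_by_date(sample_df, "published_at",
#                                     start_date=datetime(2024, 3, 10), end_date=datetime(2024, 3, 31))
#   # -> keeps only the 2024-03-20 row; both bounds are inclusive after normalization to midnight.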
def prepare_filtered_analytics_data(token_state_value, date_filter_option, custom_start_date, custom_end_date):
"""
Retrieves data from token_state, determines date range, filters posts, mentions, and follower time-series data.
Merges posts with post stats.
Returns:
- filtered_merged_posts_df: Posts merged with stats, filtered by date.
- filtered_mentions_df: Mentions filtered by date.
- date_filtered_follower_stats_df: Follower stats filtered by date (for time-series plots).
- raw_follower_stats_df: Unfiltered follower stats (for demographic plots).
- start_dt_filter: Determined start date for filtering.
- end_dt_filter: Determined end date for filtering.
"""
logging.info(f"Preparing filtered analytics data. Filter: {date_filter_option}, Custom Start: {custom_start_date}, Custom End: {custom_end_date}")
posts_df = token_state_value.get("bubble_posts_df", pd.DataFrame()).copy()
mentions_df = token_state_value.get("bubble_mentions_df", pd.DataFrame()).copy()
follower_stats_df = token_state_value.get("bubble_follower_stats_df", pd.DataFrame()).copy()
post_stats_df = token_state_value.get("bubble_post_stats_df", pd.DataFrame()).copy() # Fetch post_stats_df
date_column_posts = token_state_value.get("config_date_col_posts", "published_at")
date_column_mentions = token_state_value.get("config_date_col_mentions", "date")
# Assuming follower_stats_df has a 'date' column for time-series data
date_column_followers = token_state_value.get("config_date_col_followers", "date")
# Determine date range for filtering
current_datetime_obj = datetime.now()
current_time_normalized = current_datetime_obj.replace(hour=0, minute=0, second=0, microsecond=0)
end_dt_filter = current_time_normalized
start_dt_filter = None
if date_filter_option == "Last 7 Days":
start_dt_filter = current_time_normalized - timedelta(days=6)
elif date_filter_option == "Last 30 Days":
start_dt_filter = current_time_normalized - timedelta(days=29)
elif date_filter_option == "Custom Range":
start_dt_filter_temp = pd.to_datetime(custom_start_date, errors='coerce')
start_dt_filter = start_dt_filter_temp.replace(hour=0, minute=0, second=0, microsecond=0) if pd.notna(start_dt_filter_temp) else None
end_dt_filter_temp = pd.to_datetime(custom_end_date, errors='coerce')
end_dt_filter = end_dt_filter_temp.replace(hour=0, minute=0, second=0, microsecond=0) if pd.notna(end_dt_filter_temp) else current_time_normalized
logging.info(f"Date range for filtering: Start: {start_dt_filter}, End: {end_dt_filter}")
# Merge posts_df and post_stats_df
merged_posts_df = pd.DataFrame()
if not posts_df.empty and not post_stats_df.empty:
# Assuming posts_df has 'id' and post_stats_df has 'post_id' for merging
if 'id' in posts_df.columns and 'post_id' in post_stats_df.columns:
merged_posts_df = pd.merge(posts_df, post_stats_df, left_on='id', right_on='post_id', how='left')
logging.info(f"Merged posts_df ({len(posts_df)} rows) and post_stats_df ({len(post_stats_df)} rows) into merged_posts_df ({len(merged_posts_df)} rows).")
else:
logging.warning("Cannot merge posts_df and post_stats_df due to missing 'id' or 'post_id' columns.")
# Fall back to posts_df so that date-based plots still work; stats-dependent columns will be missing
merged_posts_df = posts_df # Or handle as an error / empty DF for those plots
elif not posts_df.empty:
logging.warning("post_stats_df is empty. Proceeding with posts_df only for plots that don't require stats.")
merged_posts_df = posts_df # Create necessary columns with NaN if they are expected by plots
# For columns expected from post_stats_df, add them with NaNs if not present
expected_stat_cols = ['engagement', 'impressionCount', 'clickCount', 'likeCount', 'commentCount', 'shareCount']
for col in expected_stat_cols:
if col not in merged_posts_df.columns:
merged_posts_df[col] = pd.NA
# Filter DataFrames by date
filtered_merged_posts_data = pd.DataFrame()
if not merged_posts_df.empty and date_column_posts in merged_posts_df.columns:
filtered_merged_posts_data = filter_dataframe_by_date(merged_posts_df, date_column_posts, start_dt_filter, end_dt_filter)
elif not merged_posts_df.empty:
logging.warning(f"Date column '{date_column_posts}' not found in merged_posts_df. Returning unfiltered merged posts data.")
filtered_merged_posts_data = merged_posts_df # Or apply other logic
filtered_mentions_data = pd.DataFrame()
if not mentions_df.empty and date_column_mentions in mentions_df.columns:
filtered_mentions_data = filter_dataframe_by_date(mentions_df, date_column_mentions, start_dt_filter, end_dt_filter)
elif not mentions_df.empty:
logging.warning(f"Date column '{date_column_mentions}' not found in mentions_df. Returning unfiltered mentions data.")
filtered_mentions_data = mentions_df
date_filtered_follower_stats_df = pd.DataFrame()
raw_follower_stats_df = follower_stats_df.copy() # For demographic plots, use raw (or latest snapshot logic)
if not follower_stats_df.empty and date_column_followers in follower_stats_df.columns:
date_filtered_follower_stats_df = filter_dataframe_by_date(follower_stats_df, date_column_followers, start_dt_filter, end_dt_filter)
elif not follower_stats_df.empty:
logging.warning(f"Date column '{date_column_followers}' not found in follower_stats_df. Time-series follower plots might be empty or use unfiltered data.")
# Decide if date_filtered_follower_stats_df should be raw_follower_stats_df or empty
date_filtered_follower_stats_df = follower_stats_df # Or pd.DataFrame() if strict filtering is required
logging.info(f"Processed - Filtered Merged Posts: {len(filtered_merged_posts_data)} rows, Filtered Mentions: {len(filtered_mentions_data)} rows, Date-Filtered Follower Stats: {len(date_filtered_follower_stats_df)} rows.")
return filtered_merged_posts_data, filtered_mentions_data, date_filtered_follower_stats_df, raw_follower_stats_df, start_dt_filter, end_dt_filter
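# Illustrative call (a minimal sketch; the token_state keys are the ones read above, and the DataFrames are
# assumed to come from the wider app):
#   token_state = {
#       "bubble_posts_df": posts_df,            # needs an 'id' column and the configured date column
#       "bubble_post_stats_df": post_stats_df,  # needs a 'post_id' column to merge onto posts
#       "bubble_mentions_df": mentions_df,
#       "bubble_follower_stats_df": follower_stats_df,
#       "config_date_col_posts": "published_at",
#       "config_date_col_mentions": "date",
#       "config_date_col_followers": "date",
#   }
#   (posts, mentions, follower_ts, follower_raw,
#    start_dt, end_dt) = prepare_filtered_analytics_data(token_state, "Last 30 Days", None, None)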
# --- Helper function to generate textual data summaries for chatbot ---
def generate_chatbot_data_summaries(
plot_configs_list,
filtered_merged_posts_df,
filtered_mentions_df,
date_filtered_follower_stats_df, # Expected to contain 'follower_gains_monthly'
raw_follower_stats_df, # Expected to contain other demographics like 'follower_geo', 'follower_industry'
token_state_value
):
"""
Generates a textual data summary for each plot ID, for use by the chatbot,
based on the expected DataFrame structures and follower count columns.
"""
data_summaries = {}
# --- Date and Config Columns from token_state ---
# For Posts
date_col_posts = token_state_value.get("config_date_col_posts", "published_at")
media_type_col_name = token_state_value.get("config_media_type_col", "media_type")
eb_labels_col_name = token_state_value.get("config_eb_labels_col", "li_eb_label")
# For Mentions
date_col_mentions = token_state_value.get("config_date_col_mentions", "date")
mentions_sentiment_col = "sentiment_label" # Sentiment label column in the mentions DataFrame
# For Follower Stats - organic and paid follower count column names
follower_count_organic_col = "follower_count_organic"
follower_count_paid_col = "follower_count_paid"
# For Follower Stats (Demographics from raw_follower_stats_df)
follower_demographics_type_col = "follower_count_type" # Column indicating 'follower_geo', 'follower_industry'
follower_demographics_category_col = "category_name" # Column indicating 'USA', 'Technology'
# For Follower Gains/Growth (from date_filtered_follower_stats_df)
follower_gains_type_col = "follower_count_type" # Rows of interest have the value 'follower_gains_monthly'
follower_gains_date_col = "category_name" # Holds the month as a 'YYYY-MM-DD' string
# --- Helper: Safely convert to datetime ---
def safe_to_datetime(series, errors='coerce'):
return pd.to_datetime(series, errors=errors)
# --- Prepare DataFrames (copy and convert dates) ---
if filtered_merged_posts_df is not None and not filtered_merged_posts_df.empty:
posts_df = filtered_merged_posts_df.copy()
if date_col_posts in posts_df.columns:
posts_df[date_col_posts] = safe_to_datetime(posts_df[date_col_posts])
else:
logging.warning(f"Date column '{date_col_posts}' not found in posts_df for chatbot summary.")
else:
posts_df = pd.DataFrame()
if filtered_mentions_df is not None and not filtered_mentions_df.empty:
mentions_df = filtered_mentions_df.copy()
if date_col_mentions in mentions_df.columns:
mentions_df[date_col_mentions] = safe_to_datetime(mentions_df[date_col_mentions])
else:
logging.warning(f"Date column '{date_col_mentions}' not found in mentions_df for chatbot summary.")
else:
mentions_df = pd.DataFrame()
# For date_filtered_follower_stats_df (monthly gains)
if date_filtered_follower_stats_df is not None and not date_filtered_follower_stats_df.empty:
follower_monthly_df = date_filtered_follower_stats_df.copy()
if follower_gains_type_col in follower_monthly_df.columns:
follower_monthly_df = follower_monthly_df[follower_monthly_df[follower_gains_type_col] == 'follower_gains_monthly'].copy()
if follower_gains_date_col in follower_monthly_df.columns:
follower_monthly_df['datetime_obj'] = safe_to_datetime(follower_monthly_df[follower_gains_date_col])
follower_monthly_df = follower_monthly_df.dropna(subset=['datetime_obj'])
# Calculate total gains
if follower_count_organic_col in follower_monthly_df.columns and follower_count_paid_col in follower_monthly_df.columns:
follower_monthly_df[follower_count_organic_col] = pd.to_numeric(follower_monthly_df[follower_count_organic_col], errors='coerce').fillna(0)
follower_monthly_df[follower_count_paid_col] = pd.to_numeric(follower_monthly_df[follower_count_paid_col], errors='coerce').fillna(0)
follower_monthly_df['total_monthly_gains'] = follower_monthly_df[follower_count_organic_col] + follower_monthly_df[follower_count_paid_col]
elif follower_count_organic_col in follower_monthly_df.columns: # Only organic exists
follower_monthly_df[follower_count_organic_col] = pd.to_numeric(follower_monthly_df[follower_count_organic_col], errors='coerce').fillna(0)
follower_monthly_df['total_monthly_gains'] = follower_monthly_df[follower_count_organic_col]
elif follower_count_paid_col in follower_monthly_df.columns: # Only paid exists
follower_monthly_df[follower_count_paid_col] = pd.to_numeric(follower_monthly_df[follower_count_paid_col], errors='coerce').fillna(0)
follower_monthly_df['total_monthly_gains'] = follower_monthly_df[follower_count_paid_col]
else:
logging.warning(f"Neither '{follower_count_organic_col}' nor '{follower_count_paid_col}' found in follower_monthly_df for total gains calculation.")
follower_monthly_df['total_monthly_gains'] = 0 # Avoid KeyError later
else:
logging.warning(f"Date column '{follower_gains_date_col}' (from category_name) not found in follower_monthly_df for chatbot summary.")
if 'datetime_obj' not in follower_monthly_df.columns:
follower_monthly_df['datetime_obj'] = pd.NaT
if 'total_monthly_gains' not in follower_monthly_df.columns:
follower_monthly_df['total_monthly_gains'] = 0
else:
follower_monthly_df = pd.DataFrame(columns=[follower_gains_date_col, 'total_monthly_gains', 'datetime_obj'])
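# Expected row shape for the monthly-gains slice (values shown are hypothetical):
#   follower_count_type='follower_gains_monthly', category_name='2024-04-01',
#   follower_count_organic=120, follower_count_paid=30 -> total_monthly_gains=150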
if raw_follower_stats_df is not None and not raw_follower_stats_df.empty:
follower_demographics_df = raw_follower_stats_df.copy()
# Calculate total followers for demographics
if follower_count_organic_col in follower_demographics_df.columns and follower_count_paid_col in follower_demographics_df.columns:
follower_demographics_df[follower_count_organic_col] = pd.to_numeric(follower_demographics_df[follower_count_organic_col], errors='coerce').fillna(0)
follower_demographics_df[follower_count_paid_col] = pd.to_numeric(follower_demographics_df[follower_count_paid_col], errors='coerce').fillna(0)
follower_demographics_df['total_follower_count'] = follower_demographics_df[follower_count_organic_col] + follower_demographics_df[follower_count_paid_col]
elif follower_count_organic_col in follower_demographics_df.columns:
follower_demographics_df[follower_count_organic_col] = pd.to_numeric(follower_demographics_df[follower_count_organic_col], errors='coerce').fillna(0)
follower_demographics_df['total_follower_count'] = follower_demographics_df[follower_count_organic_col]
elif follower_count_paid_col in follower_demographics_df.columns:
follower_demographics_df[follower_count_paid_col] = pd.to_numeric(follower_demographics_df[follower_count_paid_col], errors='coerce').fillna(0)
follower_demographics_df['total_follower_count'] = follower_demographics_df[follower_count_paid_col]
else:
logging.warning(f"Neither '{follower_count_organic_col}' nor '{follower_count_paid_col}' found in follower_demographics_df for total count calculation.")
if 'total_follower_count' not in follower_demographics_df.columns:
follower_demographics_df['total_follower_count'] = 0
else:
follower_demographics_df = pd.DataFrame()
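# Expected row shape for the demographic slices (values shown are hypothetical):
#   follower_count_type='follower_geo',      category_name='USA',        organic + paid -> total_follower_count
#   follower_count_type='follower_industry', category_name='Technology', organic + paid -> total_follower_count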
for plot_cfg in plot_configs_list:
plot_id = plot_cfg["id"]
plot_label = plot_cfg["label"]
summary_text = f"No specific data summary available for '{plot_label}' for the selected period."
try:
# --- FOLLOWER STATS ---
if plot_id == "followers_count": # Uses follower_monthly_df
if not follower_monthly_df.empty and 'total_monthly_gains' in follower_monthly_df.columns and 'datetime_obj' in follower_monthly_df.columns and not follower_monthly_df['datetime_obj'].isnull().all():
df_summary = follower_monthly_df[['datetime_obj', 'total_monthly_gains']].copy()
df_summary['datetime_obj'] = df_summary['datetime_obj'].dt.strftime('%Y-%m-%d')
df_summary.rename(columns={'datetime_obj': 'Date', 'total_monthly_gains': 'Total Monthly Gains'}, inplace=True)
summary_text = f"Follower Count (Total Monthly Gains):\n{df_summary.sort_values(by='Date').tail(5).to_string(index=False)}"
else:
summary_text = f"Follower count data (total monthly gains) is unavailable or incomplete for '{plot_label}'."
elif plot_id == "followers_growth_rate": # Uses follower_monthly_df
if not follower_monthly_df.empty and 'total_monthly_gains' in follower_monthly_df.columns and 'datetime_obj' in follower_monthly_df.columns and not follower_monthly_df['datetime_obj'].isnull().all():
df_calc = follower_monthly_df.sort_values(by='datetime_obj').copy()
# 'total_monthly_gains' is treated as the month-over-month *change* in followers, not a cumulative count,
# so pct_change() below yields the rate of change of gains. This mirrors the plot logic, which applies
# pct_change directly to 'follower_gains_monthly'; a true follower growth rate would need a cumulative count.
df_calc['total_monthly_gains'] = pd.to_numeric(df_calc['total_monthly_gains'], errors='coerce')
if len(df_calc) >= 2:
# For consistency with the plot, apply pct_change directly to the monthly gains column.
df_calc['growth_rate_monthly'] = df_calc['total_monthly_gains'].pct_change() * 100
df_calc['growth_rate_monthly'] = df_calc['growth_rate_monthly'].round(2)
df_calc.replace([np.inf, -np.inf], np.nan, inplace=True) # Handle division by zero if a gain was 0
df_summary = df_calc[['datetime_obj', 'growth_rate_monthly']].dropna().copy()
df_summary['datetime_obj'] = df_summary['datetime_obj'].dt.strftime('%Y-%m-%d')
df_summary.rename(columns={'datetime_obj': 'Date', 'growth_rate_monthly': 'Growth Rate (%)'}, inplace=True)
if not df_summary.empty:
summary_text = f"Follower Growth Rate (Monthly % based on Total Follower Count/Gains):\n{df_summary.sort_values(by='Date').tail(5).to_string(index=False)}"
else:
summary_text = f"Not enough data points or valid transitions to calculate follower growth rate for '{plot_label}'."
else:
summary_text = f"Not enough data points (need at least 2) to calculate follower growth rate for '{plot_label}'."
else:
summary_text = f"Follower growth rate data (total monthly gains) is unavailable or incomplete for '{plot_label}'."
elif plot_id in ["followers_by_location", "followers_by_role", "followers_by_industry", "followers_by_seniority"]:
demographic_type_map = {
"followers_by_location": "follower_geo",
"followers_by_role": "follower_function",
"followers_by_industry": "follower_industry",
"followers_by_seniority": "follower_seniority"
}
current_demographic_type = demographic_type_map.get(plot_id)
if not follower_demographics_df.empty and \
follower_demographics_type_col in follower_demographics_df.columns and \
follower_demographics_category_col in follower_demographics_df.columns and \
'total_follower_count' in follower_demographics_df.columns: # Check for the calculated total
df_filtered_demographics = follower_demographics_df[
follower_demographics_df[follower_demographics_type_col] == current_demographic_type
].copy()
if not df_filtered_demographics.empty:
df_summary = df_filtered_demographics.groupby(follower_demographics_category_col)['total_follower_count'].sum().reset_index()
df_summary.rename(columns={follower_demographics_category_col: 'Category', 'total_follower_count': 'Total Follower Count'}, inplace=True)
top_5 = df_summary.nlargest(5, 'Total Follower Count')
summary_text = f"Top 5 {plot_label} (Total Followers):\n{top_5.to_string(index=False)}"
else:
summary_text = f"No data available for demographic type '{current_demographic_type}' in '{plot_label}'."
else:
summary_text = f"Follower demographic data columns (including total_follower_count) are missing or incomplete for '{plot_label}'."
# --- POSTS STATS ---
elif plot_id == "engagement_rate":
if not posts_df.empty and 'engagement' in posts_df.columns and date_col_posts in posts_df.columns and not posts_df[date_col_posts].isnull().all():
df_resampled = posts_df.set_index(date_col_posts)['engagement'].resample('W').mean().reset_index()
df_resampled['engagement'] = pd.to_numeric(df_resampled['engagement'], errors='coerce').round(2)
df_summary = df_resampled[[date_col_posts, 'engagement']].dropna().copy()
df_summary[date_col_posts] = df_summary[date_col_posts].dt.strftime('%Y-%m-%d')
summary_text = f"Engagement Rate Over Time (Weekly Avg %):\n{df_summary.sort_values(by=date_col_posts).tail(5).to_string(index=False)}"
else:
summary_text = f"Engagement rate data is unavailable for '{plot_label}'."
elif plot_id == "reach_over_time":
if not posts_df.empty and 'reach' in posts_df.columns and date_col_posts in posts_df.columns and not posts_df[date_col_posts].isnull().all():
df_resampled = posts_df.set_index(date_col_posts)['reach'].resample('W').sum().reset_index()
df_resampled['reach'] = pd.to_numeric(df_resampled['reach'], errors='coerce')
df_summary = df_resampled[[date_col_posts, 'reach']].dropna().copy()
df_summary[date_col_posts] = df_summary[date_col_posts].dt.strftime('%Y-%m-%d')
summary_text = f"Reach Over Time (Weekly Sum):\n{df_summary.sort_values(by=date_col_posts).tail(5).to_string(index=False)}"
else:
summary_text = f"Reach data is unavailable for '{plot_label}'."
elif plot_id == "impressions_over_time":
if not posts_df.empty and 'impressionCount' in posts_df.columns and date_col_posts in posts_df.columns and not posts_df[date_col_posts].isnull().all():
df_resampled = posts_df.set_index(date_col_posts)['impressionCount'].resample('W').sum().reset_index()
df_resampled['impressionCount'] = pd.to_numeric(df_resampled['impressionCount'], errors='coerce')
df_summary = df_resampled[[date_col_posts, 'impressionCount']].dropna().copy()
df_summary[date_col_posts] = df_summary[date_col_posts].dt.strftime('%Y-%m-%d')
df_summary.rename(columns={'impressionCount': 'Impressions'}, inplace=True)
summary_text = f"Impressions Over Time (Weekly Sum):\n{df_summary.sort_values(by=date_col_posts).tail(5).to_string(index=False)}"
else:
summary_text = f"Impressions data is unavailable for '{plot_label}'."
elif plot_id == "likes_over_time":
if not posts_df.empty and 'likeCount' in posts_df.columns and date_col_posts in posts_df.columns and not posts_df[date_col_posts].isnull().all():
df_resampled = posts_df.set_index(date_col_posts)['likeCount'].resample('W').sum().reset_index()
df_resampled['likeCount'] = pd.to_numeric(df_resampled['likeCount'], errors='coerce')
df_summary = df_resampled[[date_col_posts, 'likeCount']].dropna().copy()
df_summary[date_col_posts] = df_summary[date_col_posts].dt.strftime('%Y-%m-%d')
df_summary.rename(columns={'likeCount': 'Likes'}, inplace=True)
summary_text = f"Likes Over Time (Weekly Sum):\n{df_summary.sort_values(by=date_col_posts).tail(5).to_string(index=False)}"
else:
summary_text = f"Likes data is unavailable for '{plot_label}'."
elif plot_id == "clicks_over_time":
if not posts_df.empty and 'clickCount' in posts_df.columns and date_col_posts in posts_df.columns and not posts_df[date_col_posts].isnull().all():
df_resampled = posts_df.set_index(date_col_posts)['clickCount'].resample('W').sum().reset_index()
df_resampled['clickCount'] = pd.to_numeric(df_resampled['clickCount'], errors='coerce')
df_summary = df_resampled[[date_col_posts, 'clickCount']].dropna().copy()
df_summary[date_col_posts] = df_summary[date_col_posts].dt.strftime('%Y-%m-%d')
df_summary.rename(columns={'clickCount': 'Clicks'}, inplace=True)
summary_text = f"Clicks Over Time (Weekly Sum):\n{df_summary.sort_values(by=date_col_posts).tail(5).to_string(index=False)}"
else:
summary_text = f"Clicks data is unavailable for '{plot_label}'."
elif plot_id == "shares_over_time":
if not posts_df.empty and 'shareCount' in posts_df.columns and date_col_posts in posts_df.columns and not posts_df[date_col_posts].isnull().all():
df_resampled = posts_df.set_index(date_col_posts)['shareCount'].resample('W').sum().reset_index()
df_resampled['shareCount'] = pd.to_numeric(df_resampled['shareCount'], errors='coerce')
df_summary = df_resampled[[date_col_posts, 'shareCount']].dropna().copy()
df_summary[date_col_posts] = df_summary[date_col_posts].dt.strftime('%Y-%m-%d')
df_summary.rename(columns={'shareCount': 'Shares'}, inplace=True)
summary_text = f"Shares Over Time (Weekly Sum):\n{df_summary.sort_values(by=date_col_posts).tail(5).to_string(index=False)}"
elif not posts_df.empty and 'shareCount' not in posts_df.columns: # posts_df has rows but the share column is missing
summary_text = f"Shares data column ('shareCount') not found for '{plot_label}'."
else:
summary_text = f"Shares data is unavailable for '{plot_label}'."
elif plot_id == "comments_over_time":
if not posts_df.empty and 'commentCount' in posts_df.columns and date_col_posts in posts_df.columns and not posts_df[date_col_posts].isnull().all():
df_resampled = posts_df.set_index(date_col_posts)['commentCount'].resample('W').sum().reset_index()
df_resampled['commentCount'] = pd.to_numeric(df_resampled['commentCount'], errors='coerce')
df_summary = df_resampled[[date_col_posts, 'commentCount']].dropna().copy()
df_summary[date_col_posts] = df_summary[date_col_posts].dt.strftime('%Y-%m-%d')
df_summary.rename(columns={'commentCount': 'Comments'}, inplace=True)
summary_text = f"Comments Over Time (Weekly Sum):\n{df_summary.sort_values(by=date_col_posts).tail(5).to_string(index=False)}"
else:
summary_text = f"Comments data is unavailable for '{plot_label}'."
elif plot_id == "comments_sentiment":
comment_sentiment_col_posts = "sentiment"
if not posts_df.empty and comment_sentiment_col_posts in posts_df.columns:
sentiment_counts = posts_df[comment_sentiment_col_posts].value_counts().reset_index()
sentiment_counts.columns = ['Sentiment', 'Count']
summary_text = f"Comments Sentiment Breakdown (Posts Data):\n{sentiment_counts.to_string(index=False)}"
else:
summary_text = f"Comment sentiment data ('{comment_sentiment_col_posts}') is unavailable for '{plot_label}'."
elif plot_id == "post_frequency_cs":
if not posts_df.empty and date_col_posts in posts_df.columns and not posts_df[date_col_posts].isnull().all():
post_counts_weekly = posts_df.set_index(date_col_posts).resample('W').size().reset_index(name='post_count')
post_counts_weekly.rename(columns={date_col_posts: 'Week', 'post_count': 'Posts'}, inplace=True)
post_counts_weekly['Week'] = post_counts_weekly['Week'].dt.strftime('%Y-%m-%d (Week of)')
summary_text = f"Post Frequency (Weekly):\n{post_counts_weekly.sort_values(by='Week').tail(5).to_string(index=False)}"
else:
summary_text = f"Post frequency data is unavailable for '{plot_label}'."
elif plot_id == "content_format_breakdown_cs":
if not posts_df.empty and media_type_col_name in posts_df.columns:
format_counts = posts_df[media_type_col_name].value_counts().reset_index()
format_counts.columns = ['Format', 'Count']
summary_text = f"Content Format Breakdown:\n{format_counts.nlargest(5, 'Count').to_string(index=False)}"
else:
summary_text = f"Content format data ('{media_type_col_name}') is unavailable for '{plot_label}'."
elif plot_id == "content_topic_breakdown_cs":
if not posts_df.empty and eb_labels_col_name in posts_df.columns:
try:
# Ensure the column is not all NaN before trying to check for lists or explode
if posts_df[eb_labels_col_name].notna().any():
if posts_df[eb_labels_col_name].apply(lambda x: isinstance(x, list)).any():
topic_counts = posts_df.explode(eb_labels_col_name)[eb_labels_col_name].value_counts().reset_index()
else:
topic_counts = posts_df[eb_labels_col_name].value_counts().reset_index()
topic_counts.columns = ['Topic', 'Count']
summary_text = f"Content Topic Breakdown (Top 5):\n{topic_counts.nlargest(5, 'Count').to_string(index=False)}"
else:
summary_text = f"Content topic data ('{eb_labels_col_name}') contains no valid topics for '{plot_label}'."
except Exception as e_topic:
logging.warning(f"Could not process topic breakdown for '{eb_labels_col_name}': {e_topic}")
summary_text = f"Content topic data ('{eb_labels_col_name}') could not be processed for '{plot_label}'."
else:
summary_text = f"Content topic data ('{eb_labels_col_name}') is unavailable for '{plot_label}'."
# --- MENTIONS STATS ---
elif plot_id == "mention_analysis_volume":
if not mentions_df.empty and date_col_mentions in mentions_df.columns and not mentions_df[date_col_mentions].isnull().all():
mentions_over_time = mentions_df.set_index(date_col_mentions).resample('W').size().reset_index(name='mention_count')
mentions_over_time.rename(columns={date_col_mentions: 'Week', 'mention_count': 'Mentions'}, inplace=True)
mentions_over_time['Week'] = mentions_over_time['Week'].dt.strftime('%Y-%m-%d (Week of)')
if not mentions_over_time.empty:
summary_text = f"Mentions Volume (Weekly):\n{mentions_over_time.sort_values(by='Week').tail(5).to_string(index=False)}"
else:
summary_text = f"No mention activity found for '{plot_label}' in the selected period."
else:
summary_text = f"Mentions volume data is unavailable for '{plot_label}'."
elif plot_id == "mention_analysis_sentiment":
if not mentions_df.empty and mentions_sentiment_col in mentions_df.columns:
sentiment_counts = mentions_df[mentions_sentiment_col].value_counts().reset_index()
sentiment_counts.columns = ['Sentiment', 'Count']
summary_text = f"Mentions Sentiment Breakdown:\n{sentiment_counts.to_string(index=False)}"
else:
summary_text = f"Mention sentiment data ('{mentions_sentiment_col}') is unavailable for '{plot_label}'."
data_summaries[plot_id] = summary_text
except KeyError as e:
logging.warning(f"KeyError generating summary for {plot_id} ('{plot_label}'): {e}. Using default summary.")
data_summaries[plot_id] = f"Data summary generation error for '{plot_label}' (missing column: {e})."
except Exception as e:
logging.error(f"Error generating summary for {plot_id} ('{plot_label}'): {e}", exc_info=True)
data_summaries[plot_id] = f"Error generating data summary for '{plot_label}'."
return data_summaries
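# Illustrative usage (a minimal sketch; plot_configs entries need at least 'id' and 'label', and the ids shown
# are among those handled above):
#   plot_configs = [
#       {"id": "followers_count", "label": "Followers Count"},
#       {"id": "engagement_rate", "label": "Engagement Rate"},
#   ]
#   posts, mentions, follower_ts, follower_raw, start_dt, end_dt = prepare_filtered_analytics_data(
#       token_state, "Last 30 Days", None, None)
#   summaries = generate_chatbot_data_summaries(
#       plot_configs, posts, mentions, follower_ts, follower_raw, token_state)
#   # summaries["followers_count"] -> short text table of the last five monthly gain rows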