Spaces:
Running
Running
File size: 9,725 Bytes
e3cbb18 a11780d e3cbb18 56e12df e3cbb18 56e12df a11780d 56e12df 58eb0f9 56e12df e3cbb18 56e12df e3cbb18 56e12df e3cbb18 56e12df a11780d e3cbb18 a11780d e3cbb18 a11780d e3cbb18 a11780d e3cbb18 a11780d e3cbb18 a11780d e3cbb18 a11780d e3cbb18 a11780d 58eb0f9 a11780d 58eb0f9 e3cbb18 58eb0f9 e3cbb18 58eb0f9 e3cbb18 58eb0f9 56e12df 58eb0f9 e3cbb18 a11780d e3cbb18 a11780d e3cbb18 a11780d e3cbb18 a11780d e3cbb18 a11780d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import pandas as pd
from datetime import datetime, timedelta, time
import logging
# Configure logging for this module.
# NOTE: logging.basicConfig() acts on the root logger, so this level/format applies
# process-wide; per the standard library docs it does nothing if the root logger
# already has handlers configured.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(module)s - %(message)s')
def _coerce_filter_bound(value, label):
    """Return a filter bound as a naive, midnight-normalized Timestamp, or None.

    A falsy value (None, "") means "no bound". A value that cannot be parsed
    is also treated as "no bound" (with a warning) instead of raising or
    silently producing an always-false comparison against NaT.
    """
    if not value:
        return None
    bound = pd.to_datetime(value, errors='coerce')
    if pd.isna(bound):
        logging.warning(f"Filter by date: could not parse {label} '{value}'. Ignoring this bound.")
        return None
    # The column is compared as naive UTC, so bring aware bounds to the same basis.
    if bound.tzinfo is not None:
        bound = bound.tz_convert('UTC').tz_localize(None)
    return bound.normalize()


def filter_dataframe_by_date(df, date_column, start_date, end_date):
    """Filter a DataFrame to rows whose date falls within an inclusive date range.

    The input frame is not modified; all work happens on a copy.

    Args:
        df: DataFrame to filter. None or an empty frame yields an empty result.
        date_column: Name of the column holding the dates. Non-datetime
            columns are parsed with ``pd.to_datetime(errors='coerce')``.
        start_date: Inclusive lower bound (anything ``pd.to_datetime`` accepts),
            or a falsy value for no lower bound.
        end_date: Inclusive upper bound, or a falsy value for no upper bound.

    Returns:
        A DataFrame containing the matching rows. In the result, the date
        column is timezone-naive and normalized to midnight, and rows whose
        date is missing or unparseable are dropped. An empty DataFrame is
        returned when the input is unusable, the column is missing, or the
        date column cannot be processed.
    """
    if df is None or df.empty or not date_column:
        logging.warning(f"Filter by date: DataFrame is None, empty, or no date_column provided. DF: {df is not None}, empty: {df.empty if df is not None else 'N/A'}, date_column: {date_column}")
        return pd.DataFrame()

    if date_column not in df.columns:
        logging.warning(f"Filter by date: Date column '{date_column}' not found in DataFrame columns: {df.columns.tolist()}.")
        return pd.DataFrame()

    df_copy = df.copy()  # Work on a copy so the caller's frame is never mutated

    try:
        # Ensure the date column holds pandas datetimes; bad values become NaT.
        if not pd.api.types.is_datetime64_any_dtype(df_copy[date_column]):
            df_copy[date_column] = pd.to_datetime(df_copy[date_column], errors='coerce')

        # Drop rows where conversion failed (NaT) or the date was missing to begin with.
        df_copy.dropna(subset=[date_column], inplace=True)
        if df_copy.empty:
            logging.info(f"Filter by date: DataFrame empty after to_datetime and dropna for column '{date_column}'.")
            return pd.DataFrame()

        dates = df_copy[date_column]

        # Convert timezone-aware values to naive UTC *before* normalizing.
        # Normalizing first would snap to local midnight, which is no longer
        # midnight once shifted to UTC and would break the inclusive
        # comparison against the midnight-normalized bounds below.
        if dates.dt.tz is not None:
            logging.info(f"Column '{date_column}' is timezone-aware ({dates.dt.tz}). Converting to naive (from UTC) for comparison.")
            dates = dates.dt.tz_convert('UTC').dt.tz_localize(None)

        df_copy[date_column] = dates.dt.normalize()
    except Exception as e:
        # Deliberate best-effort: any failure while preparing the column
        # results in an empty frame rather than an exception for the caller.
        logging.error(f"Error processing date column '{date_column}': {e}", exc_info=True)
        return pd.DataFrame()

    start_dt_obj = _coerce_filter_bound(start_date, "start_date")
    end_dt_obj = _coerce_filter_bound(end_date, "end_date")

    # Apply whichever bounds are present; both comparisons are inclusive.
    if start_dt_obj is not None and end_dt_obj is not None:
        df_filtered_final = df_copy[(df_copy[date_column] >= start_dt_obj) & (df_copy[date_column] <= end_dt_obj)]
    elif start_dt_obj is not None:
        df_filtered_final = df_copy[df_copy[date_column] >= start_dt_obj]
    elif end_dt_obj is not None:
        df_filtered_final = df_copy[df_copy[date_column] <= end_dt_obj]
    else:
        df_filtered_final = df_copy  # No date filtering if neither bound is usable

    if df_filtered_final.empty:
        logging.info(f"Filter by date: DataFrame became empty after applying date range to column '{date_column}'.")

    return df_filtered_final
def _state_frame(state, key):
    """Return a private copy of the DataFrame stored under *key*, or an empty frame.

    A missing key, a None value, or a non-DataFrame value all yield an empty
    DataFrame so callers can rely on ``.empty`` / ``.columns`` being available.
    """
    frame = state.get(key)
    if isinstance(frame, pd.DataFrame):
        return frame.copy()
    return pd.DataFrame()


def _filter_or_passthrough(df, date_column, start_dt, end_dt, missing_column_warning):
    """Date-filter *df*, or return it unfiltered (with a warning) if the column is absent."""
    if df.empty:
        return pd.DataFrame()
    if date_column in df.columns:
        return filter_dataframe_by_date(df, date_column, start_dt, end_dt)
    logging.warning(missing_column_warning)
    return df


def prepare_filtered_analytics_data(token_state_value, date_filter_option, custom_start_date, custom_end_date):
    """
    Retrieve the frames held in token_state, resolve the date range, merge posts
    with post stats, and date-filter posts, mentions and follower stats.

    Args:
        token_state_value: Mapping holding the "bubble_*_df" DataFrames and the
            optional "config_date_col_*" column names. None, missing keys and
            None-valued frames are treated as empty data.
        date_filter_option: "Last 7 Days", "Last 30 Days" or "Custom Range".
            Any other value applies no lower bound and ends at today.
        custom_start_date: Start of the custom range; only read for "Custom Range".
            An unparseable value means no lower bound.
        custom_end_date: End of the custom range; only read for "Custom Range".
            An unparseable value falls back to today.

    Returns:
        - filtered_merged_posts_df: Posts merged with stats, filtered by date.
        - filtered_mentions_df: Mentions filtered by date.
        - date_filtered_follower_stats_df: Follower stats filtered by date (for time-series plots).
        - raw_follower_stats_df: Unfiltered follower stats (for demographic plots).
        - start_dt_filter: Determined start date for filtering (None if unbounded).
        - end_dt_filter: Determined end date for filtering.

        When a frame lacks its configured date column it is returned unfiltered.
    """
    logging.info(f"Preparing filtered analytics data. Filter: {date_filter_option}, Custom Start: {custom_start_date}, Custom End: {custom_end_date}")

    state = token_state_value or {}

    posts_df = _state_frame(state, "bubble_posts_df")
    mentions_df = _state_frame(state, "bubble_mentions_df")
    follower_stats_df = _state_frame(state, "bubble_follower_stats_df")
    post_stats_df = _state_frame(state, "bubble_post_stats_df")

    date_column_posts = state.get("config_date_col_posts", "published_at")
    date_column_mentions = state.get("config_date_col_mentions", "date")
    # Assuming follower_stats_df has a 'date' column for time-series data
    date_column_followers = state.get("config_date_col_followers", "date")

    # --- Determine date range for filtering ---
    # NOTE(review): datetime.now() is naive local time; confirm this matches the
    # timezone basis of the data being filtered.
    today_midnight = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    end_dt_filter = today_midnight
    start_dt_filter = None

    # Preset windows are inclusive of today, hence N-1 days of look-back.
    preset_lookback_days = {"Last 7 Days": 6, "Last 30 Days": 29}
    if date_filter_option in preset_lookback_days:
        start_dt_filter = today_midnight - timedelta(days=preset_lookback_days[date_filter_option])
    elif date_filter_option == "Custom Range":
        parsed_start = pd.to_datetime(custom_start_date, errors='coerce')
        if pd.notna(parsed_start):
            start_dt_filter = parsed_start.replace(hour=0, minute=0, second=0, microsecond=0)
        parsed_end = pd.to_datetime(custom_end_date, errors='coerce')
        if pd.notna(parsed_end):
            end_dt_filter = parsed_end.replace(hour=0, minute=0, second=0, microsecond=0)

    logging.info(f"Date range for filtering: Start: {start_dt_filter}, End: {end_dt_filter}")

    # --- Merge posts_df and post_stats_df ---
    merged_posts_df = pd.DataFrame()
    if not posts_df.empty and not post_stats_df.empty:
        # Posts are keyed by 'id', stats by 'post_id'
        if 'id' in posts_df.columns and 'post_id' in post_stats_df.columns:
            merged_posts_df = pd.merge(posts_df, post_stats_df, left_on='id', right_on='post_id', how='left')
            logging.info(f"Merged posts_df ({len(posts_df)} rows) and post_stats_df ({len(post_stats_df)} rows) into merged_posts_df ({len(merged_posts_df)} rows).")
        else:
            logging.warning("Cannot merge posts_df and post_stats_df due to missing 'id' or 'post_id' columns.")
            # Fall back to the bare posts so plots that do not need stats still work
            merged_posts_df = posts_df
    elif not posts_df.empty:
        logging.warning("post_stats_df is empty. Proceeding with posts_df only for plots that don't require stats.")
        merged_posts_df = posts_df
        # Add the stat columns downstream plots expect, filled with missing values
        expected_stat_cols = ['engagement', 'impressionCount', 'clickCount', 'likeCount', 'commentCount', 'shareCount']
        for col in expected_stat_cols:
            if col not in merged_posts_df.columns:
                merged_posts_df[col] = pd.NA

    # --- Filter DataFrames by date ---
    filtered_merged_posts_data = _filter_or_passthrough(
        merged_posts_df, date_column_posts, start_dt_filter, end_dt_filter,
        f"Date column '{date_column_posts}' not found in merged_posts_df. Returning unfiltered merged posts data.",
    )

    filtered_mentions_data = _filter_or_passthrough(
        mentions_df, date_column_mentions, start_dt_filter, end_dt_filter,
        f"Date column '{date_column_mentions}' not found in mentions_df. Returning unfiltered mentions data.",
    )

    # Demographic plots use the raw frame; take the copy before any filtering.
    raw_follower_stats_df = follower_stats_df.copy()
    date_filtered_follower_stats_df = _filter_or_passthrough(
        follower_stats_df, date_column_followers, start_dt_filter, end_dt_filter,
        f"Date column '{date_column_followers}' not found in follower_stats_df. Time-series follower plots might be empty or use unfiltered data.",
    )

    logging.info(f"Processed - Filtered Merged Posts: {len(filtered_merged_posts_data)} rows, Filtered Mentions: {len(filtered_mentions_data)} rows, Date-Filtered Follower Stats: {len(date_filtered_follower_stats_df)} rows.")

    return filtered_merged_posts_data, filtered_mentions_data, date_filtered_follower_stats_df, raw_follower_stats_df, start_dt_filter, end_dt_filter
|