Spaces:
Running
Running
File size: 6,498 Bytes
e3cbb18 58eb0f9 e3cbb18 56e12df e3cbb18 56e12df 58eb0f9 56e12df e3cbb18 56e12df e3cbb18 56e12df e3cbb18 56e12df e3cbb18 56e12df e3cbb18 58eb0f9 56e12df 58eb0f9 e3cbb18 58eb0f9 e3cbb18 58eb0f9 e3cbb18 58eb0f9 56e12df 58eb0f9 56e12df 58eb0f9 e3cbb18 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
import pandas as pd
from datetime import datetime, timedelta, time # Added time for min.time
import logging
# Configure logging at import time: INFO level, with timestamp, level, module name and message.
# NOTE: logging.basicConfig configures the *root* logger, and is a no-op when the root logger
# already has handlers (e.g. if the host application configured logging first).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(module)s - %(message)s')
def _coerce_date_bound(value, label):
    """Turn one filter bound into a naive, midnight-normalized pandas Timestamp.

    Returns None (meaning "no bound on this side") when the value is missing,
    falsy, or cannot be parsed as a single date.
    """
    if value is None or value is pd.NaT or not value:
        return None
    bound = pd.to_datetime(value, errors='coerce')
    if not isinstance(bound, pd.Timestamp):
        # NaT (unparseable input) or a non-scalar result: ignore the bound
        # instead of crashing on a missing attribute further down.
        logging.warning(f"Filter by date: could not interpret {label} {value!r} as a date; ignoring this bound.")
        return None
    if bound.tzinfo is not None:
        # The column is compared as naive UTC, so express aware bounds the same way.
        bound = bound.tz_convert('UTC').tz_localize(None)
    return bound.normalize()


def filter_dataframe_by_date(df, date_column, start_date, end_date):
    """Filter a DataFrame to the rows whose date falls within an inclusive range.

    The date column is parsed to datetimes, expressed as naive UTC, and
    truncated to midnight, so the comparison is made on whole calendar days
    (UTC days for timezone-aware data). Rows whose date cannot be parsed are
    dropped. The input DataFrame is not modified; the returned frame carries
    the normalized (midnight, timezone-naive) values in ``date_column``.

    Args:
        df: Source DataFrame; may be None or empty.
        date_column: Name of the column holding the dates.
        start_date: Inclusive lower bound (str, datetime or Timestamp), or
            None/empty for no lower bound. Unparseable values are ignored.
        end_date: Inclusive upper bound, same conventions as ``start_date``.

    Returns:
        The filtered DataFrame. An empty DataFrame is returned when the input
        is None/empty, the column is missing, no row has a valid date, or the
        date column cannot be processed.
    """
    if df is None or df.empty or not date_column:
        logging.warning(f"Filter by date: DataFrame is None, empty, or no date_column provided. DF: {df is not None}, empty: {df.empty if df is not None else 'N/A'}, date_column: {date_column}")
        return pd.DataFrame()
    if date_column not in df.columns:
        logging.warning(f"Filter by date: Date column '{date_column}' not found in DataFrame columns: {df.columns.tolist()}.")
        return pd.DataFrame()

    df_copy = df.copy()  # Work on a copy so the caller's frame is left untouched
    try:
        if isinstance(df_copy[date_column].dtype, pd.DatetimeTZDtype):
            logging.info(f"Column '{date_column}' is timezone-aware ({df_copy[date_column].dt.tz}). Converting to naive (from UTC) for comparison.")
        # utc=True converts aware values to UTC and treats naive values as UTC,
        # so aware, naive and string columns all follow the same path.
        as_utc = pd.to_datetime(df_copy[date_column], errors='coerce', utc=True)
        # Drop the timezone BEFORE truncating to midnight. Truncating in the
        # original zone and converting afterwards leaves a non-midnight time
        # (e.g. 05:00 for UTC-05:00), which the inclusive end-date test rejects.
        df_copy[date_column] = as_utc.dt.tz_localize(None).dt.normalize()
        # Remove rows whose date failed to parse (NaT) or was missing to begin with
        df_copy.dropna(subset=[date_column], inplace=True)
    except Exception as e:
        # Best effort: an unusable date column yields "no data" rather than a crash.
        logging.error(f"Error processing date column '{date_column}': {e}", exc_info=True)
        return pd.DataFrame()

    if df_copy.empty:
        logging.info(f"Filter by date: DataFrame empty after to_datetime and dropna for column '{date_column}'.")
        return pd.DataFrame()

    start_dt_obj = _coerce_date_bound(start_date, "start_date")
    end_dt_obj = _coerce_date_bound(end_date, "end_date")

    dates = df_copy[date_column]
    if start_dt_obj is not None and end_dt_obj is not None:
        return df_copy[(dates >= start_dt_obj) & (dates <= end_dt_obj)]
    if start_dt_obj is not None:
        return df_copy[dates >= start_dt_obj]
    if end_dt_obj is not None:
        return df_copy[dates <= end_dt_obj]
    return df_copy  # Neither bound supplied: only the invalid-date rows were removed
def prepare_filtered_analytics_data(token_state_value, date_filter_option, custom_start_date, custom_end_date):
    """Pull the analytics frames out of the token state and filter them by date.

    Args:
        token_state_value: Mapping that may hold "bubble_posts_df",
            "bubble_mentions_df", "bubble_follower_stats_df" and the optional
            date-column names "config_date_col_posts" (default "published_at")
            and "config_date_col_mentions" (default "date"). None is treated
            as an empty mapping.
        date_filter_option: "Last 7 Days", "Last 30 Days" or "Custom Range".
            Any other value applies no lower bound, with today as upper bound.
        custom_start_date: Start of the custom range; only read for
            "Custom Range". Unparseable or missing values mean no lower bound.
        custom_end_date: End of the custom range; only read for
            "Custom Range". Unparseable or missing values fall back to today.

    Returns:
        A tuple ``(filtered_posts_df, filtered_mentions_df, follower_stats_df,
        start_dt, end_dt)``. Follower stats are passed through unfiltered;
        ``start_dt`` / ``end_dt`` are the midnight-normalized bounds that were
        applied (``start_dt`` is None when there is no lower bound).
    """
    logging.info(f"Preparing filtered analytics data. Filter: {date_filter_option}, Custom Start: {custom_start_date}, Custom End: {custom_end_date}")

    state = token_state_value or {}

    def _frame_or_empty(key):
        # A key that is absent OR explicitly set to None both mean "no data";
        # without this, `.empty` below would raise AttributeError on None.
        frame = state.get(key)
        return frame if frame is not None else pd.DataFrame()

    posts_df = _frame_or_empty("bubble_posts_df")
    mentions_df = _frame_or_empty("bubble_mentions_df")
    follower_stats_df = state.get("bubble_follower_stats_df", pd.DataFrame())  # passed through as stored

    date_column_posts = state.get("config_date_col_posts", "published_at")
    date_column_mentions = state.get("config_date_col_mentions", "date")

    # Today at midnight (naive, server-local clock) is the default upper bound.
    current_time_normalized = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    end_dt_filter = current_time_normalized
    start_dt_filter = None

    if date_filter_option == "Last 7 Days":
        # Inclusive range: today plus the six preceding days.
        start_dt_filter = current_time_normalized - timedelta(days=6)
    elif date_filter_option == "Last 30 Days":
        # Inclusive range: today plus the 29 preceding days.
        start_dt_filter = current_time_normalized - timedelta(days=29)
    elif date_filter_option == "Custom Range":
        # Parse the custom bounds; errors='coerce' turns unparseable input into
        # NaT, which is then replaced by the fallback for that side.
        parsed_start = pd.to_datetime(custom_start_date, errors='coerce')
        if pd.notna(parsed_start):
            start_dt_filter = parsed_start.replace(hour=0, minute=0, second=0, microsecond=0)
        parsed_end = pd.to_datetime(custom_end_date, errors='coerce')
        if pd.notna(parsed_end):
            end_dt_filter = parsed_end.replace(hour=0, minute=0, second=0, microsecond=0)

    logging.info(f"Date range for filtering: Start: {start_dt_filter}, End: {end_dt_filter}")

    # Only call the filter when there is something to filter; this avoids a
    # warning log entry for every empty frame.
    filtered_posts_data = pd.DataFrame()
    if not posts_df.empty:
        filtered_posts_data = filter_dataframe_by_date(posts_df, date_column_posts, start_dt_filter, end_dt_filter)

    filtered_mentions_data = pd.DataFrame()
    if not mentions_df.empty:
        filtered_mentions_data = filter_dataframe_by_date(mentions_df, date_column_mentions, start_dt_filter, end_dt_filter)

    logging.info(f"Processed - Filtered posts: {len(filtered_posts_data)} rows, Filtered Mentions: {len(filtered_mentions_data)} rows.")
    return filtered_posts_data, filtered_mentions_data, follower_stats_df, start_dt_filter, end_dt_filter
|