Spaces:

GuglielmoTor
/

LinkedinMonitor

Running

File size: 6,498 Bytes

e3cbb18
58eb0f9
e3cbb18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56e12df
e3cbb18
 
56e12df
 
 
 
 
 
 
 
58eb0f9
 
56e12df
 
 
 
 
 
e3cbb18
56e12df
 
e3cbb18
56e12df
 
 
 
e3cbb18
 
56e12df
 
e3cbb18
 
 
56e12df
e3cbb18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58eb0f9
56e12df
58eb0f9
e3cbb18
 
 
 
58eb0f9
e3cbb18
58eb0f9
e3cbb18
58eb0f9
56e12df
58eb0f9
56e12df
 
58eb0f9
 
 
 
e3cbb18

import pandas as pd
from datetime import datetime, timedelta, time # Added time for min.time
import logging

# Configure logging for this module.
# NOTE(review): logging.basicConfig() sets up the *root* logger, so this is an
# import-time side effect for the whole process rather than a module-local setting.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(module)s - %(message)s')

def _coerce_filter_bound(bound, label):
    """Turn a filter bound into a naive, midnight-normalized Timestamp, or None.

    Falsy or unparseable bounds mean "no bound on this side". Timezone-aware
    bounds are converted to UTC and stripped of their timezone so they can be
    compared against the naive (UTC) date column.
    """
    if not bound:
        return None
    parsed = pd.to_datetime(bound, errors='coerce')
    if parsed is None or pd.isna(parsed):
        logging.warning(f"Filter by date: could not parse {label} '{bound}'; ignoring this bound.")
        return None
    if parsed.tzinfo is not None:
        parsed = parsed.tz_convert('UTC').tz_localize(None)
    return parsed.normalize()


def filter_dataframe_by_date(df, date_column, start_date, end_date):
    """Filter a DataFrame to rows whose date falls within an inclusive range.

    Args:
        df: Source DataFrame; it is never modified (a copy is processed).
        date_column: Name of the column holding the dates.
        start_date: Inclusive lower bound (string, datetime or Timestamp).
            Falsy or unparseable values disable the lower bound.
        end_date: Inclusive upper bound, same conventions as ``start_date``.

    Returns:
        A DataFrame containing the matching rows. In the result, ``date_column``
        holds naive timestamps normalized to midnight (timezone-aware input is
        expressed in UTC first). An empty DataFrame is returned when the input
        is None/empty, the column is missing, no row has a valid date, or the
        column cannot be processed.
    """
    if df is None or df.empty or not date_column:
        logging.warning(f"Filter by date: DataFrame is None, empty, or no date_column provided. DF: {df is not None}, empty: {df.empty if df is not None else 'N/A'}, date_column: {date_column}")
        return pd.DataFrame()
    if date_column not in df.columns:
        logging.warning(f"Filter by date: Date column '{date_column}' not found in DataFrame columns: {df.columns.tolist()}.")
        return pd.DataFrame()

    df_copy = df.copy()  # Work on a copy so the caller's frame is left untouched
    try:
        dates = df_copy[date_column]
        if pd.api.types.is_datetime64_any_dtype(dates):
            if dates.dt.tz is not None:
                logging.info(f"Column '{date_column}' is timezone-aware ({dates.dt.tz}). Converting to naive (from UTC) for comparison.")
                dates = dates.dt.tz_convert('UTC').dt.tz_localize(None)
        else:
            # utc=True collapses mixed UTC offsets into one dtype (otherwise the
            # result can be an object column without a usable .dt accessor);
            # the timezone is then dropped to get naive UTC values.
            dates = pd.to_datetime(dates, errors='coerce', utc=True).dt.tz_localize(None)

        # Normalize only AFTER the UTC conversion: normalizing first and then
        # shifting to UTC would leave non-midnight values and make the
        # inclusive end-date comparison drop rows dated on the end date.
        df_copy[date_column] = dates.dt.normalize()

        # Drop rows whose date was missing or could not be parsed (NaT)
        df_copy = df_copy.dropna(subset=[date_column])
    except Exception as e:
        # Best-effort boundary: a column that cannot be processed yields no rows
        logging.error(f"Error processing date column '{date_column}': {e}", exc_info=True)
        return pd.DataFrame()

    if df_copy.empty:
        logging.info(f"Filter by date: DataFrame empty after to_datetime and dropna for column '{date_column}'.")
        return pd.DataFrame()

    lower = _coerce_filter_bound(start_date, "start_date")
    upper = _coerce_filter_bound(end_date, "end_date")

    # Build the row mask from whichever bounds are present
    keep = None
    if lower is not None:
        keep = df_copy[date_column] >= lower
    if upper is not None:
        within_upper = df_copy[date_column] <= upper
        keep = within_upper if keep is None else keep & within_upper

    # No date filtering if neither bound is usable
    return df_copy if keep is None else df_copy[keep]


def prepare_filtered_analytics_data(token_state_value, date_filter_option, custom_start_date, custom_end_date):
    """Collect the stored analytics frames and filter posts/mentions by date.

    Args:
        token_state_value: Mapping that may hold "bubble_posts_df",
            "bubble_mentions_df", "bubble_follower_stats_df" and the optional
            "config_date_col_posts" / "config_date_col_mentions" column names.
            ``None`` is treated as an empty mapping.
        date_filter_option: "Last 7 Days", "Last 30 Days" or "Custom Range";
            any other value leaves the start open and ends the range today.
        custom_start_date: Start of the custom range; only read for
            "Custom Range". Unparseable values leave the start open.
        custom_end_date: End of the custom range; only read for
            "Custom Range". Unparseable values fall back to today.

    Returns:
        Tuple ``(filtered_posts_df, filtered_mentions_df, follower_stats_df,
        start_dt, end_dt)``. Follower stats are returned unfiltered;
        ``start_dt`` / ``end_dt`` are the midnight-normalized bounds that were
        applied (``start_dt`` is None when the range has no lower bound).
    """
    logging.info(f"Preparing filtered analytics data. Filter: {date_filter_option}, Custom Start: {custom_start_date}, Custom End: {custom_end_date}")

    # A missing state, or a key stored with a None value, must behave like
    # "no data" instead of raising AttributeError further down.
    state = token_state_value if token_state_value is not None else {}

    def _stored_frame(key):
        frame = state.get(key)
        return frame if frame is not None else pd.DataFrame()

    posts_df = _stored_frame("bubble_posts_df")
    mentions_df = _stored_frame("bubble_mentions_df")
    follower_stats_df = _stored_frame("bubble_follower_stats_df")

    date_column_posts = state.get("config_date_col_posts", "published_at")
    date_column_mentions = state.get("config_date_col_mentions", "date")

    # Today at midnight as a naive Python datetime.
    # NOTE(review): this is server-local time; confirm it matches the timezone
    # the stored dates are expressed in.
    today_midnight = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

    end_dt_filter = today_midnight
    start_dt_filter = None

    if date_filter_option == "Last 7 Days":
        # Inclusive range: today plus the 6 preceding days
        start_dt_filter = today_midnight - timedelta(days=6)
    elif date_filter_option == "Last 30 Days":
        # Inclusive range: today plus the 29 preceding days
        start_dt_filter = today_midnight - timedelta(days=29)
    elif date_filter_option == "Custom Range":
        # Custom dates arrive as strings from gr.DateTime(type="string").
        # normalize() zeroes the whole time-of-day, including nanoseconds,
        # which replace(hour=0, ..., microsecond=0) would leave behind.
        parsed_start = pd.to_datetime(custom_start_date, errors='coerce')
        start_dt_filter = parsed_start.normalize() if pd.notna(parsed_start) else None

        parsed_end = pd.to_datetime(custom_end_date, errors='coerce')
        end_dt_filter = parsed_end.normalize() if pd.notna(parsed_end) else today_midnight

    logging.info(f"Date range for filtering: Start: {start_dt_filter}, End: {end_dt_filter}")

    # Filter DataFrames (empty inputs are passed through as empty results)
    filtered_posts_data = pd.DataFrame()
    if not posts_df.empty:
        filtered_posts_data = filter_dataframe_by_date(posts_df, date_column_posts, start_dt_filter, end_dt_filter)

    filtered_mentions_data = pd.DataFrame()
    if not mentions_df.empty:
        filtered_mentions_data = filter_dataframe_by_date(mentions_df, date_column_mentions, start_dt_filter, end_dt_filter)

    logging.info(f"Processed - Filtered posts: {len(filtered_posts_data)} rows, Filtered Mentions: {len(filtered_mentions_data)} rows.")

    return filtered_posts_data, filtered_mentions_data, follower_stats_df, start_dt_filter, end_dt_filter