Spaces:

GuglielmoTor
/

LinkedinMonitor

Running

App Files Files Community

GuglielmoTor committed 10 days ago

Commit

795b267

verified ·

1 Parent(s): dce8999

Update data_processing/analytics_data_processing.py

Browse files

Files changed (1) hide show

data_processing/analytics_data_processing.py +91 -70

data_processing/analytics_data_processing.py CHANGED Viewed

@@ -7,149 +7,170 @@ import numpy as np
 # Configure logging for this module
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(module)s - %(message)s')
 def filter_dataframe_by_date(df, date_column, start_date, end_date):
-    """Filters a DataFrame by a date column within a given date range."""
     if df is None or df.empty or not date_column:
-        logging.warning(f"Filter by date: DataFrame is None, empty, or no date_column provided. DF: {df is not None}, empty: {df.empty if df is not None else 'N/A'}, date_column: {date_column}")
         return pd.DataFrame()
     if date_column not in df.columns:
         logging.warning(f"Filter by date: Date column '{date_column}' not found in DataFrame columns: {df.columns.tolist()}.")
         return pd.DataFrame()
-    df_copy = df.copy() # Work on a copy to avoid SettingWithCopyWarning
     try:
-        # Ensure the date column is pandas datetime objects
         if not pd.api.types.is_datetime64_any_dtype(df_copy[date_column]):
             df_copy[date_column] = pd.to_datetime(df_copy[date_column], errors='coerce')
-        # Drop rows where date conversion might have failed (NaT) or was originally NaT
         df_copy.dropna(subset=[date_column], inplace=True)
         if df_copy.empty:
             logging.info(f"Filter by date: DataFrame empty after to_datetime and dropna for column '{date_column}'.")
             return pd.DataFrame()
-        # Normalize to midnight. This preserves timezone information if present.
         df_copy[date_column] = df_copy[date_column].dt.normalize()
-        # If the column is timezone-aware, convert its values to naive UTC equivalent.
-        # This allows comparison with naive filter dates.
         if hasattr(df_copy[date_column].dt, 'tz') and df_copy[date_column].dt.tz is not None:
-            logging.info(f"Column '{date_column}' is timezone-aware ({df_copy[date_column].dt.tz}). Converting to naive (from UTC) for comparison.")
             df_copy[date_column] = df_copy[date_column].dt.tz_convert('UTC').dt.tz_localize(None)
     except Exception as e:
         logging.error(f"Error processing date column '{date_column}': {e}", exc_info=True)
-        return pd.DataFrame()
-    # Convert start_date and end_date (which are naive Python datetime or naive Pandas Timestamp)
-    # to naive pandas Timestamps and normalize them.
     start_dt_obj = pd.to_datetime(start_date, errors='coerce').normalize() if start_date else None
     end_dt_obj = pd.to_datetime(end_date, errors='coerce').normalize() if end_date else None
-    # Perform the filtering
-    # df_filtered is already df_copy which has NaNs dropped and dates processed
-    if start_dt_obj and end_dt_obj:
-        df_filtered_final = df_copy[(df_copy[date_column] >= start_dt_obj) & (df_copy[date_column] <= end_dt_obj)]
-    elif start_dt_obj:
-        df_filtered_final = df_copy[df_copy[date_column] >= start_dt_obj]
-    elif end_dt_obj:
-        df_filtered_final = df_copy[df_copy[date_column] <= end_dt_obj]
     else:
-        df_filtered_final = df_copy # No date filtering if neither start_date nor end_date is provided
     if df_filtered_final.empty:
         logging.info(f"Filter by date: DataFrame became empty after applying date range to column '{date_column}'.")
     return df_filtered_final
 def prepare_filtered_analytics_data(token_state_value, date_filter_option, custom_start_date, custom_end_date):
     """
     Retrieves data from token_state, determines date range, filters posts, mentions, and follower time-series data.
     Merges posts with post stats.
-    Returns:
-        - filtered_merged_posts_df: Posts merged with stats, filtered by date.
-        - filtered_mentions_df: Mentions filtered by date.
-        - date_filtered_follower_stats_df: Follower stats filtered by date (for time-series plots).
-        - raw_follower_stats_df: Unfiltered follower stats (for demographic plots).
-        - start_dt_filter: Determined start date for filtering.
-        - end_dt_filter: Determined end date for filtering.
     """
     logging.info(f"Preparing filtered analytics data. Filter: {date_filter_option}, Custom Start: {custom_start_date}, Custom End: {custom_end_date}")
     posts_df = token_state_value.get("bubble_posts_df", pd.DataFrame()).copy()
     mentions_df = token_state_value.get("bubble_mentions_df", pd.DataFrame()).copy()
     follower_stats_df = token_state_value.get("bubble_follower_stats_df", pd.DataFrame()).copy()
     post_stats_df = token_state_value.get("bubble_post_stats_df", pd.DataFrame()).copy()
     date_column_posts = token_state_value.get("config_date_col_posts", "published_at")
     date_column_mentions = token_state_value.get("config_date_col_mentions", "date")
-    date_column_followers = token_state_value.get("config_date_col_followers", "date")
     # Determine date range for filtering
     current_datetime_obj = datetime.now()
-    current_time_normalized = current_datetime_obj.replace(hour=0, minute=0, second=0, microsecond=0)
-    end_dt_filter = current_time_normalized
     start_dt_filter = None
-    # --- FIX STARTS HERE ---
-    # The filter option strings from the UI must exactly match the strings being checked here.
-    # The original code checked for "Last 7 Days" but the UI sent "Ultimi 7 Giorni".
     if date_filter_option == "Ultimi 7 Giorni":
-        start_dt_filter = current_time_normalized - timedelta(days=6)
     elif date_filter_option == "Ultimi 30 Giorni":
-        start_dt_filter = current_time_normalized - timedelta(days=29)
     elif date_filter_option == "Intervallo Personalizzato":
         start_dt_filter_temp = pd.to_datetime(custom_start_date, errors='coerce')
-        start_dt_filter = start_dt_filter_temp.replace(hour=0, minute=0, second=0, microsecond=0) if pd.notna(start_dt_filter_temp) else None
         end_dt_filter_temp = pd.to_datetime(custom_end_date, errors='coerce')
-        # If end date is specified, use it. Otherwise, default to today.
         end_dt_filter = end_dt_filter_temp.replace(hour=0, minute=0, second=0, microsecond=0) if pd.notna(end_dt_filter_temp) else current_time_normalized
-    # --- FIX ENDS HERE ---
     logging.info(f"Date range for filtering: Start: {start_dt_filter}, End: {end_dt_filter}")
     # Merge posts_df and post_stats_df
     merged_posts_df = pd.DataFrame()
     if not posts_df.empty and not post_stats_df.empty:
         if 'id' in posts_df.columns and 'post_id' in post_stats_df.columns:
             merged_posts_df = pd.merge(posts_df, post_stats_df, left_on='id', right_on='post_id', how='left')
-            logging.info(f"Merged posts_df ({len(posts_df)} rows) and post_stats_df ({len(post_stats_df)} rows) into merged_posts_df ({len(merged_posts_df)} rows).")
         else:
             logging.warning("Cannot merge posts_df and post_stats_df due to missing 'id' or 'post_id' columns.")
             merged_posts_df = posts_df
     elif not posts_df.empty:
-        logging.warning("post_stats_df is empty. Proceeding with posts_df only.")
         merged_posts_df = posts_df
         expected_stat_cols = ['engagement', 'impressionCount', 'clickCount', 'likeCount', 'commentCount', 'shareCount']
         for col in expected_stat_cols:
             if col not in merged_posts_df.columns:
                 merged_posts_df[col] = pd.NA
-    # Filter DataFrames by date
-    filtered_merged_posts_data = pd.DataFrame()
-    if not merged_posts_df.empty and date_column_posts in merged_posts_df.columns:
-        filtered_merged_posts_data = filter_dataframe_by_date(merged_posts_df, date_column_posts, start_dt_filter, end_dt_filter)
-    elif not merged_posts_df.empty:
-        logging.warning(f"Date column '{date_column_posts}' not found in merged_posts_df. Returning unfiltered merged posts data.")
-        filtered_merged_posts_data = merged_posts_df
-    filtered_mentions_data = pd.DataFrame()
-    if not mentions_df.empty and date_column_mentions in mentions_df.columns:
-        filtered_mentions_data = filter_dataframe_by_date(mentions_df, date_column_mentions, start_dt_filter, end_dt_filter)
-    elif not mentions_df.empty:
-        logging.warning(f"Date column '{date_column_mentions}' not found in mentions_df. Returning unfiltered mentions data.")
-        filtered_mentions_data = mentions_df
     date_filtered_follower_stats_df = pd.DataFrame()
-    raw_follower_stats_df = follower_stats_df.copy()
-    if not follower_stats_df.empty and date_column_followers in follower_stats_df.columns:
         date_filtered_follower_stats_df = filter_dataframe_by_date(follower_stats_df, date_column_followers, start_dt_filter, end_dt_filter)
-    elif not follower_stats_df.empty:
-        logging.warning(f"Date column '{date_column_followers}' not found in follower_stats_df. Time-series follower plots might be empty or use unfiltered data.")
-        date_filtered_follower_stats_df = follower_stats_df
     logging.info(f"Processed - Filtered Merged Posts: {len(filtered_merged_posts_data)} rows, Filtered Mentions: {len(filtered_mentions_data)} rows, Date-Filtered Follower Stats: {len(date_filtered_follower_stats_df)} rows.")

 # Configure logging for this module
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(module)s - %(message)s')
+# --- CORRECTED FUNCTION START (V2) ---
 def filter_dataframe_by_date(df, date_column, start_date, end_date):
     """
     Return the rows of ``df`` whose ``date_column`` lies within [start_date, end_date].

     Both bounds are inclusive and compared at day granularity. Two data
     layouts are supported, selected by a two-pass detection:

     * Pass 1 - every non-null value is a ``'YYYY-MM'`` string.
     * Pass 2 - after parsing, every date is the 1st of a month.

     If either pass matches, the column is treated as monthly data and a row
     is kept when its whole month overlaps the filter range. Otherwise a
     plain daily comparison is used. Note that Pass 2 is a heuristic: daily
     data that happens to contain only 1st-of-month dates is treated as
     monthly.

     Args:
         df: Source DataFrame (may be None or empty).
         date_column: Name of the column holding the dates.
         start_date: Inclusive lower bound (anything ``pd.to_datetime``
             accepts). None, a blank string or an unparseable value means
             "no lower bound".
         end_date: Inclusive upper bound, same conventions as ``start_date``.

     Returns:
         A new DataFrame; the input is never modified. ``date_column`` in the
         result is a timezone-naive ``datetime64`` normalized to midnight
         (timezone-aware inputs are converted to UTC first). Rows whose date
         cannot be parsed are dropped. An empty DataFrame is returned when the
         input is None/empty, the column is missing, or date parsing fails.
     """
     if df is None or df.empty or not date_column:
         logging.warning("Filter by date: DataFrame is None, empty, or no date_column provided.")
         return pd.DataFrame()
     if date_column not in df.columns:
         logging.warning(f"Filter by date: Date column '{date_column}' not found in DataFrame columns: {df.columns.tolist()}.")
         return pd.DataFrame()

     df_copy = df.copy()

     # Pass 1: only use the .str accessor when every value really is a str.
     # An object column can hold datetime.date or Timestamp objects, for which
     # the accessor raises AttributeError.
     use_month_logic = False
     raw_values = df_copy[date_column].dropna()
     if not raw_values.empty and all(isinstance(value, str) for value in raw_values):
         # This regex ensures the entire string is just 'YYYY-MM'
         if raw_values.str.match(r'^\d{4}-\d{2}$').all():
             use_month_logic = True
             logging.info(f"Filter by date (Pass 1): Detected 'YYYY-MM' string format for column '{date_column}'.")

     # Standardize the column to naive, midnight-normalized datetimes.
     try:
         if not pd.api.types.is_datetime64_any_dtype(df_copy[date_column]):
             df_copy[date_column] = pd.to_datetime(df_copy[date_column], errors='coerce')
         df_copy.dropna(subset=[date_column], inplace=True)
         if df_copy.empty:
             logging.info(f"Filter by date: DataFrame empty after to_datetime and dropna for column '{date_column}'.")
             return pd.DataFrame()
         dates = df_copy[date_column]
         # Convert to UTC and drop the timezone BEFORE normalizing. Normalizing
         # in the local zone first leaves a non-midnight time after the UTC
         # conversion, which breaks the inclusive boundary comparisons below.
         if getattr(dates.dt, 'tz', None) is not None:
             dates = dates.dt.tz_convert('UTC').dt.tz_localize(None)
         df_copy[date_column] = dates.dt.normalize()
     except Exception as e:
         # Deliberate best-effort: any parsing problem yields an empty result.
         logging.error(f"Error processing date column '{date_column}': {e}", exc_info=True)
         return pd.DataFrame()

     # Pass 2: If not detected by string format, check if all dates are the 1st of the month.
     if not use_month_logic and (df_copy[date_column].dt.day == 1).all():
         use_month_logic = True
         logging.info(f"Filter by date (Pass 2): All dates in '{date_column}' are 1st of the month. Applying month-range filtering.")

     start_dt_obj = _normalize_filter_bound(start_date, "start_date")
     end_dt_obj = _normalize_filter_bound(end_date, "end_date")
     if start_dt_obj is None and end_dt_obj is None:
         return df_copy

     period_start = df_copy[date_column]
     if use_month_logic:
         logging.info(f"Applying month-overlap filtering for column '{date_column}'.")
         # Dates are the 1st of the month here, so MonthEnd(1) rolls each one
         # forward to the last day of the same month. Kept as a standalone
         # Series so no scratch column can clobber a real input column.
         period_end = period_start + pd.offsets.MonthEnd(1)
     else:
         logging.info(f"Applying standard daily filtering for column '{date_column}'.")
         period_end = period_start

     # A row is kept when its period [period_start, period_end] overlaps the
     # requested range; for daily data the period is a single day.
     if start_dt_obj is not None and end_dt_obj is not None:
         mask = (period_end >= start_dt_obj) & (period_start <= end_dt_obj)
     elif start_dt_obj is not None:
         mask = period_end >= start_dt_obj
     else:
         mask = period_start <= end_dt_obj
     df_filtered_final = df_copy[mask]

     if df_filtered_final.empty:
         logging.info(f"Filter by date: DataFrame became empty after applying date range to column '{date_column}'.")
     return df_filtered_final


 def _normalize_filter_bound(value, label):
     """Return ``value`` as a naive midnight Timestamp, or None if absent or unparseable."""
     if value is None or (isinstance(value, str) and not value.strip()):
         return None
     bound = pd.to_datetime(value, errors='coerce')
     if pd.isna(bound):
         # NaT is truthy and compares False against everything, so using it
         # as a bound would silently discard every row.
         logging.warning(f"Filter by date: could not parse {label}={value!r}; ignoring this bound.")
         return None
     if bound.tzinfo is not None:
         # The column is compared as naive UTC, so bring the bound to the same frame.
         bound = bound.tz_convert('UTC').tz_localize(None)
     return bound.normalize()
+# --- CORRECTED FUNCTION END (V2) ---
 def prepare_filtered_analytics_data(token_state_value, date_filter_option, custom_start_date, custom_end_date):
     """
     Retrieves data from token_state, determines date range, filters posts, mentions, and follower time-series data.
     Merges posts with post stats.
     """
     logging.info(f"Preparing filtered analytics data. Filter: {date_filter_option}, Custom Start: {custom_start_date}, Custom End: {custom_end_date}")
     posts_df = token_state_value.get("bubble_posts_df", pd.DataFrame()).copy()
     mentions_df = token_state_value.get("bubble_mentions_df", pd.DataFrame()).copy()
     follower_stats_df = token_state_value.get("bubble_follower_stats_df", pd.DataFrame()).copy()
     post_stats_df = token_state_value.get("bubble_post_stats_df", pd.DataFrame()).copy()
     date_column_posts = token_state_value.get("config_date_col_posts", "published_at")
     date_column_mentions = token_state_value.get("config_date_col_mentions", "date")
+    date_column_followers = token_state_value.get("config_date_col_followers", "date")
+    # --- NEW: PRE-PROCESSING STEP FOR FOLLOWER STATS ---
+    # This block handles the case where date information is in the 'category_name' column.
+    if not follower_stats_df.empty and 'category_name' in follower_stats_df.columns:
+        logging.info("Pre-processing follower_stats_df: Checking 'category_name' for dates.")
+        # Create a series of datetime objects from 'category_name'.
+        # 'coerce' will turn any non-date strings into NaT (Not a Time).
+        category_as_dates = pd.to_datetime(follower_stats_df['category_name'], errors='coerce')
+        # Create a boolean mask for rows where the conversion was successful.
+        valid_dates_mask = category_as_dates.notna()
+        # If any dates were found, update the main 'date' column with them.
+        if valid_dates_mask.any():
+            logging.info(f"Found {valid_dates_mask.sum()} date-like values in 'category_name'. Consolidating them into the '{date_column_followers}' column.")
+            # Use .loc[] to update the 'date' column only for the relevant rows.
+            follower_stats_df.loc[valid_dates_mask, date_column_followers] = category_as_dates[valid_dates_mask]
+    # --- END OF PRE-PROCESSING STEP ---
     # Determine date range for filtering
     current_datetime_obj = datetime.now()
+    current_time_normalized = current_datetime_obj.replace(hour=0, minute=0, second=0, microsecond=0)
+    end_dt_filter = current_time_normalized
     start_dt_filter = None
     if date_filter_option == "Ultimi 7 Giorni":
+        start_dt_filter = current_time_normalized - timedelta(days=6)
     elif date_filter_option == "Ultimi 30 Giorni":
+        start_dt_filter = current_time_normalized - timedelta(days=29)
     elif date_filter_option == "Intervallo Personalizzato":
         start_dt_filter_temp = pd.to_datetime(custom_start_date, errors='coerce')
+        start_dt_filter = start_dt_filter_temp.replace(hour=0, minute=0, second=0, microsecond=0) if pd.notna(start_dt_filter_temp) else None
         end_dt_filter_temp = pd.to_datetime(custom_end_date, errors='coerce')
         end_dt_filter = end_dt_filter_temp.replace(hour=0, minute=0, second=0, microsecond=0) if pd.notna(end_dt_filter_temp) else current_time_normalized
     logging.info(f"Date range for filtering: Start: {start_dt_filter}, End: {end_dt_filter}")
     # Merge posts_df and post_stats_df
     merged_posts_df = pd.DataFrame()
     if not posts_df.empty and not post_stats_df.empty:
         if 'id' in posts_df.columns and 'post_id' in post_stats_df.columns:
             merged_posts_df = pd.merge(posts_df, post_stats_df, left_on='id', right_on='post_id', how='left')
         else:
             logging.warning("Cannot merge posts_df and post_stats_df due to missing 'id' or 'post_id' columns.")
             merged_posts_df = posts_df
     elif not posts_df.empty:
         merged_posts_df = posts_df
         expected_stat_cols = ['engagement', 'impressionCount', 'clickCount', 'likeCount', 'commentCount', 'shareCount']
         for col in expected_stat_cols:
             if col not in merged_posts_df.columns:
                 merged_posts_df[col] = pd.NA
+    # Filter DataFrames by date (now using pre-processed follower_stats_df)
+    filtered_merged_posts_data = filter_dataframe_by_date(merged_posts_df, date_column_posts, start_dt_filter, end_dt_filter)
+    filtered_mentions_data = filter_dataframe_by_date(mentions_df, date_column_mentions, start_dt_filter, end_dt_filter)
     date_filtered_follower_stats_df = pd.DataFrame()
+    raw_follower_stats_df = follower_stats_df.copy() # Use a copy of the *original* for raw data
+    if not follower_stats_df.empty:
         date_filtered_follower_stats_df = filter_dataframe_by_date(follower_stats_df, date_column_followers, start_dt_filter, end_dt_filter)
     logging.info(f"Processed - Filtered Merged Posts: {len(filtered_merged_posts_data)} rows, Filtered Mentions: {len(filtered_mentions_data)} rows, Date-Filtered Follower Stats: {len(date_filtered_follower_stats_df)} rows.")