Spaces:

GuglielmoTor
/

LinkedinMonitor

Running

App Files Community

GuglielmoTor committed on May 20

Commit

56e12df

verified ·

1 Parent(s): 58eb0f9

Update analytics_data_processing.py

Browse files

Files changed (1) hide show

analytics_data_processing.py +28 -17

analytics_data_processing.py CHANGED Viewed

@@ -16,27 +16,41 @@ def filter_dataframe_by_date(df, date_column, start_date, end_date):
     df_copy = df.copy() # Work on a copy to avoid SettingWithCopyWarning
     try:
-        # Convert the DataFrame's date column to pandas datetime objects first
         if not pd.api.types.is_datetime64_any_dtype(df_copy[date_column]):
             df_copy[date_column] = pd.to_datetime(df_copy[date_column], errors='coerce')
-        # Normalize the DataFrame's date column to midnight (date part only)
         df_copy[date_column] = df_copy[date_column].dt.normalize()
     except Exception as e:
-        logging.error(f"Error converting or normalizing date column '{date_column}' to datetime: {e}")
-        return pd.DataFrame() # Return empty if conversion fails
-    df_filtered = df_copy.dropna(subset=[date_column])
-    if df_filtered.empty:
-        logging.info(f"Filter by date: DataFrame became empty after dropping NaNs in date column '{date_column}'.")
         return pd.DataFrame()
-    # Convert start_date and end_date (which are expected to be datetime.datetime or None)
-    # to pandas Timestamps and normalize them for comparison
     start_dt_obj = pd.to_datetime(start_date, errors='coerce').normalize() if start_date else None
     end_dt_obj = pd.to_datetime(end_date, errors='coerce').normalize() if end_date else None
     if start_dt_obj and end_dt_obj:
         return df_filtered[(df_filtered[date_column] >= start_dt_obj) & (df_filtered[date_column] <= end_dt_obj)]
     elif start_dt_obj:
@@ -62,9 +76,8 @@ def prepare_filtered_analytics_data(token_state_value, date_filter_option, custo
     date_column_mentions = token_state_value.get("config_date_col_mentions", "date")
     # Determine date range for filtering posts and mentions
-    # Normalize current time to midnight using datetime.replace
     current_datetime_obj = datetime.now()
-    current_time_normalized = current_datetime_obj.replace(hour=0, minute=0, second=0, microsecond=0)
     end_dt_filter = current_time_normalized
     start_dt_filter = None
@@ -75,16 +88,14 @@ def prepare_filtered_analytics_data(token_state_value, date_filter_option, custo
         start_dt_filter = current_time_normalized - timedelta(days=29)
     elif date_filter_option == "Custom Range":
         # custom_start_date and custom_end_date are strings from gr.DateTime(type="string")
-        # Convert to datetime objects and then normalize
         start_dt_filter_temp = pd.to_datetime(custom_start_date, errors='coerce')
-        start_dt_filter = start_dt_filter_temp.replace(hour=0, minute=0, second=0, microsecond=0) if pd.notna(start_dt_filter_temp) else None
         end_dt_filter_temp = pd.to_datetime(custom_end_date, errors='coerce')
-        # If custom_end_date is not provided or invalid, use current_time_normalized
         end_dt_filter = end_dt_filter_temp.replace(hour=0, minute=0, second=0, microsecond=0) if pd.notna(end_dt_filter_temp) else current_time_normalized
-    # "All Time" means start_dt_filter remains None, end_dt_filter effectively means up to now.
     logging.info(f"Date range for filtering: Start: {start_dt_filter}, End: {end_dt_filter}")
     # Filter DataFrames

     df_copy = df.copy() # Work on a copy to avoid SettingWithCopyWarning
     try:
+        # Ensure the date column is pandas datetime objects
         if not pd.api.types.is_datetime64_any_dtype(df_copy[date_column]):
             df_copy[date_column] = pd.to_datetime(df_copy[date_column], errors='coerce')
+        # Drop rows where date conversion might have failed (NaT) or was originally NaT
+        df_copy.dropna(subset=[date_column], inplace=True)
+        if df_copy.empty:
+             logging.info(f"Filter by date: DataFrame empty after to_datetime and dropna for column '{date_column}'.")
+             return pd.DataFrame()
+        # Normalize to midnight. This preserves timezone information if present.
         df_copy[date_column] = df_copy[date_column].dt.normalize()
+        # If the column is timezone-aware, convert its values to naive UTC equivalent.
+        # This allows comparison with naive filter dates.
+        if hasattr(df_copy[date_column].dt, 'tz') and df_copy[date_column].dt.tz is not None:
+            logging.info(f"Column '{date_column}' is timezone-aware ({df_copy[date_column].dt.tz}). Converting to naive (from UTC) for comparison.")
+            df_copy[date_column] = df_copy[date_column].dt.tz_convert('UTC').dt.tz_localize(None)
     except Exception as e:
+        logging.error(f"Error processing date column '{date_column}': {e}", exc_info=True)
+        return pd.DataFrame()
+    df_filtered = df_copy # df_copy is now processed and potentially filtered by dropna
+    # No need for: df_filtered = df_copy.dropna(subset=[date_column]) again here.
+    if df_filtered.empty: # Check again in case all rows were dropped or some other issue.
+        logging.info(f"Filter by date: DataFrame became empty after processing date column '{date_column}'.")
         return pd.DataFrame()
+    # Convert start_date and end_date (which are naive Python datetime or naive Pandas Timestamp)
+    # to naive pandas Timestamps and normalize them.
     start_dt_obj = pd.to_datetime(start_date, errors='coerce').normalize() if start_date else None
     end_dt_obj = pd.to_datetime(end_date, errors='coerce').normalize() if end_date else None
+    # Perform the filtering
     if start_dt_obj and end_dt_obj:
         return df_filtered[(df_filtered[date_column] >= start_dt_obj) & (df_filtered[date_column] <= end_dt_obj)]
     elif start_dt_obj:
     date_column_mentions = token_state_value.get("config_date_col_mentions", "date")
     # Determine date range for filtering posts and mentions
     current_datetime_obj = datetime.now()
+    current_time_normalized = current_datetime_obj.replace(hour=0, minute=0, second=0, microsecond=0) # Naive Python datetime
     end_dt_filter = current_time_normalized
     start_dt_filter = None
         start_dt_filter = current_time_normalized - timedelta(days=29)
     elif date_filter_option == "Custom Range":
         # custom_start_date and custom_end_date are strings from gr.DateTime(type="string")
+        # Convert to pandas Timestamp (which will be naive if input string is naive) then normalize using pandas method
         start_dt_filter_temp = pd.to_datetime(custom_start_date, errors='coerce')
+        # .replace() on pandas Timestamp normalizes time part
+        start_dt_filter = start_dt_filter_temp.replace(hour=0, minute=0, second=0, microsecond=0) if pd.notna(start_dt_filter_temp) else None
         end_dt_filter_temp = pd.to_datetime(custom_end_date, errors='coerce')
         end_dt_filter = end_dt_filter_temp.replace(hour=0, minute=0, second=0, microsecond=0) if pd.notna(end_dt_filter_temp) else current_time_normalized
     logging.info(f"Date range for filtering: Start: {start_dt_filter}, End: {end_dt_filter}")
     # Filter DataFrames