GuglielmoTor committed on
Commit
56e12df
·
verified ·
1 Parent(s): 58eb0f9

Update analytics_data_processing.py

Browse files
Files changed (1) hide show
  1. analytics_data_processing.py +28 -17
analytics_data_processing.py CHANGED
@@ -16,27 +16,41 @@ def filter_dataframe_by_date(df, date_column, start_date, end_date):
16
 
17
  df_copy = df.copy() # Work on a copy to avoid SettingWithCopyWarning
18
  try:
19
- # Convert the DataFrame's date column to pandas datetime objects first
20
  if not pd.api.types.is_datetime64_any_dtype(df_copy[date_column]):
21
  df_copy[date_column] = pd.to_datetime(df_copy[date_column], errors='coerce')
22
- # Normalize the DataFrame's date column to midnight (date part only)
 
 
 
 
 
 
 
23
  df_copy[date_column] = df_copy[date_column].dt.normalize()
24
 
 
 
 
 
 
 
25
  except Exception as e:
26
- logging.error(f"Error converting or normalizing date column '{date_column}' to datetime: {e}")
27
- return pd.DataFrame() # Return empty if conversion fails
28
 
29
- df_filtered = df_copy.dropna(subset=[date_column])
30
- if df_filtered.empty:
31
- logging.info(f"Filter by date: DataFrame became empty after dropping NaNs in date column '{date_column}'.")
 
32
  return pd.DataFrame()
33
 
34
- # Convert start_date and end_date (which are expected to be datetime.datetime or None)
35
- # to pandas Timestamps and normalize them for comparison
36
  start_dt_obj = pd.to_datetime(start_date, errors='coerce').normalize() if start_date else None
37
  end_dt_obj = pd.to_datetime(end_date, errors='coerce').normalize() if end_date else None
38
 
39
-
40
  if start_dt_obj and end_dt_obj:
41
  return df_filtered[(df_filtered[date_column] >= start_dt_obj) & (df_filtered[date_column] <= end_dt_obj)]
42
  elif start_dt_obj:
@@ -62,9 +76,8 @@ def prepare_filtered_analytics_data(token_state_value, date_filter_option, custo
62
  date_column_mentions = token_state_value.get("config_date_col_mentions", "date")
63
 
64
  # Determine date range for filtering posts and mentions
65
- # Normalize current time to midnight using datetime.replace
66
  current_datetime_obj = datetime.now()
67
- current_time_normalized = current_datetime_obj.replace(hour=0, minute=0, second=0, microsecond=0)
68
 
69
  end_dt_filter = current_time_normalized
70
  start_dt_filter = None
@@ -75,16 +88,14 @@ def prepare_filtered_analytics_data(token_state_value, date_filter_option, custo
75
  start_dt_filter = current_time_normalized - timedelta(days=29)
76
  elif date_filter_option == "Custom Range":
77
  # custom_start_date and custom_end_date are strings from gr.DateTime(type="string")
78
- # Convert to datetime objects and then normalize
79
  start_dt_filter_temp = pd.to_datetime(custom_start_date, errors='coerce')
80
- start_dt_filter = start_dt_filter_temp.replace(hour=0, minute=0, second=0, microsecond=0) if pd.notna(start_dt_filter_temp) else None
 
81
 
82
  end_dt_filter_temp = pd.to_datetime(custom_end_date, errors='coerce')
83
- # If custom_end_date is not provided or invalid, use current_time_normalized
84
  end_dt_filter = end_dt_filter_temp.replace(hour=0, minute=0, second=0, microsecond=0) if pd.notna(end_dt_filter_temp) else current_time_normalized
85
 
86
- # "All Time" means start_dt_filter remains None, end_dt_filter effectively means up to now.
87
-
88
  logging.info(f"Date range for filtering: Start: {start_dt_filter}, End: {end_dt_filter}")
89
 
90
  # Filter DataFrames
 
16
 
17
  df_copy = df.copy() # Work on a copy to avoid SettingWithCopyWarning
18
  try:
19
+ # Ensure the date column is pandas datetime objects
20
  if not pd.api.types.is_datetime64_any_dtype(df_copy[date_column]):
21
  df_copy[date_column] = pd.to_datetime(df_copy[date_column], errors='coerce')
22
+
23
+ # Drop rows where date conversion might have failed (NaT) or was originally NaT
24
+ df_copy.dropna(subset=[date_column], inplace=True)
25
+ if df_copy.empty:
26
+ logging.info(f"Filter by date: DataFrame empty after to_datetime and dropna for column '{date_column}'.")
27
+ return pd.DataFrame()
28
+
29
+ # Normalize to midnight. This preserves timezone information if present.
30
  df_copy[date_column] = df_copy[date_column].dt.normalize()
31
 
32
+ # If the column is timezone-aware, convert its values to naive UTC equivalent.
33
+ # This allows comparison with naive filter dates.
34
+ if hasattr(df_copy[date_column].dt, 'tz') and df_copy[date_column].dt.tz is not None:
35
+ logging.info(f"Column '{date_column}' is timezone-aware ({df_copy[date_column].dt.tz}). Converting to naive (from UTC) for comparison.")
36
+ df_copy[date_column] = df_copy[date_column].dt.tz_convert('UTC').dt.tz_localize(None)
37
+
38
  except Exception as e:
39
+ logging.error(f"Error processing date column '{date_column}': {e}", exc_info=True)
40
+ return pd.DataFrame()
41
 
42
+ df_filtered = df_copy # df_copy is now processed and potentially filtered by dropna
43
+ # No need for: df_filtered = df_copy.dropna(subset=[date_column]) again here.
44
+ if df_filtered.empty: # Check again in case all rows were dropped or some other issue.
45
+ logging.info(f"Filter by date: DataFrame became empty after processing date column '{date_column}'.")
46
  return pd.DataFrame()
47
 
48
+ # Convert start_date and end_date (which are naive Python datetime or naive Pandas Timestamp)
49
+ # to naive pandas Timestamps and normalize them.
50
  start_dt_obj = pd.to_datetime(start_date, errors='coerce').normalize() if start_date else None
51
  end_dt_obj = pd.to_datetime(end_date, errors='coerce').normalize() if end_date else None
52
 
53
+ # Perform the filtering
54
  if start_dt_obj and end_dt_obj:
55
  return df_filtered[(df_filtered[date_column] >= start_dt_obj) & (df_filtered[date_column] <= end_dt_obj)]
56
  elif start_dt_obj:
 
76
  date_column_mentions = token_state_value.get("config_date_col_mentions", "date")
77
 
78
  # Determine date range for filtering posts and mentions
 
79
  current_datetime_obj = datetime.now()
80
+ current_time_normalized = current_datetime_obj.replace(hour=0, minute=0, second=0, microsecond=0) # Naive Python datetime
81
 
82
  end_dt_filter = current_time_normalized
83
  start_dt_filter = None
 
88
  start_dt_filter = current_time_normalized - timedelta(days=29)
89
  elif date_filter_option == "Custom Range":
90
  # custom_start_date and custom_end_date are strings from gr.DateTime(type="string")
91
+ # Convert to pandas Timestamp (which will be naive if input string is naive) then normalize using pandas method
92
  start_dt_filter_temp = pd.to_datetime(custom_start_date, errors='coerce')
93
+ # .replace() on pandas Timestamp normalizes time part
94
+ start_dt_filter = start_dt_filter_temp.replace(hour=0, minute=0, second=0, microsecond=0) if pd.notna(start_dt_filter_temp) else None
95
 
96
  end_dt_filter_temp = pd.to_datetime(custom_end_date, errors='coerce')
 
97
  end_dt_filter = end_dt_filter_temp.replace(hour=0, minute=0, second=0, microsecond=0) if pd.notna(end_dt_filter_temp) else current_time_normalized
98
 
 
 
99
  logging.info(f"Date range for filtering: Start: {start_dt_filter}, End: {end_dt_filter}")
100
 
101
  # Filter DataFrames