GuglielmoTor committed on
Commit
77ff02d
·
verified ·
1 Parent(s): 7be0087

Update eb_agent_module.py

Browse files
Files changed (1) hide show
  1. eb_agent_module.py +59 -50
eb_agent_module.py CHANGED
@@ -483,75 +483,84 @@ class EmployerBrandingAgent:
483
  """Enhanced preprocessing to handle date casting issues and ensure chart generation"""
484
  if not self.all_dataframes:
485
  return
486
-
487
- for name, df in self.all_dataframes.items():
 
 
 
 
 
488
  if name.lower() in ['follower_stats', 'followers']:
489
- # Create a copy to avoid modifying original data
490
- df_copy = df.copy()
491
-
492
  # Handle category_name column that contains dates for follower_gains_monthly
493
  if 'category_name' in df_copy.columns and 'follower_count_type' in df_copy.columns:
494
- # Create a proper date column for date-based queries
495
  def extract_date_from_category(row):
496
  if row.get('follower_count_type') == 'follower_gains_monthly':
497
  category_name = str(row.get('category_name', ''))
498
- # Check if it matches YYYY-MM-DD format
499
  import re
500
  date_pattern = r'^\d{4}-\d{2}-\d{2}$'
501
  if re.match(date_pattern, category_name):
502
  return category_name
503
  return None
504
 
505
- # Add extracted_date column for cleaner date operations
506
  df_copy['extracted_date'] = df_copy.apply(extract_date_from_category, axis=1)
507
-
508
- # Convert extracted_date to proper datetime type and handle nulls
509
  df_copy['extracted_date'] = pd.to_datetime(df_copy['extracted_date'], errors='coerce')
510
 
511
- # Create additional helper columns for better analysis
512
  monthly_mask = df_copy['follower_count_type'] == 'follower_gains_monthly'
513
- df_copy.loc[monthly_mask, 'date_for_analysis'] = df_copy.loc[monthly_mask, 'extracted_date']
514
- df_copy.loc[monthly_mask, 'year_month'] = df_copy.loc[monthly_mask, 'extracted_date'].dt.strftime('%Y-%m')
515
- df_copy.loc[monthly_mask, 'month_name'] = df_copy.loc[monthly_mask, 'extracted_date'].dt.strftime('%B %Y')
516
-
517
- # Ensure follower_count is numeric and handle nulls
518
- if 'follower_count' in df_copy.columns:
519
- df_copy['follower_count'] = pd.to_numeric(df_copy['follower_count'], errors='coerce')
520
- df_copy['follower_count'] = df_copy['follower_count'].fillna(0)
521
-
522
- # Create separate monthly gains dataframe for easier analysis
523
- monthly_gains = df_copy[df_copy['follower_count_type'] == 'follower_gains_monthly'].copy()
524
- if not monthly_gains.empty:
525
- monthly_gains = monthly_gains.dropna(subset=['extracted_date'])
526
- monthly_gains = monthly_gains.sort_values('extracted_date')
527
- # Store as separate dataframe
528
- self.all_dataframes[f'{name}_monthly_gains'] = monthly_gains
529
-
530
- # Update the main dataframe
531
- self.all_dataframes[name] = df_copy
532
-
533
- logging.info(f"Preprocessed {name} dataframe for date handling. Monthly records: {len(monthly_gains) if not monthly_gains.empty else 0}")
534
-
535
- # General preprocessing for all dataframes
536
- df_processed = self.all_dataframes[name].copy()
537
-
538
- # Handle common data quality issues
539
- # Convert object columns that should be numeric
540
- for col in df_processed.columns:
541
- if df_processed[col].dtype == 'object':
542
- # Try to convert to numeric if it looks like numbers
543
- if df_processed[col].astype(str).str.match(r'^\d+\.?\d*$').any():
544
- df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')
545
 
546
- # Fill nulls in numeric columns with 0 (for charting)
547
- numeric_columns = df_processed.select_dtypes(include=[np.number]).columns
548
- df_processed[numeric_columns] = df_processed[numeric_columns].fillna(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
549
 
550
- # Fill nulls in text columns with empty string
551
- text_columns = df_processed.select_dtypes(include=['object']).columns
552
- df_processed[text_columns] = df_processed[text_columns].fillna('')
553
 
554
- self.all_dataframes[name] = df_processed
 
 
 
 
 
 
555
 
556
 
557
  def _build_system_prompt(self) -> str:
 
483
  """Enhanced preprocessing to handle date casting issues and ensure chart generation"""
484
  if not self.all_dataframes:
485
  return
486
+
487
+ dataframes_to_add = {} # To store newly created dataframes
488
+
489
+ # Iterate over a copy of the items to avoid runtime errors if modifying the dict
490
+ for name, df_original in list(self.all_dataframes.items()):
491
+ df_copy = df_original.copy() # Work on a copy for this iteration step
492
+
493
  if name.lower() in ['follower_stats', 'followers']:
 
 
 
494
  # Handle category_name column that contains dates for follower_gains_monthly
495
  if 'category_name' in df_copy.columns and 'follower_count_type' in df_copy.columns:
 
496
  def extract_date_from_category(row):
497
  if row.get('follower_count_type') == 'follower_gains_monthly':
498
  category_name = str(row.get('category_name', ''))
 
499
  import re
500
  date_pattern = r'^\d{4}-\d{2}-\d{2}$'
501
  if re.match(date_pattern, category_name):
502
  return category_name
503
  return None
504
 
 
505
  df_copy['extracted_date'] = df_copy.apply(extract_date_from_category, axis=1)
 
 
506
  df_copy['extracted_date'] = pd.to_datetime(df_copy['extracted_date'], errors='coerce')
507
 
 
508
  monthly_mask = df_copy['follower_count_type'] == 'follower_gains_monthly'
509
+ # Ensure extracted_date is not NaT before strftime
510
+ valid_dates_mask = monthly_mask & df_copy['extracted_date'].notna()
511
+
512
+ df_copy.loc[valid_dates_mask, 'date_for_analysis'] = df_copy.loc[valid_dates_mask, 'extracted_date']
513
+ df_copy.loc[valid_dates_mask, 'year_month'] = df_copy.loc[valid_dates_mask, 'extracted_date'].dt.strftime('%Y-%m')
514
+ df_copy.loc[valid_dates_mask, 'month_name'] = df_copy.loc[valid_dates_mask, 'extracted_date'].dt.strftime('%B %Y')
515
+
516
+ if 'follower_count' in df_copy.columns:
517
+ df_copy['follower_count'] = pd.to_numeric(df_copy['follower_count'], errors='coerce')
518
+ # df_copy['follower_count'] = df_copy['follower_count'].fillna(0) # Moved to general fillna
519
+
520
+ # Create separate monthly gains dataframe for easier analysis
521
+ if 'follower_count_type' in df_copy.columns and 'extracted_date' in df_copy.columns:
522
+ monthly_gains_df = df_copy[df_copy['follower_count_type'] == 'follower_gains_monthly'].copy()
523
+ if not monthly_gains_df.empty:
524
+ monthly_gains_df = monthly_gains_df.dropna(subset=['extracted_date'])
525
+ if not monthly_gains_df.empty: # Check again after dropna
526
+ monthly_gains_df = monthly_gains_df.sort_values('extracted_date')
527
+ # Store in the temporary dictionary
528
+ dataframes_to_add[f'{name}_monthly_gains'] = monthly_gains_df
529
+ logging.info(f"Created '{name}_monthly_gains' with {len(monthly_gains_df)} records.")
530
+
531
+ # Update the main dataframe in self.all_dataframes with these specific changes
532
+ self.all_dataframes[name] = df_copy.copy() # Save the processed df_copy
533
+ logging.info(f"Preprocessed '{name}' dataframe for date handling.")
 
 
 
 
 
 
 
534
 
535
+ # General preprocessing for the current dataframe (df_copy or df_original if not 'follower_stats')
536
+ # Fetch the potentially modified df_copy if it was processed above, otherwise use original df for this iteration
537
+ current_df_to_process = self.all_dataframes[name].copy()
538
+
539
+ # Convert object columns that look numeric
540
+ for col in current_df_to_process.columns:
541
+ if current_df_to_process[col].dtype == 'object':
542
+ try:
543
+ # Attempt conversion if a good portion of non-null values match numeric pattern
544
+ if current_df_to_process[col].str.match(r'^-?\d+\.?\d*$').sum() > len(current_df_to_process[col].dropna()) * 0.5:
545
+ current_df_to_process[col] = pd.to_numeric(current_df_to_process[col], errors='coerce')
546
+ logging.info(f"Converted column '{col}' in '{name}' to numeric.")
547
+ except AttributeError: # Handles cases where .str accessor fails (e.g. column has mixed types like numbers and lists)
548
+ logging.debug(f"Could not apply .str accessor to column '{col}' in '{name}'. Skipping numeric conversion for it.")
549
+
550
+
551
+ numeric_columns = current_df_to_process.select_dtypes(include=[np.number]).columns
552
+ current_df_to_process[numeric_columns] = current_df_to_process[numeric_columns].fillna(0)
553
 
554
+ text_columns = current_df_to_process.select_dtypes(include=['object']).columns
555
+ current_df_to_process[text_columns] = current_df_to_process[text_columns].fillna('')
 
556
 
557
+ # Update self.all_dataframes with the fully processed version for this key
558
+ self.all_dataframes[name] = current_df_to_process
559
+
560
+ # After the loop, add all newly created dataframes
561
+ if dataframes_to_add:
562
+ self.all_dataframes.update(dataframes_to_add)
563
+ logging.info(f"Added new derived dataframes: {list(dataframes_to_add.keys())}")
564
 
565
 
566
  def _build_system_prompt(self) -> str: