Spaces:
Running
Running
Update eb_agent_module.py
Browse files- eb_agent_module.py +59 -50
eb_agent_module.py
CHANGED
@@ -483,75 +483,84 @@ class EmployerBrandingAgent:
|
|
483 |
"""Enhanced preprocessing to handle date casting issues and ensure chart generation"""
|
484 |
if not self.all_dataframes:
|
485 |
return
|
486 |
-
|
487 |
-
|
|
|
|
|
|
|
|
|
|
|
488 |
if name.lower() in ['follower_stats', 'followers']:
|
489 |
-
# Create a copy to avoid modifying original data
|
490 |
-
df_copy = df.copy()
|
491 |
-
|
492 |
# Handle category_name column that contains dates for follower_gains_monthly
|
493 |
if 'category_name' in df_copy.columns and 'follower_count_type' in df_copy.columns:
|
494 |
-
# Create a proper date column for date-based queries
|
495 |
def extract_date_from_category(row):
|
496 |
if row.get('follower_count_type') == 'follower_gains_monthly':
|
497 |
category_name = str(row.get('category_name', ''))
|
498 |
-
# Check if it matches YYYY-MM-DD format
|
499 |
import re
|
500 |
date_pattern = r'^\d{4}-\d{2}-\d{2}$'
|
501 |
if re.match(date_pattern, category_name):
|
502 |
return category_name
|
503 |
return None
|
504 |
|
505 |
-
# Add extracted_date column for cleaner date operations
|
506 |
df_copy['extracted_date'] = df_copy.apply(extract_date_from_category, axis=1)
|
507 |
-
|
508 |
-
# Convert extracted_date to proper datetime type and handle nulls
|
509 |
df_copy['extracted_date'] = pd.to_datetime(df_copy['extracted_date'], errors='coerce')
|
510 |
|
511 |
-
# Create additional helper columns for better analysis
|
512 |
monthly_mask = df_copy['follower_count_type'] == 'follower_gains_monthly'
|
513 |
-
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
-
#
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
|
530 |
-
|
531 |
-
|
532 |
-
|
533 |
-
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
# Handle common data quality issues
|
539 |
-
# Convert object columns that should be numeric
|
540 |
-
for col in df_processed.columns:
|
541 |
-
if df_processed[col].dtype == 'object':
|
542 |
-
# Try to convert to numeric if it looks like numbers
|
543 |
-
if df_processed[col].astype(str).str.match(r'^\d+\.?\d*$').any():
|
544 |
-
df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')
|
545 |
|
546 |
-
#
|
547 |
-
|
548 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
549 |
|
550 |
-
|
551 |
-
text_columns =
|
552 |
-
df_processed[text_columns] = df_processed[text_columns].fillna('')
|
553 |
|
554 |
-
self.all_dataframes
|
|
|
|
|
|
|
|
|
|
|
|
|
555 |
|
556 |
|
557 |
def _build_system_prompt(self) -> str:
|
|
|
483 |
"""Enhanced preprocessing to handle date casting issues and ensure chart generation"""
|
484 |
if not self.all_dataframes:
|
485 |
return
|
486 |
+
|
487 |
+
dataframes_to_add = {} # To store newly created dataframes
|
488 |
+
|
489 |
+
# Iterate over a copy of the items to avoid runtime errors if modifying the dict
|
490 |
+
for name, df_original in list(self.all_dataframes.items()):
|
491 |
+
df_copy = df_original.copy() # Work on a copy for this iteration step
|
492 |
+
|
493 |
if name.lower() in ['follower_stats', 'followers']:
|
|
|
|
|
|
|
494 |
# Handle category_name column that contains dates for follower_gains_monthly
|
495 |
if 'category_name' in df_copy.columns and 'follower_count_type' in df_copy.columns:
|
|
|
496 |
def extract_date_from_category(row):
|
497 |
if row.get('follower_count_type') == 'follower_gains_monthly':
|
498 |
category_name = str(row.get('category_name', ''))
|
|
|
499 |
import re
|
500 |
date_pattern = r'^\d{4}-\d{2}-\d{2}$'
|
501 |
if re.match(date_pattern, category_name):
|
502 |
return category_name
|
503 |
return None
|
504 |
|
|
|
505 |
df_copy['extracted_date'] = df_copy.apply(extract_date_from_category, axis=1)
|
|
|
|
|
506 |
df_copy['extracted_date'] = pd.to_datetime(df_copy['extracted_date'], errors='coerce')
|
507 |
|
|
|
508 |
monthly_mask = df_copy['follower_count_type'] == 'follower_gains_monthly'
|
509 |
+
# Ensure extracted_date is not NaT before strftime
|
510 |
+
valid_dates_mask = monthly_mask & df_copy['extracted_date'].notna()
|
511 |
+
|
512 |
+
df_copy.loc[valid_dates_mask, 'date_for_analysis'] = df_copy.loc[valid_dates_mask, 'extracted_date']
|
513 |
+
df_copy.loc[valid_dates_mask, 'year_month'] = df_copy.loc[valid_dates_mask, 'extracted_date'].dt.strftime('%Y-%m')
|
514 |
+
df_copy.loc[valid_dates_mask, 'month_name'] = df_copy.loc[valid_dates_mask, 'extracted_date'].dt.strftime('%B %Y')
|
515 |
+
|
516 |
+
if 'follower_count' in df_copy.columns:
|
517 |
+
df_copy['follower_count'] = pd.to_numeric(df_copy['follower_count'], errors='coerce')
|
518 |
+
# df_copy['follower_count'] = df_copy['follower_count'].fillna(0) # Moved to general fillna
|
519 |
+
|
520 |
+
# Create separate monthly gains dataframe for easier analysis
|
521 |
+
if 'follower_count_type' in df_copy.columns and 'extracted_date' in df_copy.columns:
|
522 |
+
monthly_gains_df = df_copy[df_copy['follower_count_type'] == 'follower_gains_monthly'].copy()
|
523 |
+
if not monthly_gains_df.empty:
|
524 |
+
monthly_gains_df = monthly_gains_df.dropna(subset=['extracted_date'])
|
525 |
+
if not monthly_gains_df.empty: # Check again after dropna
|
526 |
+
monthly_gains_df = monthly_gains_df.sort_values('extracted_date')
|
527 |
+
# Store in the temporary dictionary
|
528 |
+
dataframes_to_add[f'{name}_monthly_gains'] = monthly_gains_df
|
529 |
+
logging.info(f"Created '{name}_monthly_gains' with {len(monthly_gains_df)} records.")
|
530 |
+
|
531 |
+
# Update the main dataframe in self.all_dataframes with these specific changes
|
532 |
+
self.all_dataframes[name] = df_copy.copy() # Save the processed df_copy
|
533 |
+
logging.info(f"Preprocessed '{name}' dataframe for date handling.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
534 |
|
535 |
+
# General preprocessing for the current dataframe (df_copy or df_original if not 'follower_stats')
|
536 |
+
# Fetch the potentially modified df_copy if it was processed above, otherwise use original df for this iteration
|
537 |
+
current_df_to_process = self.all_dataframes[name].copy()
|
538 |
+
|
539 |
+
# Convert object columns that look numeric
|
540 |
+
for col in current_df_to_process.columns:
|
541 |
+
if current_df_to_process[col].dtype == 'object':
|
542 |
+
try:
|
543 |
+
# Attempt conversion only if more than half of the non-null values match the numeric pattern
|
544 |
+
if current_df_to_process[col].str.match(r'^-?\d+\.?\d*$').sum() > len(current_df_to_process[col].dropna()) * 0.5:
|
545 |
+
current_df_to_process[col] = pd.to_numeric(current_df_to_process[col], errors='coerce')
|
546 |
+
logging.info(f"Converted column '{col}' in '{name}' to numeric.")
|
547 |
+
except AttributeError: # Handles cases where .str accessor fails (e.g. column has mixed types like numbers and lists)
|
548 |
+
logging.debug(f"Could not apply .str accessor to column '{col}' in '{name}'. Skipping numeric conversion for it.")
|
549 |
+
|
550 |
+
|
551 |
+
numeric_columns = current_df_to_process.select_dtypes(include=[np.number]).columns
|
552 |
+
current_df_to_process[numeric_columns] = current_df_to_process[numeric_columns].fillna(0)
|
553 |
|
554 |
+
text_columns = current_df_to_process.select_dtypes(include=['object']).columns
|
555 |
+
current_df_to_process[text_columns] = current_df_to_process[text_columns].fillna('')
|
|
|
556 |
|
557 |
+
# Update self.all_dataframes with the fully processed version for this key
|
558 |
+
self.all_dataframes[name] = current_df_to_process
|
559 |
+
|
560 |
+
# After the loop, add all newly created dataframes
|
561 |
+
if dataframes_to_add:
|
562 |
+
self.all_dataframes.update(dataframes_to_add)
|
563 |
+
logging.info(f"Added new derived dataframes: {list(dataframes_to_add.keys())}")
|
564 |
|
565 |
|
566 |
def _build_system_prompt(self) -> str:
|