Spaces:

GuglielmoTor
/

LinkedinMonitor

Running

App Files Files Community

GuglielmoTor committed on May 27

Commit

77ff02d

verified ·

1 Parent(s): 7be0087

Update eb_agent_module.py

Browse files

Files changed (1) hide show

eb_agent_module.py +59 -50

eb_agent_module.py CHANGED Viewed

@@ -483,75 +483,84 @@ class EmployerBrandingAgent:
         """Enhanced preprocessing to handle date casting issues and ensure chart generation"""
         if not self.all_dataframes:
             return
-        for name, df in self.all_dataframes.items():
             if name.lower() in ['follower_stats', 'followers']:
-                # Create a copy to avoid modifying original data
-                df_copy = df.copy()
                 # Handle category_name column that contains dates for follower_gains_monthly
                 if 'category_name' in df_copy.columns and 'follower_count_type' in df_copy.columns:
-                    # Create a proper date column for date-based queries
                     def extract_date_from_category(row):
                         if row.get('follower_count_type') == 'follower_gains_monthly':
                             category_name = str(row.get('category_name', ''))
-                            # Check if it matches YYYY-MM-DD format
                             import re
                             date_pattern = r'^\d{4}-\d{2}-\d{2}$'
                             if re.match(date_pattern, category_name):
                                 return category_name
                         return None
-                    # Add extracted_date column for cleaner date operations
                     df_copy['extracted_date'] = df_copy.apply(extract_date_from_category, axis=1)
-                    # Convert extracted_date to proper datetime type and handle nulls
                     df_copy['extracted_date'] = pd.to_datetime(df_copy['extracted_date'], errors='coerce')
-                    # Create additional helper columns for better analysis
                     monthly_mask = df_copy['follower_count_type'] == 'follower_gains_monthly'
-                    df_copy.loc[monthly_mask, 'date_for_analysis'] = df_copy.loc[monthly_mask, 'extracted_date']
-                    df_copy.loc[monthly_mask, 'year_month'] = df_copy.loc[monthly_mask, 'extracted_date'].dt.strftime('%Y-%m')
-                    df_copy.loc[monthly_mask, 'month_name'] = df_copy.loc[monthly_mask, 'extracted_date'].dt.strftime('%B %Y')
-                    # Ensure follower_count is numeric and handle nulls
-                    if 'follower_count' in df_copy.columns:
-                        df_copy['follower_count'] = pd.to_numeric(df_copy['follower_count'], errors='coerce')
-                        df_copy['follower_count'] = df_copy['follower_count'].fillna(0)
-                    # Create separate monthly gains dataframe for easier analysis
-                    monthly_gains = df_copy[df_copy['follower_count_type'] == 'follower_gains_monthly'].copy()
-                    if not monthly_gains.empty:
-                        monthly_gains = monthly_gains.dropna(subset=['extracted_date'])
-                        monthly_gains = monthly_gains.sort_values('extracted_date')
-                        # Store as separate dataframe
-                        self.all_dataframes[f'{name}_monthly_gains'] = monthly_gains
-                    # Update the main dataframe
-                    self.all_dataframes[name] = df_copy
-                    logging.info(f"Preprocessed {name} dataframe for date handling. Monthly records: {len(monthly_gains) if not monthly_gains.empty else 0}")
-            # General preprocessing for all dataframes
-            df_processed = self.all_dataframes[name].copy()
-            # Handle common data quality issues
-            # Convert object columns that should be numeric
-            for col in df_processed.columns:
-                if df_processed[col].dtype == 'object':
-                    # Try to convert to numeric if it looks like numbers
-                    if df_processed[col].astype(str).str.match(r'^\d+\.?\d*$').any():
-                        df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')
-            # Fill nulls in numeric columns with 0 (for charting)
-            numeric_columns = df_processed.select_dtypes(include=[np.number]).columns
-            df_processed[numeric_columns] = df_processed[numeric_columns].fillna(0)
-            # Fill nulls in text columns with empty string
-            text_columns = df_processed.select_dtypes(include=['object']).columns
-            df_processed[text_columns] = df_processed[text_columns].fillna('')
-            self.all_dataframes[name] = df_processed
     def _build_system_prompt(self) -> str:

         """Enhanced preprocessing to handle date casting issues and ensure chart generation"""
         if not self.all_dataframes:
             return
+        dataframes_to_add = {} # To store newly created dataframes
+        # Iterate over a copy of the items to avoid runtime errors if modifying the dict
+        for name, df_original in list(self.all_dataframes.items()):
+            df_copy = df_original.copy() # Work on a copy for this iteration step
             if name.lower() in ['follower_stats', 'followers']:
                 # Handle category_name column that contains dates for follower_gains_monthly
                 if 'category_name' in df_copy.columns and 'follower_count_type' in df_copy.columns:
                     def extract_date_from_category(row):
                         if row.get('follower_count_type') == 'follower_gains_monthly':
                             category_name = str(row.get('category_name', ''))
                             import re
                             date_pattern = r'^\d{4}-\d{2}-\d{2}$'
                             if re.match(date_pattern, category_name):
                                 return category_name
                         return None
                     df_copy['extracted_date'] = df_copy.apply(extract_date_from_category, axis=1)
                     df_copy['extracted_date'] = pd.to_datetime(df_copy['extracted_date'], errors='coerce')
                     monthly_mask = df_copy['follower_count_type'] == 'follower_gains_monthly'
+                    # Ensure extracted_date is not NaT before strftime
+                    valid_dates_mask = monthly_mask & df_copy['extracted_date'].notna()
+                    df_copy.loc[valid_dates_mask, 'date_for_analysis'] = df_copy.loc[valid_dates_mask, 'extracted_date']
+                    df_copy.loc[valid_dates_mask, 'year_month'] = df_copy.loc[valid_dates_mask, 'extracted_date'].dt.strftime('%Y-%m')
+                    df_copy.loc[valid_dates_mask, 'month_name'] = df_copy.loc[valid_dates_mask, 'extracted_date'].dt.strftime('%B %Y')
+                if 'follower_count' in df_copy.columns:
+                    df_copy['follower_count'] = pd.to_numeric(df_copy['follower_count'], errors='coerce')
+                    # df_copy['follower_count'] = df_copy['follower_count'].fillna(0) # Moved to general fillna
+                # Create separate monthly gains dataframe for easier analysis
+                if 'follower_count_type' in df_copy.columns and 'extracted_date' in df_copy.columns:
+                    monthly_gains_df = df_copy[df_copy['follower_count_type'] == 'follower_gains_monthly'].copy()
+                    if not monthly_gains_df.empty:
+                        monthly_gains_df = monthly_gains_df.dropna(subset=['extracted_date'])
+                        if not monthly_gains_df.empty: # Check again after dropna
+                            monthly_gains_df = monthly_gains_df.sort_values('extracted_date')
+                            # Store in the temporary dictionary
+                            dataframes_to_add[f'{name}_monthly_gains'] = monthly_gains_df
+                            logging.info(f"Created '{name}_monthly_gains' with {len(monthly_gains_df)} records.")
+                # Update the main dataframe in self.all_dataframes with these specific changes
+                self.all_dataframes[name] = df_copy.copy() # Save the processed df_copy
+                logging.info(f"Preprocessed '{name}' dataframe for date handling.")
+            # General preprocessing for the current dataframe (df_copy or df_original if not 'follower_stats')
+            # Fetch the potentially modified df_copy if it was processed above, otherwise use original df for this iteration
+            current_df_to_process = self.all_dataframes[name].copy()
+            # Convert object columns that look numeric
+            for col in current_df_to_process.columns:
+                if current_df_to_process[col].dtype == 'object':
+                    try:
+                        # Attempt conversion if a good portion of non-null values match numeric pattern
+                        if current_df_to_process[col].str.match(r'^-?\d+\.?\d*$').sum() > len(current_df_to_process[col].dropna()) * 0.5:
+                             current_df_to_process[col] = pd.to_numeric(current_df_to_process[col], errors='coerce')
+                             logging.info(f"Converted column '{col}' in '{name}' to numeric.")
+                    except AttributeError: # Handles cases where .str accessor fails (e.g. column has mixed types like numbers and lists)
+                        logging.debug(f"Could not apply .str accessor to column '{col}' in '{name}'. Skipping numeric conversion for it.")
+            numeric_columns = current_df_to_process.select_dtypes(include=[np.number]).columns
+            current_df_to_process[numeric_columns] = current_df_to_process[numeric_columns].fillna(0)
+            text_columns = current_df_to_process.select_dtypes(include=['object']).columns
+            current_df_to_process[text_columns] = current_df_to_process[text_columns].fillna('')
+            # Update self.all_dataframes with the fully processed version for this key
+            self.all_dataframes[name] = current_df_to_process
+        # After the loop, add all newly created dataframes
+        if dataframes_to_add:
+            self.all_dataframes.update(dataframes_to_add)
+            logging.info(f"Added new derived dataframes: {list(dataframes_to_add.keys())}")
     def _build_system_prompt(self) -> str: