Spaces:

GuglielmoTor
/

LinkedinMonitor

Running

App Files Files Community

GuglielmoTor committed on May 27

Commit

7be0087

verified ·

1 Parent(s): cf1cd44

Update eb_agent_module.py

Browse files

Files changed (1) hide show

eb_agent_module.py +184 -87

eb_agent_module.py CHANGED Viewed

@@ -355,41 +355,48 @@ class EmployerBrandingAgent:
         logging.info(f"EnhancedEmployerBrandingAgent initialized. LLM: {self.llm_model_name}. RAG docs: {len(self.rag_system.documents_df)}. DataFrames: {list(self.all_dataframes.keys())}")
     def _initialize_pandas_agent(self):
-        """Initialize PandasAI with enhanced configuration"""
         if not self.all_dataframes or not GEMINI_API_KEY:
             logging.warning("Cannot initialize PandasAI agent: missing dataframes or API key")
             return
         self._preprocess_dataframes_for_pandas_ai()
         try:
             # Configure LiteLLM with Gemini
             llm = LiteLLM(
-                model="gemini/gemini-2.5-flash-preview-05-20",  # Use gemini/ prefix for Gemini API
                 api_key=GEMINI_API_KEY
             )
-            # Set PandasAI configuration
             pai.config.set({
-            "llm": llm,
-            "temperature": 0.7,
-            "verbose": True,
-            "enable_cache": True,
-            "save_charts": True,  # Enable chart saving
-            "save_charts_path": "./charts",  # Directory to save charts
-            "open_charts": False,  # Don't auto-open charts in browser
-            "custom_whitelisted_dependencies": ["matplotlib", "seaborn", "plotly"]  # Allow plotting libraries
             })
-            # Store dataframes for chat queries (we'll use them directly)
             self.pandas_dfs = {}
             for name, df in self.all_dataframes.items():
-                # Convert to PandasAI DataFrame with description
                 df_description = self._generate_dataframe_description(name, df)
                 pandas_df = pai.DataFrame(df, description=df_description)
                 self.pandas_dfs[name] = pandas_df
-            self.pandas_agent = True  # Flag to indicate PandasAI is ready
             logging.info(f"PandasAI initialized successfully with {len(self.pandas_dfs)} DataFrames")
         except Exception as e:
@@ -398,7 +405,7 @@ class EmployerBrandingAgent:
             self.pandas_dfs = {}
     def _generate_dataframe_description(self, name: str, df: pd.DataFrame) -> str:
-        """Generate a descriptive summary for PandasAI to better understand the data"""
         description_parts = [f"This is the '{name}' dataset containing {len(df)} records."]
         # Add column descriptions based on common patterns
@@ -421,12 +428,21 @@ class EmployerBrandingAgent:
         if column_descriptions:
             description_parts.append("Key columns: " + "; ".join(column_descriptions))
-        # Add specific context for employer branding
-        # Special handling for follower_stats
         if name.lower() in ['follower_stats', 'followers']:
-            description_parts.append("This data tracks LinkedIn company page follower growth and demographics. For monthly growth data, use the 'extracted_date' column for date-based queries instead of trying to cast 'category_name' as a date.")
-            if 'extracted_date' in df.columns:
-                description_parts.append("The 'extracted_date' column contains properly formatted dates (YYYY-MM-DD) extracted from category_name for follower_gains_monthly records.")
         elif name.lower() in ['posts', 'post_stats']:
             description_parts.append("This data contains LinkedIn post performance metrics for employer branding content analysis.")
         elif name.lower() in ['mentions', 'brand_mentions']:
@@ -464,7 +480,7 @@ class EmployerBrandingAgent:
         return get_all_schemas_representation(self.all_dataframes)
     def _preprocess_dataframes_for_pandas_ai(self):
-        """Preprocess dataframes to handle date casting issues before PandasAI analysis"""
         if not self.all_dataframes:
             return
@@ -489,10 +505,54 @@ class EmployerBrandingAgent:
                     # Add extracted_date column for cleaner date operations
                     df_copy['extracted_date'] = df_copy.apply(extract_date_from_category, axis=1)
-                    # Update the dataframe in our collection
                     self.all_dataframes[name] = df_copy
-                logging.info(f"Preprocessed {name} dataframe for date handling")
     def _build_system_prompt(self) -> str:
         """Enhanced system prompt that works with PandasAI integration"""
@@ -582,73 +642,110 @@ class EmployerBrandingAgent:
 # Replace the _generate_pandas_response method and everything after it with this properly indented code:
-    async def _generate_pandas_response(self, query: str) -> tuple[str, bool]:
-        """Generate response using PandasAI for data queries"""
-        if not self.pandas_agent or not hasattr(self, 'pandas_dfs'):
-            return "Data analysis not available - PandasAI not initialized.", False
-        try:
-            logging.info(f"Processing data query with PandasAI: {query[:100]}...")
-            # Clear any existing matplotlib figures to avoid conflicts
-            import matplotlib.pyplot as plt
-            plt.clf()
-            plt.close('all')
-            # Use the first available dataframe for single-df queries
-            if len(self.pandas_dfs) == 1:
-                df = list(self.pandas_dfs.values())[0]
-                logging.info(f"Using single DataFrame for query with shape: {df.df.shape}")
-                pandas_response = df.chat(query)
-            else:
-                # For multiple dataframes, use pai.chat with all dfs
-                dfs = list(self.pandas_dfs.values())
-                pandas_response = pai.chat(query, *dfs)
-            # Handle different response types
-            response_text = ""
-            chart_info = ""
-            # Check if response is a plot path or contains plot information
-            if isinstance(pandas_response, str) and pandas_response.endswith(('.png', '.jpg', '.jpeg', '.svg')):
-                # Response is a chart path
-                chart_info = f"\n\n📊 **Chart Generated**: {os.path.basename(pandas_response)}\nChart saved at: {pandas_response}"
-                response_text = "Analysis completed with visualization"
-                logging.info(f"Chart generated: {pandas_response}")
-            elif hasattr(pandas_response, 'plot_path') and pandas_response.plot_path:
-                # Response object has plot path
-                chart_info = f"\n\n📊 **Chart Generated**: {os.path.basename(pandas_response.plot_path)}\nChart saved at: {pandas_response.plot_path}"
-                response_text = getattr(pandas_response, 'text', str(pandas_response))
-                logging.info(f"Chart generated: {pandas_response.plot_path}")
-            else:
-                # Check for any new chart files in the charts directory
-                if os.path.exists(self.charts_dir):
-                    chart_files = [f for f in os.listdir(self.charts_dir) if f.endswith(('.png', '.jpg', '.jpeg', '.svg'))]
-                    if chart_files:
-                        # Get the most recent chart file
-                        chart_files.sort(key=lambda x: os.path.getmtime(os.path.join(self.charts_dir, x)), reverse=True)
-                        latest_chart = chart_files[0]
-                        chart_path = os.path.join(self.charts_dir, latest_chart)
-                        # Check if this chart was created in the last 30 seconds (likely from this query)
-                        import time
-                        if time.time() - os.path.getmtime(chart_path) < 30:
-                            chart_info = f"\n\n📊 **Chart Generated**: {latest_chart}\nChart saved at: {chart_path}"
-                            logging.info(f"Chart generated: {chart_path}")
-                # Handle text response
-                if pandas_response and str(pandas_response).strip():
-                    response_text = str(pandas_response).strip()
-                else:
-                    response_text = "Analysis completed"
-            final_response = response_text + chart_info
-            return final_response, True
-        except Exception as e:
-            logging.error(f"Error in PandasAI processing: {e}", exc_info=True)
-            # Try to provide a more helpful error message
-            if "Invalid output" in str(e) and "plot save path" in str(e):
-                return "I tried to create a visualization but encountered a formatting issue. Please try rephrasing your request or ask for specific data without requesting a chart.", False
             return f"Error processing data query: {str(e)}", False
     async def _generate_enhanced_response(self, query: str, pandas_result: str = "", query_type: str = "general") -> str:

         logging.info(f"EnhancedEmployerBrandingAgent initialized. LLM: {self.llm_model_name}. RAG docs: {len(self.rag_system.documents_df)}. DataFrames: {list(self.all_dataframes.keys())}")
     def _initialize_pandas_agent(self):
+        """Initialize PandasAI with enhanced configuration for chart generation"""
         if not self.all_dataframes or not GEMINI_API_KEY:
             logging.warning("Cannot initialize PandasAI agent: missing dataframes or API key")
             return
         self._preprocess_dataframes_for_pandas_ai()
         try:
             # Configure LiteLLM with Gemini
             llm = LiteLLM(
+                model="gemini/gemini-2.5-flash-preview-05-20",
                 api_key=GEMINI_API_KEY
             )
+            # Enhanced PandasAI configuration for better chart generation
             pai.config.set({
+                "llm": llm,
+                "temperature": 0.3,  # Lower temperature for more consistent results
+                "verbose": True,
+                "enable_cache": False,  # Disable cache to avoid stale results
+                "save_charts": True,
+                "save_charts_path": "./charts",
+                "open_charts": False,
+                "custom_whitelisted_dependencies": [
+                    "matplotlib", "seaborn", "plotly", "pandas", "numpy"
+                ],
+                "max_retries": 3,  # Add retry logic
+                "use_error_correction_framework": True  # Enable error correction
             })
+            # Store dataframes for chat queries
             self.pandas_dfs = {}
             for name, df in self.all_dataframes.items():
+                # Skip empty dataframes
+                if df.empty:
+                    continue
                 df_description = self._generate_dataframe_description(name, df)
                 pandas_df = pai.DataFrame(df, description=df_description)
                 self.pandas_dfs[name] = pandas_df
+            self.pandas_agent = True
             logging.info(f"PandasAI initialized successfully with {len(self.pandas_dfs)} DataFrames")
         except Exception as e:
             self.pandas_dfs = {}
     def _generate_dataframe_description(self, name: str, df: pd.DataFrame) -> str:
+        """Enhanced dataframe description for better PandasAI understanding"""
         description_parts = [f"This is the '{name}' dataset containing {len(df)} records."]
         # Add column descriptions based on common patterns
         if column_descriptions:
             description_parts.append("Key columns: " + "; ".join(column_descriptions))
+        # Enhanced context for specific datasets
         if name.lower() in ['follower_stats', 'followers']:
+            description_parts.append("""
+            This data tracks LinkedIn company page follower growth and demographics.
+            For monthly growth analysis, use records where follower_count_type='follower_gains_monthly'.
+            The 'extracted_date' column contains properly formatted dates for time series analysis.
+            Use 'year_month' or 'month_name' columns for better date display in charts.
+            For cumulative analysis, use records where follower_count_type='follower_count_cumulative'.
+            """)
+        elif name.lower().endswith('_monthly_gains'):
+            description_parts.append("""
+            This is a filtered dataset containing only monthly follower gains data.
+            All records have valid dates and are sorted chronologically.
+            Use this for creating time series charts of monthly growth patterns.
+            """)
         elif name.lower() in ['posts', 'post_stats']:
             description_parts.append("This data contains LinkedIn post performance metrics for employer branding content analysis.")
         elif name.lower() in ['mentions', 'brand_mentions']:
         return get_all_schemas_representation(self.all_dataframes)
     def _preprocess_dataframes_for_pandas_ai(self):
+        """Enhanced preprocessing to handle date casting issues and ensure chart generation"""
         if not self.all_dataframes:
             return
                     # Add extracted_date column for cleaner date operations
                     df_copy['extracted_date'] = df_copy.apply(extract_date_from_category, axis=1)
+                    # Convert extracted_date to proper datetime type and handle nulls
+                    df_copy['extracted_date'] = pd.to_datetime(df_copy['extracted_date'], errors='coerce')
+                    # Create additional helper columns for better analysis
+                    monthly_mask = df_copy['follower_count_type'] == 'follower_gains_monthly'
+                    df_copy.loc[monthly_mask, 'date_for_analysis'] = df_copy.loc[monthly_mask, 'extracted_date']
+                    df_copy.loc[monthly_mask, 'year_month'] = df_copy.loc[monthly_mask, 'extracted_date'].dt.strftime('%Y-%m')
+                    df_copy.loc[monthly_mask, 'month_name'] = df_copy.loc[monthly_mask, 'extracted_date'].dt.strftime('%B %Y')
+                    # Ensure follower_count is numeric and handle nulls
+                    if 'follower_count' in df_copy.columns:
+                        df_copy['follower_count'] = pd.to_numeric(df_copy['follower_count'], errors='coerce')
+                        df_copy['follower_count'] = df_copy['follower_count'].fillna(0)
+                    # Create separate monthly gains dataframe for easier analysis
+                    monthly_gains = df_copy[df_copy['follower_count_type'] == 'follower_gains_monthly'].copy()
+                    if not monthly_gains.empty:
+                        monthly_gains = monthly_gains.dropna(subset=['extracted_date'])
+                        monthly_gains = monthly_gains.sort_values('extracted_date')
+                        # Store as separate dataframe
+                        self.all_dataframes[f'{name}_monthly_gains'] = monthly_gains
+                    # Update the main dataframe
                     self.all_dataframes[name] = df_copy
+                    logging.info(f"Preprocessed {name} dataframe for date handling. Monthly records: {len(monthly_gains) if not monthly_gains.empty else 0}")
+            # General preprocessing for all dataframes
+            df_processed = self.all_dataframes[name].copy()
+            # Handle common data quality issues
+            # Convert object columns that should be numeric
+            for col in df_processed.columns:
+                if df_processed[col].dtype == 'object':
+                    # Try to convert to numeric if it looks like numbers
+                    if df_processed[col].astype(str).str.match(r'^\d+\.?\d*$').any():
+                        df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')
+            # Fill nulls in numeric columns with 0 (for charting)
+            numeric_columns = df_processed.select_dtypes(include=[np.number]).columns
+            df_processed[numeric_columns] = df_processed[numeric_columns].fillna(0)
+            # Fill nulls in text columns with empty string
+            text_columns = df_processed.select_dtypes(include=['object']).columns
+            df_processed[text_columns] = df_processed[text_columns].fillna('')
+            self.all_dataframes[name] = df_processed
     def _build_system_prompt(self) -> str:
         """Enhanced system prompt that works with PandasAI integration"""
 # Replace the _generate_pandas_response method and everything after it with this properly indented code:
+async def _generate_pandas_response(self, query: str) -> tuple[str, bool]:
+    """Generate response using PandasAI with enhanced error handling"""
+    if not self.pandas_agent or not hasattr(self, 'pandas_dfs'):
+        return "Data analysis not available - PandasAI not initialized.", False
+    try:
+        logging.info(f"Processing data query with PandasAI: {query[:100]}...")
+        # Clear any existing matplotlib figures
+        import matplotlib.pyplot as plt
+        plt.clf()
+        plt.close('all')
+        # Enhanced query processing based on content
+        processed_query = query
+        # Add helpful context for common chart requests
+        if any(word in query.lower() for word in ['chart', 'graph', 'plot', 'visualize']):
+            if 'monthly' in query.lower() and 'follower' in query.lower():
+                processed_query += """.
+                Use the monthly gains data (follower_count_type='follower_gains_monthly')
+                and use the extracted_date or month_name column for the x-axis.
+                Make sure to filter out any null dates and sort by date.
+                Create a clear line chart showing the trend over time."""
+            elif 'cumulative' in query.lower() and 'follower' in query.lower():
+                processed_query += """.
+                Use the cumulative data (follower_count_type='follower_count_cumulative')
+                and create a chart showing the total follower growth over time."""
+        # Execute the query
+        if len(self.pandas_dfs) == 1:
+            df = list(self.pandas_dfs.values())[0]
+            logging.info(f"Using single DataFrame for query with shape: {df.df.shape}")
+            pandas_response = df.chat(processed_query)
+        else:
+            dfs = list(self.pandas_dfs.values())
+            pandas_response = pai.chat(processed_query, *dfs)
+        # Enhanced response processing
+        response_text = ""
+        chart_info = ""
+        # Check for chart generation
+        chart_path = None
+        # Method 1: Direct path response
+        if isinstance(pandas_response, str) and pandas_response.endswith(('.png', '.jpg', '.jpeg', '.svg')):
+            chart_path = pandas_response
+            response_text = "Analysis completed with visualization"
+        # Method 2: Response object with plot path
+        elif hasattr(pandas_response, 'plot_path') and pandas_response.plot_path:
+            chart_path = pandas_response.plot_path
+            response_text = getattr(pandas_response, 'text', str(pandas_response))
+        # Method 3: Check charts directory for new files
+        else:
+            if os.path.exists(self.charts_dir):
+                # Get all chart files sorted by modification time
+                chart_files = []
+                for f in os.listdir(self.charts_dir):
+                    if f.endswith(('.png', '.jpg', '.jpeg', '.svg')):
+                        full_path = os.path.join(self.charts_dir, f)
+                        chart_files.append((full_path, os.path.getmtime(full_path)))
+                if chart_files:
+                    # Sort by modification time (newest first)
+                    chart_files.sort(key=lambda x: x[1], reverse=True)
+                    latest_chart_path, latest_time = chart_files[0]
+                    # Check if created in last 60 seconds
+                    import time
+                    if time.time() - latest_time < 60:
+                        chart_path = latest_chart_path
+                        logging.info(f"Found recent chart: {chart_path}")
+            # Handle text response
+            if pandas_response and str(pandas_response).strip():
+                response_text = str(pandas_response).strip()
+            else:
+                response_text = "Analysis completed"
+        # Format final response
+        if chart_path and os.path.exists(chart_path):
+            chart_info = f"\n\n📊 **Chart Generated**: {os.path.basename(chart_path)}\nChart saved at: {chart_path}"
+            logging.info(f"Chart successfully generated: {chart_path}")
+        final_response = response_text + chart_info
+        success = True
+        return final_response, success
+    except Exception as e:
+        logging.error(f"Error in PandasAI processing: {e}", exc_info=True)
+        # Enhanced error handling
+        error_str = str(e).lower()
+        if "matplotlib" in error_str and "none" in error_str:
+            return "I encountered a data visualization error. This might be due to missing or null values in your data. Please try asking for the raw data first, or specify which specific columns you'd like to analyze.", False
+        elif "strftime" in error_str:
+            return "I encountered a date formatting issue. Please try asking for the data without specific date formatting, or ask me to show the raw data structure first.", False
+        elif "ambiguous" in error_str:
+            return "I encountered an ambiguous data type issue. Please try being more specific about which data you'd like to analyze (e.g., 'show monthly follower gains' vs 'show cumulative followers').", False
+        else:
             return f"Error processing data query: {str(e)}", False
     async def _generate_enhanced_response(self, query: str, pandas_result: str = "", query_type: str = "general") -> str: