Spaces:

GuglielmoTor
/

LinkedinMonitor

Running

App Files Files Community

GuglielmoTor committed on May 27

Commit

3ae9352

verified ·

1 Parent(s): 09757d6

Update eb_agent_module.py

Browse files

Files changed (1) hide show

eb_agent_module.py +147 -62

eb_agent_module.py CHANGED Viewed

@@ -177,6 +177,44 @@ class EmployerBrandingAgent:
         # Initialize PandasAI Agent
         self.pandas_agent = None
         self._initialize_pandas_agent()
     def _initialize_pandas_agent(self):
         """Initialize PandasAI with enhanced configuration for chart generation"""
@@ -185,6 +223,7 @@ class EmployerBrandingAgent:
             return
         self._preprocess_dataframes_for_pandas_ai()
         try:
             # Configure LiteLLM with Gemini
@@ -229,48 +268,37 @@ class EmployerBrandingAgent:
             self.pandas_dfs = {}
     def _generate_dataframe_description(self, name: str, df: pd.DataFrame) -> str:
-        """Enhanced dataframe description for better PandasAI understanding"""
         description_parts = [f"This is the '{name}' dataset containing {len(df)} records."]
-        # Add column descriptions based on common patterns
-        column_descriptions = []
-        for col in df.columns:
-            col_lower = col.lower()
-            if 'date' in col_lower:
-                column_descriptions.append(f"'{col}' contains date/time information")
-            elif 'count' in col_lower or 'number' in col_lower:
-                column_descriptions.append(f"'{col}' contains numerical count data")
-            elif 'rate' in col_lower or 'percentage' in col_lower:
-                column_descriptions.append(f"'{col}' contains rate/percentage metrics")
-            elif 'follower' in col_lower:
-                column_descriptions.append(f"'{col}' contains LinkedIn follower data")
-            elif 'engagement' in col_lower:
-                column_descriptions.append(f"'{col}' contains engagement metrics")
-            elif 'post' in col_lower:
-                column_descriptions.append(f"'{col}' contains post-related information")
-        if column_descriptions:
-            description_parts.append("Key columns: " + "; ".join(column_descriptions))
-        # Enhanced context for specific datasets
         if name.lower() in ['follower_stats', 'followers']:
             description_parts.append("""
-            This data tracks LinkedIn company page follower growth and demographics.
-            For monthly growth analysis, use records where follower_count_type='follower_gains_monthly'.
-            The 'extracted_date' column contains properly formatted dates for time series analysis.
-            Use 'year_month' or 'month_name' columns for better date display in charts.
-            For cumulative analysis, use records where follower_count_type='follower_count_cumulative'.
-            """)
-        elif name.lower().endswith('_monthly_gains'):
-            description_parts.append("""
-            This is a filtered dataset containing only monthly follower gains data.
-            All records have valid dates and are sorted chronologically.
-            Use this for creating time series charts of monthly growth patterns.
             """)
-        elif name.lower() in ['posts', 'post_stats']:
-            description_parts.append("This data contains LinkedIn post performance metrics for employer branding content analysis.")
-        elif name.lower() in ['mentions', 'brand_mentions']:
-            description_parts.append("This data tracks brand mentions and sentiment for employer branding reputation analysis.")
         return " ".join(description_parts)
@@ -473,7 +501,7 @@ class EmployerBrandingAgent:
     # Replace the _generate_pandas_response method and everything after it with this properly indented code:
     async def _generate_pandas_response(self, query: str) -> tuple[str, bool]:
-        """Generate response using PandasAI with enhanced error handling"""
         if not self.pandas_agent or not hasattr(self, 'pandas_dfs'):
             return "Data analysis not available - PandasAI not initialized.", False
@@ -485,31 +513,39 @@ class EmployerBrandingAgent:
             plt.clf()
             plt.close('all')
-            # Enhanced query processing based on content
-            processed_query = query
-            # Add helpful context for common chart requests
-            if any(word in query.lower() for word in ['chart', 'graph', 'plot', 'visualize']):
-                if 'monthly' in query.lower() and 'follower' in query.lower():
-                    processed_query += """.
-                    Use the monthly gains data (follower_count_type='follower_gains_monthly')
-                    and use the extracted_date or month_name column for the x-axis.
-                    Make sure to filter out any null dates and sort by date.
-                    Create a clear line chart showing the trend over time."""
-                elif 'cumulative' in query.lower() and 'follower' in query.lower():
-                    processed_query += """.
-                    Use the cumulative data (follower_count_type='follower_count_cumulative')
-                    and create a chart showing the total follower growth over time."""
-            # Execute the query
             pandas_response = None
-            if len(self.pandas_dfs) == 1:
-                df = list(self.pandas_dfs.values())[0]
-                logging.info(f"Using single DataFrame for query with shape: {df.df.shape}")
-                pandas_response = df.chat(processed_query)
-            else:
-                dfs = list(self.pandas_dfs.values())
-                pandas_response = pai.chat(processed_query, *dfs)
             # Enhanced response processing with better type handling
             response_text = ""
@@ -606,6 +642,55 @@ class EmployerBrandingAgent:
             else:
                 return f"Error processing data query: {str(e)}", False
     async def _generate_enhanced_response(self, query: str, pandas_result: str = "", query_type: str = "general") -> str:
         """Generate enhanced response combining PandasAI results with RAG context"""
         if not self.is_ready:

         # Initialize PandasAI Agent
         self.pandas_agent = None
         self._initialize_pandas_agent()
+    def _validate_and_log_data(self):
+        """Log a data-quality report for every frame in ``self.all_dataframes``.
+
+        For each DataFrame the report contains its shape and column list, the
+        earliest/latest parseable value of every column whose label contains
+        "date" (case-insensitive), and how many of those values fall in 2025.
+        Frames whose name contains "follower" additionally get the value
+        counts of ``follower_count_type`` and ``describe()`` statistics of
+        ``follower_count`` when those columns are present.
+
+        Returns:
+            None. The method only writes to the log.
+        """
+        logging.info("=== DATA VALIDATION REPORT ===")
+        for name, df in self.all_dataframes.items():
+            logging.info("\nDataFrame: %s", name)
+            logging.info("Shape: %s", df.shape)
+            logging.info("Columns: %s", list(df.columns))
+            # Column labels are not guaranteed to be strings (integer labels are
+            # common), so go through str() instead of calling .lower() directly;
+            # a diagnostic report must not abort the caller over a label type.
+            date_cols = [col for col in df.columns if 'date' in str(col).lower()]
+            for date_col in date_cols:
+                if df[date_col].empty:
+                    continue
+                try:
+                    # Unparseable values become NaT and are dropped below.
+                    date_series = pd.to_datetime(df[date_col], errors='coerce')
+                    valid_dates = date_series.dropna()
+                    if valid_dates.empty:
+                        continue
+                    logging.info("  %s: %s to %s", date_col, valid_dates.min(), valid_dates.max())
+                    # Specifically check for 2025 data
+                    dates_2025 = valid_dates[valid_dates.dt.year == 2025]
+                    if not dates_2025.empty:
+                        logging.info("  Found %d records in 2025", len(dates_2025))
+                except Exception as e:
+                    # Deliberately broad: this is best-effort diagnostics, so a
+                    # problem with one column is reported and the loop goes on.
+                    logging.warning("  Could not parse dates in %s: %s", date_col, e)
+            # Check follower data specifically
+            if 'follower' in str(name).lower():
+                if 'follower_count_type' in df.columns:
+                    type_counts = df['follower_count_type'].value_counts()
+                    logging.info("  Follower count types: %s", dict(type_counts))
+                if 'follower_count' in df.columns:
+                    follower_stats = df['follower_count'].describe()
+                    logging.info("  Follower count stats: %s", follower_stats)
     def _initialize_pandas_agent(self):
         """Initialize PandasAI with enhanced configuration for chart generation"""
             return
         self._preprocess_dataframes_for_pandas_ai()
+        self._validate_and_log_data()
         try:
             # Configure LiteLLM with Gemini
             self.pandas_dfs = {}
     def _generate_dataframe_description(self, name: str, df: pd.DataFrame) -> str:
+        """Build a textual description of the dataset ``name``.
+
+        The description always starts with a sentence giving the dataset name
+        and its record count. When ``name`` (case-insensitive) is
+        'follower_stats' or 'followers', a block of notes about the
+        ``follower_count_type`` / ``extracted_date`` / ``follower_count``
+        columns is appended and, if an 'extracted_date' column exists, the
+        min/max parseable date plus the number of records dated 2025 or later.
+
+        Args:
+            name: Dataset name; compared in lower case.
+            df: The DataFrame being described.
+
+        Returns:
+            All description parts joined with single spaces.
+        """
         description_parts = [f"This is the '{name}' dataset containing {len(df)} records."]
+        # Add specific context for follower data
         if name.lower() in ['follower_stats', 'followers']:
             description_parts.append("""
+            CRITICAL DATA STRUCTURE INFO:
+            - Records with follower_count_type='follower_gains_monthly' contain monthly new follower counts
+            - Records with follower_count_type='follower_count_cumulative' contain total follower counts
+            - The 'extracted_date' column contains properly parsed dates for time analysis
+            - For monthly gains analysis, ALWAYS filter by follower_count_type='follower_gains_monthly'
+            - For growth trends, use extracted_date for chronological ordering
+            - The follower_count column contains the actual numeric values to analyze
             """)
+            # Add date range info if available
+            if 'extracted_date' in df.columns:
+                try:
+                    # errors='coerce' turns unparseable values into NaT, which
+                    # dropna() then removes before min/max are taken.
+                    date_col = pd.to_datetime(df['extracted_date'], errors='coerce')
+                    valid_dates = date_col.dropna()
+                    if not valid_dates.empty:
+                        min_date = valid_dates.min()
+                        max_date = valid_dates.max()
+                        description_parts.append(f"Date range: {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}")
+                        # Highlight 2025 data
+                        # (the filter is year >= 2025, i.e. 2025 and any later year)
+                        dates_2025 = valid_dates[valid_dates.dt.year >= 2025]
+                        if not dates_2025.empty:
+                            description_parts.append(f"Contains {len(dates_2025)} records from 2025 onwards")
+                except Exception as e:
+                    # Date analysis is optional: on failure only a warning is
+                    # logged and the description is returned without the range.
+                    logging.warning(f"Could not analyze date range: {e}")
         return " ".join(description_parts)
     # Replace the _generate_pandas_response method and everything after it with this properly indented code:
     async def _generate_pandas_response(self, query: str) -> tuple[str, bool]:
+        """Generate response using PandasAI with enhanced error handling and data validation"""
         if not self.pandas_agent or not hasattr(self, 'pandas_dfs'):
             return "Data analysis not available - PandasAI not initialized.", False
             plt.clf()
             plt.close('all')
+            # Enhanced query preprocessing
+            processed_query = self._enhance_query_for_pandas(query)
+            logging.info(f"Enhanced query: {processed_query[:200]}...")
+            # Execute the query with better error handling
             pandas_response = None
+            try:
+                if len(self.pandas_dfs) == 1:
+                    df = list(self.pandas_dfs.values())[0]
+                    logging.info(f"Using single DataFrame for query with shape: {df.df.shape}")
+                    pandas_response = df.chat(processed_query)
+                else:
+                    dfs = list(self.pandas_dfs.values())
+                    pandas_response = pai.chat(processed_query, *dfs)
+            except Exception as pandas_error:
+                logging.error(f"PandasAI execution error: {pandas_error}")
+                # Try a simpler version of the query
+                simple_query = self._simplify_query_for_retry(query)
+                if simple_query != query:
+                    logging.info(f"Retrying with simplified query: {simple_query}")
+                    try:
+                        if len(self.pandas_dfs) == 1:
+                            df = list(self.pandas_dfs.values())[0]
+                            pandas_response = df.chat(simple_query)
+                        else:
+                            dfs = list(self.pandas_dfs.values())
+                            pandas_response = pai.chat(simple_query, *dfs)
+                    except Exception as retry_error:
+                        logging.error(f"Retry also failed: {retry_error}")
+                        return f"Data analysis failed: {str(pandas_error)}", False
+                else:
+                    return f"Data analysis failed: {str(pandas_error)}", False
             # Enhanced response processing with better type handling
             response_text = ""
             else:
                 return f"Error processing data query: {str(e)}", False
+    def _enhance_query_for_pandas(self, query: str) -> str:
+        """Append data-specific instructions to ``query`` based on its wording.
+
+        Up to three instruction blocks are appended, in this order:
+        follower gain/growth instructions (query mentions "follower" together
+        with a word starting with "gain"/"regain", or "growth"), chart
+        requirements (query contains "plot" or "chart"), and a 2025 focus note
+        (query contains "2025"). Matching is case-insensitive except for the
+        literal "2025".
+
+        Args:
+            query: The user's original query text.
+
+        Returns:
+            The original query followed by the applicable instruction blocks,
+            joined with single spaces; the query unchanged if none apply.
+        """
+        import re  # function-scope import, as elsewhere in this module
+        query_lower = query.lower()
+        enhanced_parts = [query]
+        # "gain" must start a word (optionally as "regain"): a plain substring
+        # test also fires on "again" / "against" and would push monthly-gains
+        # instructions into unrelated follower questions.
+        mentions_gain = re.search(r'\b(?:re)?gain', query_lower) is not None
+        # Add specific instructions for follower queries
+        if 'follower' in query_lower and (mentions_gain or 'growth' in query_lower):
+            enhanced_parts.append("""
+            IMPORTANT INSTRUCTIONS:
+            - Use only data where follower_count_type='follower_gains_monthly' for monthly gains analysis
+            - Filter out any rows where extracted_date is null or NaT
+            - Sort results by extracted_date in ascending order
+            - For 2025 data, make sure to include all months from January 2025 onwards
+            - Use extracted_date for time series and month_name for better chart labels
+            - Sum the follower_count values to get total gains
+            """)
+        if 'plot' in query_lower or 'chart' in query_lower:
+            enhanced_parts.append("""
+            CHART REQUIREMENTS:
+            - Create a clear, well-labeled chart
+            - Use appropriate chart type (line chart for time series, bar chart for comparisons)
+            - Include proper axis labels and title
+            - Format dates nicely on x-axis if applicable
+            - Save the chart and return the path
+            """)
+        if '2025' in query:
+            enhanced_parts.append("- Focus specifically on data from 2025 onwards")
+        return " ".join(enhanced_parts)
+    def _simplify_query_for_retry(self, query: str) -> str:
+        """Return a simpler canned query for a known request shape.
+
+        The lower-cased query is tested against a fixed list of regular
+        expressions in definition order; the replacement of the first pattern
+        that matches is returned.
+
+        Args:
+            query: The original user query.
+
+        Returns:
+            The canned replacement for the first matching pattern, or ``query``
+            itself (unchanged, original casing) when nothing matches. Callers
+            can compare the result with ``query`` to tell whether a
+            simplification was found.
+        """
+        import re  # imported once per call instead of on every loop iteration
+        # Remove complex requirements and focus on core request
+        simple_patterns = {
+            r'plot.*followers.*per.*month': 'show follower gains by month',
+            r'how many.*followers.*gain.*since.*2025': 'sum follower gains from 2025',
+            r'chart.*growth': 'show follower growth over time',
+        }
+        query_lower = query.lower()
+        for pattern, replacement in simple_patterns.items():
+            if re.search(pattern, query_lower):
+                return replacement
+        return query
     async def _generate_enhanced_response(self, query: str, pandas_result: str = "", query_type: str = "general") -> str:
         """Generate enhanced response combining PandasAI results with RAG context"""
         if not self.is_ready: