GuglielmoTor committed on
Commit 97bdf15 · verified · 1 Parent(s): 69061c0

Update eb_agent_module.py

Files changed (1):
  1. eb_agent_module.py +472 -250

eb_agent_module.py CHANGED
@@ -5,63 +5,68 @@ import asyncio
 import logging
 import numpy as np
 import textwrap
 
 try:
     from google import generativeai as genai
-    from google.generativeai import types as genai_types  # For GenerateContentConfig, SafetySetting etc.
-    from google.generativeai.types import HarmCategory, HarmBlockThreshold  # Specific enums
 except ImportError:
     logging.error("Google Generative AI library not found. Please install it: pip install google-generativeai", exc_info=True)
-    # Define dummy classes/variables if import fails, so app.py can try to run
-    # (though app.py already has EB_AGENT_AVAILABLE check)
     class genai: Client = None  # type: ignore
-    class genai_types:  # type: ignore
         EmbedContentConfig = None
         GenerateContentConfig = None
         SafetySetting = None
-    class HarmCategory:  # type: ignore
-        HARM_CATEGORY_HATE_SPEECH = "HARM_CATEGORY_HATE_SPEECH"
-        HARM_CATEGORY_HARASSMENT = "HARM_CATEGORY_HARASSMENT"
-        HARM_CATEGORY_SEXUALLY_EXPLICIT = "HARM_CATEGORY_SEXUALLY_EXPLICIT"
-        HARM_CATEGORY_DANGEROUS_CONTENT = "HARM_CATEGORY_DANGEROUS_CONTENT"
-    class HarmBlockThreshold:  # type: ignore
-        BLOCK_LOW_AND_ABOVE = "BLOCK_LOW_AND_ABOVE"
-        BLOCK_MEDIUM_AND_ABOVE = "BLOCK_MEDIUM_AND_ABOVE"
-        BLOCK_NONE = "BLOCK_NONE"
-
 
 # --- Configuration Constants ---
-# These are defined here because app.py imports them.
-# User should ensure these are appropriate for their needs.
-
 GEMINI_API_KEY = os.getenv('GEMINI_API_KEY', "")
 if not GEMINI_API_KEY:
     logging.warning("GEMINI_API_KEY environment variable not set. EB Agent will not function.")
 
-# Model names (as used in app.py imports from this module)
-LLM_MODEL_NAME = "gemini-1.5-flash-latest"  # Changed to 1.5-flash as it's generally preferred; user had 2.0-flash. Adjust if needed.
-GEMINI_EMBEDDING_MODEL_NAME = "text-embedding-004"  # Common embedding model; user had gemini-embedding-exp-03-07. Adjust if needed.
 
-# Default Generation Config (app.py imports this as EB_AGENT_GEN_CONFIG)
 GENERATION_CONFIG_PARAMS = {
     "temperature": 0.7,
     "top_p": 0.95,
     "top_k": 40,
     "max_output_tokens": 8192,
-    "candidate_count": 1,  # Important for non-streaming
-    # "stop_sequences": [...]  # Optional
 }
 
-# Default Safety Settings (app.py imports this as EB_AGENT_SAFETY_SETTINGS)
 DEFAULT_SAFETY_SETTINGS = [
-    {"category": HarmCategory.HARM_CATEGORY_HATE_SPEECH, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
-    {"category": HarmCategory.HARM_CATEGORY_HARASSMENT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
-    {"category": HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
-    {"category": HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, "threshold": HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE},
 ]
 
-# Placeholder for RAG documents DataFrame (app.py imports this as eb_agent_default_rag_docs)
-# In a real application, this would be loaded from a file or database.
 df_rag_documents = pd.DataFrame({
     'text': [
         "Employer branding focuses on how an organization is perceived as an employer by potential and current employees.",
@@ -72,12 +77,9 @@ df_rag_documents = pd.DataFrame({
 })
 
 # --- Client Initialization ---
-# This client will be used by the agent instances.
-# It's initialized once when the module is loaded.
 client = None
-if GEMINI_API_KEY and genai.Client:  # Check if genai.Client is not None (due to dummy class on import error)
     try:
-        # genai.configure(api_key=GEMINI_API_KEY)  # Alternative: global configuration
         client = genai.Client(api_key=GEMINI_API_KEY)
         logging.info("Google GenAI client initialized successfully.")
     except Exception as e:
@@ -87,43 +89,35 @@ else:
 
 
 class AdvancedRAGSystem:
-    """
-    Handles Retrieval Augmented Generation by embedding documents and finding relevant context for queries.
-    """
     def __init__(self, documents_df: pd.DataFrame, embedding_model_name: str):
-        self.documents_df = documents_df.copy()  # Work on a copy
         self.embedding_model_name = embedding_model_name
-        self.embeddings: np.ndarray | None = None  # Populated by async initialize_embeddings
         logging.info(f"AdvancedRAGSystem initialized with embedding model: {self.embedding_model_name}")
 
     def _embed_single_document_sync(self, text: str) -> np.ndarray:
-        """Synchronous helper to embed a single piece of text."""
         if not client:
             raise ConnectionError("GenAI client not initialized for RAG embedding.")
-        if not text or not isinstance(text, str):  # Basic validation
-            logging.warning("Attempted to embed empty or non-string text. Returning zero vector.")
-            # Attempt to get model's embedding dimension, otherwise use a common default (e.g., 768)
-            # This is tricky without a live model call. For now, let's assume it will be filtered or handled.
-            # If we must return a vector, its dimensionality needs to be known.
-            # For simplicity, errors during embedding will be logged and might lead to skipping the doc.
             raise ValueError("Cannot embed empty or non-string text.")
 
-        # Using client.models.embed_content as per user's provided snippets
         response = client.models.embed_content(
-            model=self.embedding_model_name,  # e.g., "text-embedding-004" or "gemini-embedding-exp-03-07"
-            contents=text,  # API takes 'contents' (plural) but can be a single string for single embedding
-            config=genai_types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY") if genai_types.EmbedContentConfig else None
         )
-        # Assuming response.embeddings is the list of floats for a single content string, as per user's snippet.
         return np.array(response.embeddings)
 
     async def initialize_embeddings(self):
-        """Asynchronously embeds all documents in the documents_df. Should be called once."""
         if self.documents_df.empty:
             logging.info("RAG documents DataFrame is empty. No embeddings to initialize.")
             self.embeddings = np.array([])
             return
-
         if not client:
             logging.error("GenAI client not available for RAG embedding initialization.")
             self.embeddings = np.array([])
@@ -137,11 +131,10 @@ class AdvancedRAGSystem:
                 logging.warning(f"Skipping document at index {index} due to invalid text: {text_to_embed}")
                 continue
             try:
-                # Wrap the synchronous SDK call in asyncio.to_thread
                 embedding_array = await asyncio.to_thread(self._embed_single_document_sync, text_to_embed)
                 embedded_docs_list.append(embedding_array)
             except Exception as e:
-                logging.error(f"Error embedding document text (index {index}) '{str(text_to_embed)[:50]}...': {e}", exc_info=False)  # exc_info=False for brevity in loop
 
         if not embedded_docs_list:
             self.embeddings = np.array([])
@@ -150,13 +143,20 @@ class AdvancedRAGSystem:
         try:
             self.embeddings = np.vstack(embedded_docs_list)
             logging.info(f"Successfully embedded {len(embedded_docs_list)} documents for RAG. Embedding matrix shape: {self.embeddings.shape}")
-        except ValueError as ve:  # Handles cases like empty list or inconsistent shapes if errors weren't caught properly
             logging.error(f"Error stacking embeddings: {ve}. Check individual embedding errors.", exc_info=True)
             self.embeddings = np.array([])
 
-    async def retrieve_relevant_info(self, query: str, top_k: int = 3) -> str:
-        """Retrieves relevant document snippets for a given query using vector similarity."""
         if self.embeddings is None or self.embeddings.size == 0 or self.documents_df.empty:
             logging.debug("RAG system not initialized or no documents/embeddings available for retrieval.")
             return ""
@@ -176,257 +176,441 @@ class AdvancedRAGSystem:
         if query_vector.ndim == 0 or query_vector.size == 0:
             logging.warning(f"Query vector embedding failed or is empty for query: {str(query)[:50]}")
             return ""
-        if query_vector.ndim > 1:  # Should be 1D
-            query_vector = query_vector.flatten()
-
         try:
-            # Cosine similarity is dot product of normalized vectors.
-            # For simplicity, using dot product directly. Normalize if true cosine sim is needed.
-            scores = np.dot(self.embeddings, query_vector)  # self.embeddings (N, D), query_vector (D,) -> scores (N,)
 
-            if scores.size == 0:
                 return ""
 
-            actual_top_k = min(top_k, len(self.documents_df), len(scores))
-            if actual_top_k <= 0: return ""  # Ensure top_k is positive
-
-            # Get indices of top_k scores in descending order
-            top_indices = np.argsort(scores)[-actual_top_k:][::-1]
 
-            valid_top_indices = [idx for idx in top_indices if 0 <= idx < len(self.documents_df)]
-            if not valid_top_indices: return ""
 
-            # Retrieve the 'text' field from the original DataFrame
-            context_parts = [self.documents_df.iloc[i]['text'] for i in valid_top_indices if 'text' in self.documents_df.columns]
             context = "\n\n---\n\n".join(context_parts)
             logging.debug(f"Retrieved RAG context for query '{str(query)[:50]}...':\n{context[:200]}...")
             return context
         except Exception as e:
-            logging.error(f"Error during RAG retrieval (dot product/sorting): {e}", exc_info=True)
             return ""
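The deleted shortcut above scored documents with a raw dot product, which systematically favors long embedding vectors over well-aligned ones; the commit replaces it with true cosine similarity further down. A standalone numpy sketch of the difference:

    import numpy as np

    q = np.array([1.0, 0.0])
    docs = np.array([[0.9, 0.1],    # well aligned, small norm
                     [5.0, 5.0]])   # poorly aligned, large norm

    print(docs @ q)                 # [0.9 5.0] -> raw dot product picks the wrong doc
    norms = np.linalg.norm(docs, axis=1) * np.linalg.norm(q)
    print((docs @ q) / norms)       # [0.99 0.71] -> cosine similarity picks the right one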
 
 
 
 class EmployerBrandingAgent:
-    """
-    An agent that uses Generative AI to provide insights on employer branding
-    based on provided DataFrames and RAG context.
-    """
     def __init__(self,
                  all_dataframes: dict,
-                 rag_documents_df: pd.DataFrame,  # For RAG system
                  llm_model_name: str,
-                 embedding_model_name: str,  # For RAG system
                  generation_config_dict: dict,
                  safety_settings_list_of_dicts: list,
-                 # client_instance,  # Using global client for simplicity now
-                 force_sandbox: bool = False  # Parameter from app.py, currently unused here
-                 ):
-        # self.client = client_instance  # If client were passed
-        self.all_dataframes = {k: df.copy() for k, df in all_dataframes.items()}  # Work with copies
-        self.schemas_representation = self._get_all_schemas_representation()  # Sync method
 
-        self.chat_history = []  # Stores chat in API format: [{"role": "user/model", "parts": [{"text": "..."}]}]
-        # This will be set by app.py before calling process_query
-
         self.llm_model_name = llm_model_name
         self.generation_config_dict = generation_config_dict
-        self.safety_settings_list_of_dicts = safety_settings_list_of_dicts
-
         self.embedding_model_name = embedding_model_name
         self.rag_system = AdvancedRAGSystem(rag_documents_df, self.embedding_model_name)
-        # Note: self.rag_system.initialize_embeddings() must be called externally (e.g., in app.py)
-
-        self.force_sandbox = force_sandbox  # Store if needed for tool use later
         logging.info(f"EmployerBrandingAgent initialized. LLM: {self.llm_model_name}, Embedding: {self.embedding_model_name}. RAG system created.")
 
-    def _get_all_schemas_representation(self) -> str:
-        """Generates a string representation of the schemas of all DataFrames."""
-        schema_descriptions = ["DataFrames available for analysis:"]
         for key, df in self.all_dataframes.items():
-            df_name = f"df_{key}"  # Consistent naming for the agent to refer to
-            columns = ", ".join(df.columns)
-            shape = df.shape
             if df.empty:
-                schema = f"\n--- DataFrame: {df_name} ---\nStatus: Empty\nShape: {shape}\nColumns: {columns}"
-            else:
-                # Basic stats for numeric columns, first few unique for objects
-                sample_info_parts = []
-                for col in df.columns:
-                    if pd.api.types.is_numeric_dtype(df[col]) and not df[col].empty:
-                        sample_info_parts.append(f"{col} (numeric, e.g., mean: {df[col].mean():.2f})")
-                    elif pd.api.types.is_datetime64_any_dtype(df[col]) and not df[col].empty:
-                        sample_info_parts.append(f"{col} (datetime, e.g., min: {df[col].min()}, max: {df[col].max()})")
-                    elif not df[col].empty:
-                        unique_vals = df[col].unique()
-                        display_unique = ', '.join(map(str, unique_vals[:3]))
-                        if len(unique_vals) > 3: display_unique += ", ..."
-                        sample_info_parts.append(f"{col} (object, e.g., {display_unique})")
-                    else:
-                        sample_info_parts.append(f"{col} (empty)")
-
-                schema = (f"\n--- DataFrame: {df_name} ---\nShape: {shape}\nColumns & Sample Info:\n  " + "\n  ".join(sample_info_parts))
-            schema_descriptions.append(schema)
         return "\n".join(schema_descriptions)
 
     async def _build_prompt_for_current_turn(self, raw_user_query: str) -> str:
-        """
-        Constructs the full prompt for the current turn, including system instructions,
-        DataFrame schemas, RAG context, and the user's query.
-        """
-        # System instruction part
         prompt_parts = [
             "You are an expert Employer Branding Analyst and a helpful AI assistant. "
             "Your goal is to provide insightful analysis based on the provided LinkedIn data. "
             "When asked to generate Pandas code, ensure it is correct, runnable, and clearly explained. "
-            "When providing insights, be specific and refer to the data where possible."
         ]
-
-        # Schema information
-        prompt_parts.append("\n\n--- AVAILABLE DATA ---")
         prompt_parts.append(self.schemas_representation)
 
-        # RAG context
-        if self.rag_system.embeddings is not None and self.rag_system.embeddings.size > 0:  # Check if RAG is initialized
-            logging.debug(f"Retrieving RAG context for query: {raw_user_query[:50]}...")
-            rag_context = await self.rag_system.retrieve_relevant_info(raw_user_query)
-            if rag_context:
-                prompt_parts.append("\n\n--- RELEVANT CONTEXTUAL INFORMATION (from documents) ---")
-                prompt_parts.append(rag_context)
-            else:
-                logging.debug("No relevant RAG context found.")
-        else:
-            logging.debug("RAG system not initialized or embeddings not available, skipping RAG context retrieval.")
-
 
-        # User's current query
         prompt_parts.append("\n\n--- USER REQUEST ---")
         prompt_parts.append(f"Based on all the information above, please respond to the following user query:\n{raw_user_query}")
-
         final_prompt = "\n".join(prompt_parts)
         logging.debug(f"Built prompt for current turn (first 300 chars): {final_prompt[:300]}")
         return final_prompt
 
-    async def process_query(self, raw_user_query_this_turn: str) -> str:
-        """
-        Processes the user's query, incorporating chat history, DataFrame schemas, and RAG.
-        The agent's self.chat_history is expected to be set by the calling application (app.py)
-        and should contain the history *before* the current raw_user_query_this_turn.
-        This method returns the AI's response string. app.py will then update the agent's
-        chat history with the raw_user_query_this_turn and this response.
-        """
-        if not client:
-            logging.error("GenAI client not initialized. Cannot process query.")
-            return "Error: The AI Agent is not available due to a configuration issue with the AI service."
 
-        if not raw_user_query_this_turn.strip():
-            return "Please provide a query."
 
-        # 1. Prepare the augmented prompt for the *current* user query
-        #    This prompt includes system instructions, schemas, RAG, and the current raw query.
         augmented_current_user_prompt_text = await self._build_prompt_for_current_turn(raw_user_query_this_turn)
-
-        # 2. Construct the full list of contents for the API call
-        #    self.chat_history should be in API format: [{"role": "user/model", "parts": [{"text": "..."}]}]
-        #    It contains history *before* the current raw_user_query_this_turn.
-        api_call_contents = []
-        if self.chat_history:  # Add previous turns if any
-            api_call_contents.extend(self.chat_history)
-
-        # Add the current user turn, using the fully augmented prompt as its content
         api_call_contents.append({"role": "user", "parts": [{"text": augmented_current_user_prompt_text}]})
-
         logging.debug(f"Sending to GenAI. Total turns in content: {len(api_call_contents)}")
-        if api_call_contents:
-            logging.debug(f"Last turn role: {api_call_contents[-1]['role']}, text start: {api_call_contents[-1]['parts'][0]['text'][:100]}")
-
 
-        # 3. Prepare API configuration
-        #    Convert safety settings from list of dicts to list of SafetySetting objects if genai_types are available
-        api_safety_settings = []
-        if genai_types.SafetySetting:
             for ss_dict in self.safety_settings_list_of_dicts:
                 try:
-                    api_safety_settings.append(genai_types.SafetySetting(**ss_dict))
-                except TypeError:  # Handles if HarmCategory/HarmBlockThreshold were strings due to import error
-                    logging.warning(f"Could not create SafetySetting object from dict: {ss_dict}. Using dict directly.")
-                    api_safety_settings.append(ss_dict)  # Fallback to dict
-        else:  # genai_types not available
-            api_safety_settings = self.safety_settings_list_of_dicts
 
 
-        api_generation_config = None
-        if genai_types.GenerateContentConfig:
-            try:
-                api_generation_config = genai_types.GenerateContentConfig(
-                    **self.generation_config_dict,
-                    safety_settings=api_safety_settings  # This should be a list of SafetySetting objects or dicts
-                )
-            except TypeError:
-                logging.warning("Could not create GenerateContentConfig object. Using dicts directly for config.")
-                # Fallback: if GenerateContentConfig fails, try to pass dicts (might not be supported by client.models.generate_content's 'config' param)
-                # The user's snippet uses config=types.GenerateContentConfig(...), so this object is important.
-                # If it fails, the call might fail.
-                api_generation_config = self.generation_config_dict  # This is not ideal for the 'config' parameter.
-                # The 'config' parameter of client.models.generate_content expects a GenerateContentConfig object.
-                # If we can't create it, we should signal an error or try a different call structure if available.
-                # For now, proceed and let the API call potentially fail if config is malformed.
-                # A better fallback would be to construct the config parts individually if the main object fails.
-                # However, the user's snippet is clear: config=types.GenerateContentConfig(...)
-                # So, if genai_types.GenerateContentConfig is None, this will be an issue.
-
-        else:  # genai_types.GenerateContentConfig is None (likely import error)
-            logging.error("genai_types.GenerateContentConfig not available. Cannot form API config.")
-            return "Error: AI Agent configuration problem (GenerateContentConfig type missing)."
-
-
-        # 4. Make the API call (synchronous SDK call wrapped in asyncio.to_thread)
-        try:
-            response = await asyncio.to_thread(
-                client.models.generate_content,  # As per user's snippet
-                model=self.llm_model_name,
-                contents=api_call_contents,
-                config=api_generation_config  # Pass the GenerateContentConfig object
             )
-            # Extract text. User's snippet uses response.text
-            # Check for blocked content or other issues
-            if not response.candidates:
-                block_reason = response.prompt_feedback.block_reason if response.prompt_feedback else "Unknown"
-                logging.warning(f"AI response blocked or empty. Reason: {block_reason}")
-                # You might want to inspect response.prompt_feedback for block reasons
-                error_message = f"The AI's response was blocked. Reason: {block_reason}."
-                if response.prompt_feedback and response.prompt_feedback.block_reason_message:
-                    error_message += f" Details: {response.prompt_feedback.block_reason_message}"
-                return error_message
-
-            answer = response.text.strip()
-            logging.info(f"Successfully received AI response (first 100 chars): {answer[:100]}")
 
-        except Exception as e:
-            logging.error(f"Error during GenAI call: {e}", exc_info=True)
-            # Check if it's a Google specific API error for more details
-            # from google.api_core import exceptions as google_exceptions
-            # if isinstance(e, google_exceptions.GoogleAPIError):
-            #     answer = f"API Error: {e.message}"
-            # else:
-            answer = f"# Error during AI processing:\n{type(e).__name__}: {str(e)}"
-
-        return answer
 
-    def clear_chat_history(self):  # This method is called by app.py
-        """Clears the agent's internal chat history."""
         self.chat_history = []
         logging.info("EmployerBrandingAgent chat history cleared by request.")
 
-# --- Module-level function for schema display in app.py ---
 def get_all_schemas_representation(all_dataframes: dict) -> str:
-    """
-    Generates a string representation of the schemas of all DataFrames,
-    intended for display in the Gradio UI.
-    This is a standalone function as it's imported directly by app.py.
-    """
-    if not all_dataframes:
-        return "No DataFrames are currently loaded."
-
     schema_descriptions = ["DataFrames currently available in the application state:"]
     for key, df in all_dataframes.items():
         df_name = f"df_{key}"
@@ -435,9 +619,47 @@ def get_all_schemas_representation(all_dataframes: dict) -> str:
         if df.empty:
             schema = f"\n--- DataFrame: {df_name} ---\nStatus: Empty\nShape: {shape}\nColumns: {columns}"
         else:
-            # Provide a bit more detail for UI display
-            sample_data_str = df.head(2).to_markdown(index=False)  # Use markdown for better UI rendering
             schema = (f"\n--- DataFrame: {df_name} ---\nShape: {shape}\nColumns: {columns}\n\n<details><summary>Sample Data (first 2 rows of {df_name}):</summary>\n\n{sample_data_str}\n\n</details>")
         schema_descriptions.append(schema)
     return "\n".join(schema_descriptions)
 import logging
 import numpy as np
 import textwrap
+from datetime import datetime  # Added for date calculations
 
 try:
     from google import generativeai as genai
+    from google.generativeai import types  # For GenerateContentConfig, SafetySetting, HarmCategory, HarmBlockThreshold etc.
 except ImportError:
     logging.error("Google Generative AI library not found. Please install it: pip install google-generativeai", exc_info=True)
+    # Define dummy classes/variables if import fails
     class genai: Client = None  # type: ignore
+    class types:  # type: ignore
         EmbedContentConfig = None
         GenerateContentConfig = None
         SafetySetting = None
+        # Define HarmCategory and HarmBlockThreshold as inner classes or attributes for the dummy types
+        class HarmCategory:  # type: ignore
+            HARM_CATEGORY_HATE_SPEECH = "HARM_CATEGORY_HATE_SPEECH"
+            HARM_CATEGORY_HARASSMENT = "HARM_CATEGORY_HARASSMENT"
+            HARM_CATEGORY_SEXUALLY_EXPLICIT = "HARM_CATEGORY_SEXUALLY_EXPLICIT"
+            HARM_CATEGORY_DANGEROUS_CONTENT = "HARM_CATEGORY_DANGEROUS_CONTENT"
+        class HarmBlockThreshold:  # type: ignore
+            BLOCK_LOW_AND_ABOVE = "BLOCK_LOW_AND_ABOVE"
+            BLOCK_MEDIUM_AND_ABOVE = "BLOCK_MEDIUM_AND_ABOVE"
+            BLOCK_NONE = "BLOCK_NONE"
+
+# --- Custom Exceptions ---
+class ValidationError(Exception):
+    """Custom validation error for agent inputs"""
+    pass
+
+class RateLimitError(Exception):
+    """Placeholder for rate limit errors."""
+    pass
 
 # --- Configuration Constants ---
 GEMINI_API_KEY = os.getenv('GEMINI_API_KEY', "")
 if not GEMINI_API_KEY:
     logging.warning("GEMINI_API_KEY environment variable not set. EB Agent will not function.")
 
+LLM_MODEL_NAME = "gemini-1.5-flash-latest"
+GEMINI_EMBEDDING_MODEL_NAME = "text-embedding-004"
 
 GENERATION_CONFIG_PARAMS = {
     "temperature": 0.7,
     "top_p": 0.95,
     "top_k": 40,
     "max_output_tokens": 8192,
+    "candidate_count": 1,
 }
 
+# Updated to use types.HarmCategory and types.HarmBlockThreshold
 DEFAULT_SAFETY_SETTINGS = [
+    {"category": types.HarmCategory.HARM_CATEGORY_HATE_SPEECH if types and hasattr(types, 'HarmCategory') else "HARM_CATEGORY_HATE_SPEECH",
+     "threshold": types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE if types and hasattr(types, 'HarmBlockThreshold') else "BLOCK_MEDIUM_AND_ABOVE"},
+    {"category": types.HarmCategory.HARM_CATEGORY_HARASSMENT if types and hasattr(types, 'HarmCategory') else "HARM_CATEGORY_HARASSMENT",
+     "threshold": types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE if types and hasattr(types, 'HarmBlockThreshold') else "BLOCK_MEDIUM_AND_ABOVE"},
+    {"category": types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT if types and hasattr(types, 'HarmCategory') else "HARM_CATEGORY_SEXUALLY_EXPLICIT",
+     "threshold": types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE if types and hasattr(types, 'HarmBlockThreshold') else "BLOCK_MEDIUM_AND_ABOVE"},
+    {"category": types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT if types and hasattr(types, 'HarmCategory') else "HARM_CATEGORY_DANGEROUS_CONTENT",
+     "threshold": types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE if types and hasattr(types, 'HarmBlockThreshold') else "BLOCK_MEDIUM_AND_ABOVE"},
 ]
 
+
 df_rag_documents = pd.DataFrame({
     'text': [
         "Employer branding focuses on how an organization is perceived as an employer by potential and current employees.",
 })
 
 # --- Client Initialization ---
 client = None
+if GEMINI_API_KEY and genai.Client:
     try:
         client = genai.Client(api_key=GEMINI_API_KEY)
         logging.info("Google GenAI client initialized successfully.")
     except Exception as e:
 
 
 class AdvancedRAGSystem:
     def __init__(self, documents_df: pd.DataFrame, embedding_model_name: str):
+        self.documents_df = documents_df.copy()
         self.embedding_model_name = embedding_model_name
+        self.embeddings: np.ndarray | None = None
         logging.info(f"AdvancedRAGSystem initialized with embedding model: {self.embedding_model_name}")
 
     def _embed_single_document_sync(self, text: str) -> np.ndarray:
         if not client:
             raise ConnectionError("GenAI client not initialized for RAG embedding.")
+        if not text or not isinstance(text, str):
             raise ValueError("Cannot embed empty or non-string text.")
+
+        # Ensure types.EmbedContentConfig is available before using it
+        embed_config = None
+        if types and hasattr(types, 'EmbedContentConfig'):
+            embed_config = types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY")
 
         response = client.models.embed_content(
+            model=self.embedding_model_name,
+            contents=text,
+            config=embed_config
         )
         return np.array(response.embeddings)
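`initialize_embeddings` below pushes this synchronous SDK call onto a worker thread with `asyncio.to_thread`, keeping the event loop responsive. The pattern in isolation (the blocking function here is a stand-in, not the real SDK call):

    import asyncio
    import time

    def blocking_embed(text: str) -> list[float]:
        time.sleep(0.1)  # stands in for a synchronous network call
        return [float(len(text))]

    async def main() -> None:
        vec = await asyncio.to_thread(blocking_embed, "hello")  # runs in a thread
        print(vec)

    asyncio.run(main())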
 
     async def initialize_embeddings(self):
         if self.documents_df.empty:
             logging.info("RAG documents DataFrame is empty. No embeddings to initialize.")
             self.embeddings = np.array([])
             return
         if not client:
             logging.error("GenAI client not available for RAG embedding initialization.")
             self.embeddings = np.array([])
                 logging.warning(f"Skipping document at index {index} due to invalid text: {text_to_embed}")
                 continue
             try:
                 embedding_array = await asyncio.to_thread(self._embed_single_document_sync, text_to_embed)
                 embedded_docs_list.append(embedding_array)
             except Exception as e:
+                logging.error(f"Error embedding document text (index {index}) '{str(text_to_embed)[:50]}...': {e}", exc_info=False)
 
         if not embedded_docs_list:
             self.embeddings = np.array([])
         try:
             self.embeddings = np.vstack(embedded_docs_list)
             logging.info(f"Successfully embedded {len(embedded_docs_list)} documents for RAG. Embedding matrix shape: {self.embeddings.shape}")
+        except ValueError as ve:
             logging.error(f"Error stacking embeddings: {ve}. Check individual embedding errors.", exc_info=True)
             self.embeddings = np.array([])
 
+    def _calculate_cosine_similarity(self, embeddings_matrix: np.ndarray, query_vector: np.ndarray) -> np.ndarray:
+        """Calculate normalized cosine similarity between a matrix of embeddings and a query vector."""
+        query_vector = query_vector.flatten()
+        norm_matrix = np.linalg.norm(embeddings_matrix, axis=1, keepdims=True)
+        normalized_embeddings_matrix = embeddings_matrix / (norm_matrix + 1e-8)
+        norm_query = np.linalg.norm(query_vector)
+        normalized_query_vector = query_vector / (norm_query + 1e-8)
+        return np.dot(normalized_embeddings_matrix, normalized_query_vector)
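On toy data the new helper reduces to a few numpy lines (a standalone sketch; the 1e-8 terms guard against division by zero for all-zero vectors):

    import numpy as np

    docs = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])  # one embedding per row
    query = np.array([1.0, 0.2])

    docs_n = docs / (np.linalg.norm(docs, axis=1, keepdims=True) + 1e-8)
    query_n = query / (np.linalg.norm(query) + 1e-8)
    print(docs_n @ query_n)  # cosine score per document; highest = most similar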
 
+    async def retrieve_relevant_info(self, query: str, top_k: int = 3, min_similarity: float = 0.3) -> str:
         if self.embeddings is None or self.embeddings.size == 0 or self.documents_df.empty:
             logging.debug("RAG system not initialized or no documents/embeddings available for retrieval.")
             return ""
         if query_vector.ndim == 0 or query_vector.size == 0:
             logging.warning(f"Query vector embedding failed or is empty for query: {str(query)[:50]}")
             return ""
+
         try:
+            similarity_scores = self._calculate_cosine_similarity(self.embeddings, query_vector)
 
+            if similarity_scores.size == 0:
                 return ""
 
+            relevant_indices_after_threshold = np.where(similarity_scores >= min_similarity)[0]
+            if len(relevant_indices_after_threshold) == 0:
+                logging.debug(f"No documents met the minimum similarity threshold of {min_similarity} for query: {query[:50]}")
+                return ""
 
+            relevant_scores = similarity_scores[relevant_indices_after_threshold]
+            sorted_relevant_indices_local = np.argsort(relevant_scores)[::-1]
+            top_original_indices = relevant_indices_after_threshold[sorted_relevant_indices_local[:top_k]]
+
+            if len(top_original_indices) == 0: return ""
 
+            context_parts = [self.documents_df.iloc[i]['text'] for i in top_original_indices if 'text' in self.documents_df.columns]
             context = "\n\n---\n\n".join(context_parts)
             logging.debug(f"Retrieved RAG context for query '{str(query)[:50]}...':\n{context[:200]}...")
             return context
         except Exception as e:
+            logging.error(f"Error during RAG retrieval (similarity/sorting): {e}", exc_info=True)
             return ""
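The threshold-then-rank logic above can be verified in isolation (self-contained numpy sketch mirroring those lines):

    import numpy as np

    scores = np.array([0.15, 0.82, 0.40, 0.65])
    min_similarity, top_k = 0.3, 2

    keep = np.where(scores >= min_similarity)[0]   # drop weak matches -> [1 2 3]
    order = np.argsort(scores[keep])[::-1]         # rank the survivors, best first
    top = keep[order[:top_k]]
    print(top, scores[top])                        # -> [1 3] [0.82 0.65]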
 
 
 class EmployerBrandingAgent:
     def __init__(self,
                  all_dataframes: dict,
+                 rag_documents_df: pd.DataFrame,
                  llm_model_name: str,
+                 embedding_model_name: str,
                  generation_config_dict: dict,
                  safety_settings_list_of_dicts: list,
+                 force_sandbox: bool = False):
+        self.all_dataframes = {k: df.copy() for k, df in all_dataframes.items()}
+        self.schemas_representation = self._get_enhanced_schemas_representation()
 
+        self.chat_history = []
         self.llm_model_name = llm_model_name
         self.generation_config_dict = generation_config_dict
+        self.safety_settings_list_of_dicts = safety_settings_list_of_dicts  # These are dicts
         self.embedding_model_name = embedding_model_name
         self.rag_system = AdvancedRAGSystem(rag_documents_df, self.embedding_model_name)
+        self.force_sandbox = force_sandbox
         logging.info(f"EmployerBrandingAgent initialized. LLM: {self.llm_model_name}, Embedding: {self.embedding_model_name}. RAG system created.")
 
+    def _get_date_range(self, df: pd.DataFrame) -> str:
+        for col in df.columns:
+            if pd.api.types.is_datetime64_any_dtype(df[col]):
+                try:
+                    min_date = df[col].min()
+                    max_date = df[col].max()
+                    if pd.notna(min_date) and pd.notna(max_date):
+                        return f"{min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}"
+                except Exception: pass
+        return "N/A"
+
+    def _calculate_growth_rate(self, df: pd.DataFrame) -> str:
+        logging.debug("_calculate_growth_rate is a placeholder.")  # Changed to debug
+        return "Growth rate calculation not implemented."
+    def _analyze_engagement_trends(self, df: pd.DataFrame) -> str:
+        logging.debug("_analyze_engagement_trends is a placeholder.")
+        return "Engagement trend analysis not implemented."
+    def _analyze_demographics(self, df: pd.DataFrame) -> str:
+        logging.debug("_analyze_demographics is a placeholder.")
+        return "Demographic analysis not implemented."
+    def _analyze_post_performance(self, df: pd.DataFrame) -> str:
+        logging.debug("_analyze_post_performance is a placeholder.")
+        return "Post performance analysis not implemented."
+    def _extract_content_themes(self, df: pd.DataFrame) -> str:
+        logging.debug("_extract_content_themes is a placeholder.")
+        return "Content theme extraction not implemented."
+    def _find_optimal_times(self, df: pd.DataFrame) -> str:
+        logging.debug("_find_optimal_times is a placeholder.")
+        return "Optimal posting time analysis not implemented."
+
+    def _calculate_key_metrics(self, df: pd.DataFrame, df_type: str) -> dict:
+        metrics = {}
+        if 'follower' in df_type.lower():
+            metrics.update({
+                'follower_growth_rate': self._calculate_growth_rate(df),
+                'engagement_trends': self._analyze_engagement_trends(df),
+                'demographic_distribution': self._analyze_demographics(df)
+            })
+        elif 'post' in df_type.lower():
+            metrics.update({
+                'post_performance': self._analyze_post_performance(df),
+                'content_themes': self._extract_content_themes(df),
+                'optimal_posting_times': self._find_optimal_times(df)
+            })
+        elif 'mention' in df_type.lower():
+            metrics['mention_volume_trend'] = "Mention volume trend not implemented."
+            metrics['mention_sentiment_overview'] = "Mention sentiment overview not implemented."
+
+        if not metrics:
+            logging.debug(f"No specific key metrics defined for df_type: {df_type}")
+            return {"info": "Standard metrics applicable."}
+        return metrics
+
+    def _calculate_data_freshness(self, df: pd.DataFrame) -> str:
+        for col in df.columns:
+            if pd.api.types.is_datetime64_any_dtype(df[col]):
+                try:
+                    max_date = df[col].max()
+                    if pd.notna(max_date):
+                        days_diff = (datetime.now(max_date.tzinfo) - max_date).days  # tz aware
+                        return f"Data up to {max_date.strftime('%Y-%m-%d')} ({days_diff} days old)"
+                except Exception: pass
+        return "Freshness N/A (no clear date column)"
+    def _check_data_consistency(self, df: pd.DataFrame) -> str:
+        logging.debug("_check_data_consistency is a placeholder.")
+        return "Consistency checks not implemented."
+    def _identify_accuracy_issues(self, df: pd.DataFrame) -> str:
+        logging.debug("_identify_accuracy_issues is a placeholder.")
+        return "Accuracy issue identification not implemented."
+
+    def _assess_data_quality(self, df: pd.DataFrame) -> dict:
+        completeness = (1 - (df.isnull().sum().sum() / (len(df) * len(df.columns)))) if len(df) > 0 and len(df.columns) > 0 else 0
+        return {
+            'completeness_score': f"{completeness:.2%}",
+            'freshness_info': self._calculate_data_freshness(df),
+            'consistency_check': self._check_data_consistency(df),
+            'accuracy_flags_summary': self._identify_accuracy_issues(df),
+            'sample_size_notes': f"{len(df)} records. {'Adequate for basic analysis.' if len(df) >= 100 else 'Limited sample size; insights may be indicative.'}"
+        }
+
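The completeness score is simply the share of non-null cells; on a toy frame (illustrative data only):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, None], "b": [None, "x", "y"]})
    completeness = 1 - df.isnull().sum().sum() / (len(df) * len(df.columns))
    print(f"{completeness:.2%}")  # 4 of 6 cells filled -> 66.67%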
+    def _identify_patterns(self, df: pd.DataFrame, key: str) -> str:
+        logging.debug(f"_identify_patterns for {key} is a placeholder.")
+        return "Pattern identification not implemented."
+
+    def _format_df_analysis(self, df_key: str, analysis: dict) -> str:
+        formatted_parts = [f"\n--- DataFrame: df_{df_key} ---"]
+        formatted_parts.append(f"  Shape: {analysis['shape']}")
+        formatted_parts.append(f"  Date Range: {analysis['date_range']}")
+        formatted_parts.append("  Key Metrics:")
+        for metric, value in analysis['key_metrics'].items():
+            formatted_parts.append(f"    - {metric.replace('_', ' ').title()}: {value}")
+        formatted_parts.append("  Data Quality Assessment:")
+        for aspect, value in analysis['data_quality'].items():
+            formatted_parts.append(f"    - {aspect.replace('_', ' ').title()}: {value}")
+        formatted_parts.append(f"  Notable Patterns: {analysis['notable_patterns']}")
+        return "\n".join(formatted_parts)
+
+    def _get_enhanced_schemas_representation(self) -> str:
+        schema_descriptions = ["=== DETAILED LINKEDIN DATA OVERVIEW ==="]
+        if not self.all_dataframes:
+            schema_descriptions.append("No dataframes available for analysis.")
+            return "\n".join(schema_descriptions)
         for key, df in self.all_dataframes.items():
             if df.empty:
+                schema_descriptions.append(f"\n--- DataFrame: df_{key} ---\nStatus: Empty. No analysis possible.")
+                continue
+            analysis = {
+                'shape': df.shape,
+                'date_range': self._get_date_range(df),
+                'key_metrics': self._calculate_key_metrics(df, key),
+                'data_quality': self._assess_data_quality(df),
+                'notable_patterns': self._identify_patterns(df, key)
+            }
+            schema_descriptions.append(self._format_df_analysis(key, analysis))
         return "\n".join(schema_descriptions)
+
+    def _extract_query_intent(self, query: str) -> str:
+        logging.debug("_extract_query_intent is a placeholder.")
+        if "compare" in query.lower() or "benchmark" in query.lower(): return "comparison"
+        if "trend" in query.lower(): return "trend_analysis"
+        return "general"
+
+    async def _get_business_context(self, intent: str) -> str:
+        logging.debug("_get_business_context is a placeholder.")
+        if intent == "comparison": return "Company is focused on outperforming competitors in tech hiring."
+        return "Company aims to improve overall employer brand perception."
+
+    async def _get_industry_benchmarks(self, intent: str) -> str:
+        logging.debug("_get_industry_benchmarks is a placeholder.")
+        if intent == "trend_analysis": return "Typical follower growth in this sector is 5-10% MoM."
+        return "Average engagement rate for similar companies is 2-3%."
+
+    async def _enhance_rag_context(self, query: str, base_context: str) -> str:
+        intent = self._extract_query_intent(query)
+        business_context_val = await self._get_business_context(intent)
+        benchmarks_val = await self._get_industry_benchmarks(intent)
+        enhanced_context = f"""{base_context}
+--- ADDITIONAL CONTEXT FOR YOUR ANALYSIS ---
+Business Focus: {business_context_val}
+Relevant Benchmarks: {benchmarks_val}"""
+        return enhanced_context
 
     async def _build_prompt_for_current_turn(self, raw_user_query: str) -> str:
         prompt_parts = [
             "You are an expert Employer Branding Analyst and a helpful AI assistant. "
             "Your goal is to provide insightful analysis based on the provided LinkedIn data. "
             "When asked to generate Pandas code, ensure it is correct, runnable, and clearly explained. "
+            "When providing insights, be specific and refer to the data where possible. "
+            "Use the detailed data overview and any contextual information provided."
         ]
+        prompt_parts.append("\n\n--- DETAILED DATA OVERVIEW ---")
         prompt_parts.append(self.schemas_representation)
 
+        if self.rag_system.embeddings is not None and self.rag_system.embeddings.size > 0:
+            logging.debug(f"Retrieving base RAG context for query: {raw_user_query[:50]}...")
+            base_rag_context = await self.rag_system.retrieve_relevant_info(raw_user_query)
+            if base_rag_context:
+                logging.debug(f"Enhancing RAG context for query: {raw_user_query[:50]}...")
+                enhanced_rag_context = await self._enhance_rag_context(raw_user_query, base_rag_context)
+                prompt_parts.append("\n\n--- RELEVANT CONTEXTUAL INFORMATION (from documents & business knowledge) ---")
+                prompt_parts.append(enhanced_rag_context)
+            else: logging.debug("No base RAG context found.")
+        else: logging.debug("RAG system not initialized or embeddings not available, skipping RAG context retrieval.")
 
         prompt_parts.append("\n\n--- USER REQUEST ---")
         prompt_parts.append(f"Based on all the information above, please respond to the following user query:\n{raw_user_query}")
         final_prompt = "\n".join(prompt_parts)
         logging.debug(f"Built prompt for current turn (first 300 chars): {final_prompt[:300]}")
         return final_prompt
 
+    async def _process_structured_query(self, prompt: str) -> dict:
+        logging.debug("_process_structured_query is a placeholder. Returning dummy structure.")
+        return {
+            "Key Findings": ["Placeholder finding 1", "Placeholder finding 2"],
+            "Performance Metrics": ["Placeholder metric performance"],
+            "Actionable Recommendations": {
+                "Immediate Actions (0-30 days)": ["Placeholder immediate action"],
+                "Short-term Strategy (1-3 months)": ["Placeholder short-term strategy"],
+                "Long-term Vision (3-12 months)": ["Placeholder long-term vision"]
+            },
+            "Risk Assessment": ["Placeholder risk"],
+            "Success Metrics to Track": ["Placeholder KPI"]
+        }
+
+    async def _generate_hr_insights(self, query: str, context: str) -> str:
+        insight_prompt = f"""
+As an expert HR analytics consultant, analyze the following LinkedIn employer branding data:
+{context}
+User Query: {query}
+Please provide insights in this structured format:
+## Key Findings
+- [3-5 bullet points of main discoveries]
+## Performance Metrics
+- Current performance vs industry benchmarks
+- Trend analysis (improving/declining/stable)
+## Actionable Recommendations
+1. **Immediate Actions (0-30 days)**
+   - [Specific, measurable actions]
+2. **Short-term Strategy (1-3 months)**
+   - [Strategic initiatives]
+3. **Long-term Vision (3-12 months)**
+   - [Comprehensive improvements]
+## Risk Assessment
+- Potential challenges or red flags
+- Mitigation strategies
+## Success Metrics to Track
+- KPIs to monitor progress
+- Reporting frequency recommendations
+"""
+        if not client: return "Error: AI client not configured for generating HR insights."
+        api_call_contents = [{"role": "user", "parts": [{"text": insight_prompt}]}]
+
+        # Construct SafetySetting objects if types.SafetySetting is available
+        api_safety_settings_objects = []
+        if types and hasattr(types, 'SafetySetting'):
+            for ss_dict in self.safety_settings_list_of_dicts:
+                try:
+                    # Use types.HarmCategory and types.HarmBlockThreshold directly
+                    category = getattr(types.HarmCategory, ss_dict['category'].split('.')[-1] if isinstance(ss_dict['category'], str) else ss_dict['category'].name, types.HarmCategory.HARM_CATEGORY_UNSPECIFIED)
+                    threshold = getattr(types.HarmBlockThreshold, ss_dict['threshold'].split('.')[-1] if isinstance(ss_dict['threshold'], str) else ss_dict['threshold'].name, types.HarmBlockThreshold.BLOCK_NONE)
+                    api_safety_settings_objects.append(types.SafetySetting(category=category, threshold=threshold))
+                except Exception as e_ss:
+                    logging.warning(f"Could not create SafetySetting object from {ss_dict}: {e_ss}. Using dict.")
+                    api_safety_settings_objects.append(ss_dict)  # Fallback to dict if creation fails
+        else:  # Fallback if types.SafetySetting is not available
+            api_safety_settings_objects = self.safety_settings_list_of_dicts
+
+        api_generation_config_obj = None
+        if types and hasattr(types, 'GenerateContentConfig'):
+            api_generation_config_obj = types.GenerateContentConfig(
+                **self.generation_config_dict,
+                safety_settings=api_safety_settings_objects
+            )
+        else:  # Fallback if types.GenerateContentConfig is not available
+            config_dict_for_api = {**self.generation_config_dict, "safety_settings": api_safety_settings_objects}
+            api_generation_config_obj = config_dict_for_api
 
+        try:
+            response = await asyncio.to_thread(
+                client.models.generate_content,
+                model=self.llm_model_name,
+                contents=api_call_contents,
+                config=api_generation_config_obj
+            )
+            if not response.candidates: return "HR insights generation failed: No response from AI."
+            return response.text.strip()
+        except Exception as e:
+            logging.error(f"Error generating HR insights: {e}", exc_info=True)
+            return f"Error generating HR insights: {str(e)}"
+
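The `getattr` lookups above accept either an enum member or its string name (with or without a class prefix) and map it onto the real enum, falling back to a default when the name is unknown. The mechanism in miniature, on a toy enum (illustrative only):

    from enum import Enum

    class Color(Enum):
        RED = 1
        BLUE = 2

    for raw in ["Color.RED", "BLUE", Color.BLUE]:
        name = raw.split(".")[-1] if isinstance(raw, str) else raw.name
        print(getattr(Color, name, Color.RED))  # -> Color.RED, Color.BLUE, Color.BLUE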
+    def _validate_query(self, query: str) -> bool:
+        if not query or len(query.strip()) < 3:
+            logging.warning(f"Query too short: '{query}'")
+            return False
+        hr_keywords = ['employee', 'talent', 'hiring', 'culture', 'brand', 'engagement', 'retention', 'follower', 'post', 'mention', 'linkedin']
+        if not any(keyword in query.lower() for keyword in hr_keywords):
+            logging.warning(f"Query may not be HR/LinkedIn-relevant: {query[:50]}")
+        return True
+
+    def _get_query_help_message(self) -> str:
+        return ("I'm here to help with Employer Branding analysis on LinkedIn data. "
+                "Please ask specific questions about your followers, posts, or mentions. "
+                "For example: 'What are the top industries of my followers?' or 'Analyze the engagement trend of my recent posts.'")
+
+    async def _check_system_readiness(self) -> dict:
+        logging.debug("_check_system_readiness is a placeholder.")
+        if not client: return {'ready': False, 'reason': 'AI Client not initialized.'}
+        if self.rag_system.embeddings is None:
+            logging.warning("RAG embeddings not yet initialized. Proceeding, but RAG context will be unavailable.")
+        return {'ready': True, 'reason': 'System appears ready.'}
+
+    def _get_fallback_response(self, query: str) -> str:
+        logging.error(f"Executing fallback response for query: {query[:50]}")
+        return "I encountered an unexpected issue while processing your request. Please try rephrasing your query or try again later."
+
+    async def _core_query_processing(self, raw_user_query_this_turn: str) -> str:
         augmented_current_user_prompt_text = await self._build_prompt_for_current_turn(raw_user_query_this_turn)
+        api_call_contents = list(self.chat_history)
         api_call_contents.append({"role": "user", "parts": [{"text": augmented_current_user_prompt_text}]})
         logging.debug(f"Sending to GenAI. Total turns in content: {len(api_call_contents)}")
 
+        api_safety_settings_objects = []
+        if types and hasattr(types, 'SafetySetting'):
             for ss_dict in self.safety_settings_list_of_dicts:
                 try:
+                    category_enum_val = ss_dict['category']
+                    threshold_enum_val = ss_dict['threshold']
+                    # If they are already enum members, use them directly
+                    if not isinstance(category_enum_val, str):  # Assumes it's an enum member
+                        category = category_enum_val
+                    else:  # If string, try to get from types.HarmCategory
+                        category = getattr(types.HarmCategory, category_enum_val.split('.')[-1], types.HarmCategory.HARM_CATEGORY_UNSPECIFIED)
+
+                    if not isinstance(threshold_enum_val, str):  # Assumes it's an enum member
+                        threshold = threshold_enum_val
+                    else:  # If string, try to get from types.HarmBlockThreshold
+                        threshold = getattr(types.HarmBlockThreshold, threshold_enum_val.split('.')[-1], types.HarmBlockThreshold.BLOCK_NONE)
+
+                    api_safety_settings_objects.append(types.SafetySetting(category=category, threshold=threshold))
+                except Exception as e_ss_core:
+                    logging.warning(f"Could not create SafetySetting object from {ss_dict} in core: {e_ss_core}. Using dict.")
+                    api_safety_settings_objects.append(ss_dict)  # Fallback
+        else:
+            api_safety_settings_objects = self.safety_settings_list_of_dicts
 
+        api_generation_config_obj = None
+        if types and hasattr(types, 'GenerateContentConfig'):
+            api_generation_config_obj = types.GenerateContentConfig(
+                **self.generation_config_dict,
+                safety_settings=api_safety_settings_objects
+            )
+        else:
+            logging.error("GenerateContentConfig type not available. API call might fail.")
+            config_dict_for_api = {**self.generation_config_dict, "safety_settings": api_safety_settings_objects}
+            api_generation_config_obj = config_dict_for_api
+
+        response = await asyncio.to_thread(
+            client.models.generate_content,
+            model=self.llm_model_name,
+            contents=api_call_contents,
+            config=api_generation_config_obj
+        )
 
+        if not response.candidates:
+            block_reason = response.prompt_feedback.block_reason if response.prompt_feedback else "Unknown"
+            block_message = response.prompt_feedback.block_reason_message if response.prompt_feedback else ""
+            logging.warning(f"AI response blocked or empty. Reason: {block_reason}, Msg: {block_message}")
+            error_message = f"The AI's response was blocked. Reason: {block_reason}."
+            if block_message: error_message += f" Details: {block_message}"
+            return error_message
+        return response.text.strip()
+
+    async def _process_query_with_timeout(self, raw_user_query_this_turn: str, timeout_seconds: int = 60) -> str:
+        try:
+            return await asyncio.wait_for(self._core_query_processing(raw_user_query_this_turn), timeout=timeout_seconds)
+        except asyncio.TimeoutError:
+            logging.error(f"Query processing timed out after {timeout_seconds} seconds for query: {raw_user_query_this_turn[:50]}")
+            return "I'm sorry, but your request took too long to process. Please try a simpler query or try again later."
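`asyncio.wait_for` cancels the wrapped coroutine once the deadline passes and raises `TimeoutError` in the caller; a minimal standalone illustration of the pattern:

    import asyncio

    async def slow_job() -> str:
        await asyncio.sleep(5)
        return "done"

    async def main() -> None:
        try:
            print(await asyncio.wait_for(slow_job(), timeout=1))
        except asyncio.TimeoutError:
            print("timed out")  # slow_job was cancelled at the 1s deadline

    asyncio.run(main())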
 
+    async def process_query(self, raw_user_query_this_turn: str) -> str:
+        if not client:
+            logging.error("GenAI client not initialized. Cannot process query.")
+            return "Error: The AI Agent is not available due to a configuration issue with the AI service."
+        if not self._validate_query(raw_user_query_this_turn): return self._get_query_help_message()
+        readiness_check = await self._check_system_readiness()
+        if not readiness_check['ready']: return f"System not ready: {readiness_check['reason']}"
+
+        max_retries = 2
+        for attempt in range(max_retries + 1):
+            try:
+                response_text = await self._process_query_with_timeout(raw_user_query_this_turn)
+                if "The AI's response was blocked" in response_text: return response_text
+                logging.info(f"Successfully received AI response (attempt {attempt+1}): {response_text[:100]}")
+                return response_text
+            except RateLimitError as rle:
+                logging.warning(f"Rate limit encountered on attempt {attempt + 1}: {rle}. Retrying after backoff...")
+                if attempt == max_retries:
+                    logging.error("Max retries reached due to rate limiting.")
+                    return "The AI service is currently busy. Please try again in a few moments."
+                await asyncio.sleep(2 ** attempt)
+            except ValidationError as ve:
+                logging.warning(f"Validation error during processing: {ve}")
+                return f"Query validation failed: {str(ve)}"
+            except Exception as e:
+                logging.error(f"Error during GenAI call on attempt {attempt + 1}: {e}", exc_info=True)
+                if attempt == max_retries:
+                    logging.error("Max retries reached due to general errors.")
+                    return self._get_fallback_response(raw_user_query_this_turn)
+        return self._get_fallback_response(raw_user_query_this_turn)
+
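The `2 ** attempt` sleep yields 1s, then 2s between retries; the same loop stripped to its skeleton, with jitter added to avoid synchronized retries (a generic sketch, names are illustrative):

    import asyncio
    import random

    async def call_with_backoff(make_call, max_retries: int = 2):
        for attempt in range(max_retries + 1):
            try:
                return await make_call()
            except Exception:
                if attempt == max_retries:
                    raise  # out of retries; surface the error
                await asyncio.sleep(2 ** attempt + random.random())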
+    def _classify_query_type(self, query: str) -> str:
+        query_lower = query.lower()
+        if any(word in query_lower for word in ['trend', 'growth', 'change', 'time']): return 'trend_analysis'
+        elif any(word in query_lower for word in ['compare', 'benchmark', 'versus']): return 'comparative_analysis'
+        elif any(word in query_lower for word in ['predict', 'forecast', 'future']): return 'predictive_analysis'
+        elif any(word in query_lower for word in ['recommend', 'suggest', 'improve', 'advice', 'help me with']): return 'recommendation_engine'
+        elif any(word in query_lower for word in ['what is', 'explain', 'define']): return 'definition_explanation'
+        else: return 'general_inquiry'
+
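Keyword routing like this is order-sensitive (the trend branch wins any tie) and easy to sanity-check against a constructed `agent` instance (assumed already initialized):

    for q in ["Compare us versus competitors",
              "Predict follower count next quarter",
              "What is employer branding?"]:
        print(q, "->", agent._classify_query_type(q))
    # -> comparative_analysis, predictive_analysis, definition_explanation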
+    def clear_chat_history(self):
         self.chat_history = []
         logging.info("EmployerBrandingAgent chat history cleared by request.")
 
 def get_all_schemas_representation(all_dataframes: dict) -> str:
+    if not all_dataframes: return "No DataFrames are currently loaded."
     schema_descriptions = ["DataFrames currently available in the application state:"]
     for key, df in all_dataframes.items():
         df_name = f"df_{key}"
         if df.empty:
             schema = f"\n--- DataFrame: {df_name} ---\nStatus: Empty\nShape: {shape}\nColumns: {columns}"
         else:
+            sample_data_str = df.head(2).to_markdown(index=False)
             schema = (f"\n--- DataFrame: {df_name} ---\nShape: {shape}\nColumns: {columns}\n\n<details><summary>Sample Data (first 2 rows of {df_name}):</summary>\n\n{sample_data_str}\n\n</details>")
         schema_descriptions.append(schema)
     return "\n".join(schema_descriptions)
 
+async def test_rag_retrieval_accuracy():
+    logging.info("Running RAG retrieval accuracy test...")
+    test_embedding_model = GEMINI_EMBEDDING_MODEL_NAME
+    if not client:
+        logging.error("Cannot run RAG test: GenAI client not initialized.")
+        return
+    test_docs_data = {
+        'text': [
+            'Strategies for improving employee engagement include regular feedback and recognition programs.',
+            'Effective talent acquisition requires a strong employer brand and a streamlined hiring process.',
+            'Company culture is a key driver of employee satisfaction and retention.',
+            'Analyzing LinkedIn post performance can reveal insights into content effectiveness.'
+        ]
+    }
+    test_docs_df = pd.DataFrame(test_docs_data)
+    rag_system = AdvancedRAGSystem(test_docs_df, test_embedding_model)
+    logging.info("Test RAG: Initializing embeddings...")
+    await rag_system.initialize_embeddings()
+    if rag_system.embeddings is None or rag_system.embeddings.size == 0:
+        logging.error("Test RAG: Embeddings not initialized properly.")
+        return
+    test_queries = {
+        "employee engagement": "engagement",
+        "hiring talent": "acquisition",
+        "company culture": "culture",
+        "linkedin posts": "linkedin"
+    }
+    all_tests_passed = True
+    for query, keyword in test_queries.items():
+        logging.info(f"Test RAG: Retrieving for query: '{query}'")
+        result = await rag_system.retrieve_relevant_info(query, top_k=1, min_similarity=0.1)
+        if result and keyword.lower() in result.lower():
+            logging.info(f"Test RAG: PASSED for query '{query}'. Found relevant doc.")
+        else:
+            logging.error(f"Test RAG: FAILED for query '{query}'. Expected keyword '{keyword}', got: {result[:100]}...")
+            all_tests_passed = False
+    if all_tests_passed: logging.info("All RAG retrieval accuracy tests passed.")
+    else: logging.error("Some RAG retrieval accuracy tests FAILED.")
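The test coroutine has no entry point in this commit; a minimal way to invoke it locally (assumes GEMINI_API_KEY is set so the module-level client exists):

    if __name__ == "__main__":
        import asyncio
        logging.basicConfig(level=logging.INFO)
        asyncio.run(test_rag_retrieval_accuracy())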