GuglielmoTor committed
Commit c9f7ea0 · verified · 1 Parent(s): f1d603c

Update eb_agent_module.py

Files changed (1)
  1. eb_agent_module.py +37 -68
eb_agent_module.py CHANGED
@@ -274,21 +274,12 @@ class PandasLLM:
             logging.error("PandasLLM: Gemini model not initialized. Cannot call API.")
             return "# Error: Gemini model not available. Check API key and configuration."
 
-        # Construct content for Gemini API
-        # The new API expects a list of Content objects, or a list of dicts
-        # For chat-like interaction, history should be managed.
-        # For single-turn code generation, a simple user prompt might suffice.
-
-        # For now, let's assume single turn for code generation for simplicity in PandasLLM context
-        # If this were a conversational agent, history would be crucial.
         contents_for_api = [{"role": "user", "parts": [{"text": prompt_text}]}]
-        if history: # If history is provided, prepend it
-            # Ensure history is in the correct format [{'role':'user/model', 'parts':[{'text':...}]}]
-            # This part might need adjustment based on how history is structured by the calling agent
+        if history:
             formatted_history = []
             for entry in history:
-                role = entry.get("role", "user") # Default to user if role not specified
-                if role == "assistant": role = "model" # Gemini uses "model" for assistant
+                role = entry.get("role", "user")
+                if role == "assistant": role = "model"
                 formatted_history.append({"role": role, "parts": [{"text": entry.get("content", "")}]})
             contents_for_api = formatted_history + contents_for_api
 
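The role remap in this hunk exists because Gemini's content format accepts only "user" and "model" roles, while OpenAI-style histories use "assistant". A minimal standalone sketch of the same conversion (the function and variable names here are illustrative, not part of the module):

```python
# Sketch: build a Gemini-style "contents" list from an OpenAI-style history.
def to_gemini_contents(history: list, prompt_text: str) -> list:
    contents = []
    for entry in history:
        role = entry.get("role", "user")
        if role == "assistant":
            role = "model"  # Gemini's name for the assistant side
        contents.append({"role": role, "parts": [{"text": entry.get("content", "")}]})
    # The current user prompt always goes last.
    contents.append({"role": "user", "parts": [{"text": prompt_text}]})
    return contents

print(to_gemini_contents([{"role": "assistant", "content": "hi"}], "follow-up"))
```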
@@ -302,21 +293,18 @@ class PandasLLM:
         logging.info(f"\n--- Calling Gemini API with prompt (first 500 chars of last message): ---\n{contents_for_api[-1]['parts'][0]['text'][:500]}...\n-------------------------------------------------------\n")
 
         try:
-            # Using asyncio.to_thread for the blocking SDK call
             response = await asyncio.to_thread(
                 self.model.generate_content,
-                contents=contents_for_api, # Pass the constructed content
+                contents=contents_for_api,
                 generation_config=gen_config_obj,
-                # stream=False # Ensure non-streaming for this setup
             )
 
             if response.prompt_feedback and response.prompt_feedback.block_reason:
                 reason = response.prompt_feedback.block_reason
-                reason_name = getattr(reason, 'name', str(reason)) # Handle if reason is enum or string
+                reason_name = getattr(reason, 'name', str(reason))
                 logging.warning(f"Gemini API call blocked by prompt feedback: {reason_name}")
                 return f"# Error: Prompt blocked due to content policy: {reason_name}."
 
-            # Try to extract text, accounting for different response structures
             llm_output = ""
             if hasattr(response, 'text') and response.text:
                 llm_output = response.text
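`generate_content` is a synchronous SDK call, so the hunk wraps it in `asyncio.to_thread` to keep the event loop responsive. A self-contained sketch of that pattern, with a stand-in function in place of the real SDK call:

```python
import asyncio
import time

def blocking_sdk_call(payload: str) -> str:
    # Stand-in for a synchronous SDK method such as generate_content.
    time.sleep(0.1)
    return f"response to {payload!r}"

async def main():
    # asyncio.to_thread (Python 3.9+) runs the blocking function in a
    # worker thread, so the event loop can keep serving other tasks.
    result = await asyncio.to_thread(blocking_sdk_call, "hello")
    print(result)

asyncio.run(main())
```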
@@ -325,12 +313,11 @@ class PandasLLM:
                 if candidate.content and candidate.content.parts:
                     llm_output = "".join(part.text for part in candidate.content.parts if hasattr(part, 'text'))
 
-                # Check finish reason if output is empty
                 if not llm_output:
                     finish_reason_val = candidate.finish_reason
-                    finish_reason = getattr(finish_reason_val, 'name', str(finish_reason_val)) # Handle enum or string
+                    finish_reason = getattr(finish_reason_val, 'name', str(finish_reason_val))
                     logging.warning(f"No text content in response candidate. Finish reason: {finish_reason}")
-                    if finish_reason == "SAFETY": # Check against genai_types.FinishReason.SAFETY if available
+                    if finish_reason == "SAFETY":
                         return f"# Error: Response generation stopped due to safety reasons ({finish_reason})."
                     elif finish_reason == "RECITATION":
                         return f"# Error: Response generation stopped due to recitation policy ({finish_reason})."
@@ -342,12 +329,11 @@ class PandasLLM:
             logging.info(f"--- Gemini API Response (first 300 chars): ---\n{llm_output[:300]}...\n--------------------------------------------------\n")
             return llm_output
 
-        except AttributeError as ae: # Catch issues with dummy genai objects if API key missing
+        except AttributeError as ae:
             logging.error(f"AttributeError during Gemini call (likely due to missing API key/dummy objects): {ae}", exc_info=True)
             return f"# Error (Attribute): {type(ae).__name__} - {ae}. Check if GEMINI_API_KEY is set and google.genai library is correctly installed."
         except Exception as e:
             logging.error(f"Error calling Gemini API: {e}", exc_info=True)
-            # More specific error messages
             if "API_KEY_INVALID" in str(e) or "API key not valid" in str(e):
                 return "# Error: Gemini API key is not valid. Please check your GEMINI_API_KEY environment variable."
             if "400" in str(e) and "model" in str(e).lower() and ("not found" in str(e).lower() or "does not exist" in str(e).lower()):
@@ -360,59 +346,51 @@ class PandasLLM:
 
 
     async def query(self, prompt_with_query_and_context: str, dataframes_dict: dict, history: list = None) -> str:
-        """
-        Sends a prompt to the LLM and optionally executes the returned Python code in a sandbox.
-        dataframes_dict: Keys are 'base_name' (e.g., 'profiles'), values are pd.DataFrame.
-        In exec, they are available as 'df_base_name'.
-        history: Optional chat history for conversational context.
-        """
         llm_response_text = await self._call_gemini_api_async(prompt_with_query_and_context, history)
 
         if self.force_sandbox:
-            # Attempt to extract Python code block
            code_to_execute = ""
            if "```python" in llm_response_text:
                try:
                    code_to_execute = llm_response_text.split("```python\n", 1)[1].split("\n```", 1)[0]
                except IndexError:
-                    # This might happen if the format is slightly off, e.g. no newline after ```python
                    try:
                        code_to_execute = llm_response_text.split("```python", 1)[1].split("```", 1)[0]
-                        if code_to_execute.startswith("\n"): code_to_execute = code_to_execute[1:] # remove leading newline
-                        if code_to_execute.endswith("\n"): code_to_execute = code_to_execute[:-1] # remove trailing newline
-
+                        if code_to_execute.startswith("\n"): code_to_execute = code_to_execute[1:]
+                        if code_to_execute.endswith("\n"): code_to_execute = code_to_execute[:-1]
                    except IndexError:
-                        code_to_execute = "" # Fallback, code not extracted
+                        code_to_execute = ""
                        logging.warning("Could not extract Python code using primary or secondary split method.")
 
+            llm_response_text_for_sandbox_error = "" # Initialize this variable
            if llm_response_text.startswith("# Error:") or not code_to_execute:
                error_prefix = "LLM did not return a valid Python code block or an error occurred."
                if llm_response_text.startswith("# Error:"): error_prefix = "An error occurred during LLM call."
                elif not code_to_execute: error_prefix = "Could not extract Python code from LLM response."
 
-                # Sanitize llm_response_text before printing to avoid breaking f-string or print
                safe_llm_response = str(llm_response_text).replace("'''", "'").replace('"""', '"')
-                code_for_error_display = f"print(f'''{error_prefix}\\nRaw LLM Response (may be truncated):\\n{safe_llm_response[:1000]}''')"
+                llm_response_text_for_sandbox_error = f"print(f'''{error_prefix}\\nRaw LLM Response (may be truncated):\\n{safe_llm_response[:1000]}''')"
                logging.warning(f"Problem with LLM response for sandbox: {error_prefix}")
-                # Fallback to printing the raw response or error
-                llm_response_text_for_sandbox_error = code_for_error_display
-
-
+
            logging.info(f"\n--- Code to Execute (from LLM, if sandbox): ---\n{code_to_execute}\n------------------------------------------------\n")
 
-            # Define a restricted set of built-ins
-            safe_builtins = {name: obj for name, obj in __builtins__.__dict__.items() if not name.startswith('_')}
-            # More aggressive removal (example, adjust as needed for security)
-            # For a web app, this sandboxing is CRITICAL.
-            # Consider using a dedicated sandboxing library if security is paramount.
+            # --- THIS IS THE CORRECTED SECTION ---
+            # In the exec environment, __builtins__ is a dict.
+            # We iterate over its items directly.
+            safe_builtins = {}
+            if isinstance(__builtins__, dict):
+                safe_builtins = {name: obj for name, obj in __builtins__.items() if not name.startswith('_')}
+            else: # Fallback if __builtins__ is a module (e.g. in standard Python interpreter)
+                safe_builtins = {name: obj for name, obj in __builtins__.__dict__.items() if not name.startswith('_')}
+            # --- END OF CORRECTION ---
+
            unsafe_builtins = ['eval', 'exec', 'open', 'compile', 'input', 'memoryview', 'vars', 'globals', 'locals', '__import__']
            for ub in unsafe_builtins:
                safe_builtins.pop(ub, None)
 
-            # Prepare globals for exec: pandas, numpy, dataframes, and restricted builtins
            exec_globals = {'pd': pd, 'np': np, '__builtins__': safe_builtins}
            for name, df_instance in dataframes_dict.items():
-                exec_globals[f"df_{name}"] = df_instance # e.g. df_follower_stats, df_posts
+                exec_globals[f"df_{name}"] = df_instance
 
            from io import StringIO
            import sys
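The corrected section works around a real asymmetry: `__builtins__` is the `builtins` module in a `__main__` script but a plain dict inside imported modules. One alternative sketch that avoids branching on that shape is to build the allowlist from the `builtins` module directly (names below are illustrative; as the removed comments themselves warned, this is not a hardened sandbox):

```python
import builtins

# Build an allowlist of "safe" builtins from the builtins module itself,
# which has the same shape regardless of how the file is executed.
safe_builtins = {name: getattr(builtins, name)
                 for name in dir(builtins) if not name.startswith("_")}
for blocked in ("eval", "exec", "open", "compile", "input", "__import__"):
    safe_builtins.pop(blocked, None)

exec_globals = {"__builtins__": safe_builtins}
exec("print(sum([1, 2, 3]))", exec_globals, {})   # prints 6
try:
    exec("open('x')", exec_globals, {})
except NameError as e:
    print("blocked:", e)  # 'open' is not defined inside the sandbox
```

Stripping names from `__builtins__` stops casual misuse but is not a security boundary; for untrusted code a dedicated sandboxing layer is still advisable.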
@@ -421,12 +399,12 @@ class PandasLLM:
 
            final_output_str = ""
            try:
-                if code_to_execute: # Only execute if code was extracted
-                    exec(code_to_execute, exec_globals, {}) # Empty locals
+                if code_to_execute:
+                    exec(code_to_execute, exec_globals, {})
                    output_val = captured_output.getvalue()
                    final_output_str = output_val if output_val else "# Code executed successfully, but no explicit print() output was generated by the code."
                    logging.info(f"--- Sandbox Execution Output: ---\n{final_output_str}\n-------------------------\n")
-                else: # No code to execute, use the prepared error message
+                else:
                    exec(llm_response_text_for_sandbox_error, exec_globals, {})
                    final_output_str = captured_output.getvalue()
                    logging.warning(f"--- Sandbox Fallback Output (No Code Executed): ---\n{final_output_str}\n-------------------------\n")
@@ -434,11 +412,11 @@ class PandasLLM:
            except Exception as e:
                error_msg = f"# Error executing LLM-generated code:\n# {type(e).__name__}: {str(e)}\n# --- Code that caused error: ---\n{textwrap.indent(code_to_execute, '# ')}"
                final_output_str = error_msg
-                logging.error(error_msg, exc_info=False) # exc_info=False to avoid huge traceback in Gradio UI
+                logging.error(error_msg, exc_info=False)
            finally:
-                sys.stdout = old_stdout # Reset stdout
+                sys.stdout = old_stdout
            return final_output_str
-        else: # Not force_sandbox, return LLM text directly
+        else:
            return llm_response_text
 
 
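The error report in this hunk leans on `textwrap.indent` to prefix every line of the offending code with `# `, so the whole report stays valid as a Python comment block. A small runnable illustration (the failing snippet is made up):

```python
import textwrap

failing_code = "df = undefined_name\nprint(df)"
try:
    exec(failing_code, {}, {})
except Exception as e:
    # Prefix each offending line so the report reads as commented-out code.
    report = (f"# Error executing LLM-generated code:\n"
              f"# {type(e).__name__}: {e}\n"
              f"# --- Code that caused error: ---\n"
              f"{textwrap.indent(failing_code, '# ')}")
    print(report)
```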
@@ -449,13 +427,12 @@ class EmployerBrandingAgent:
                 data_privacy=True, force_sandbox=True):
        self.pandas_llm = PandasLLM(llm_model_name, generation_config_params, safety_settings, data_privacy, force_sandbox)
        self.rag_system = AdvancedRAGSystem(rag_documents_df, embedding_model_name)
-        self.all_dataframes = all_dataframes # Keys are 'base_name', values are pd.DataFrame
+        self.all_dataframes = all_dataframes
        self.schemas_representation = get_all_schemas_representation(self.all_dataframes)
-        self.chat_history = [] # Stores conversation history for this agent instance
+        self.chat_history = []
        logging.info("EmployerBrandingAgent Initialized.")
 
    def _build_prompt(self, user_query: str, role="Employer Branding Analyst", task_decomposition_hint=None, cot_hint=True) -> str:
-        # Base prompt
        prompt = f"You are a helpful and expert '{role}'. Your primary goal is to assist with analyzing LinkedIn-related data using Pandas DataFrames.\n"
        prompt += "You will be provided with schemas for available Pandas DataFrames and a user query.\n"
 
@@ -471,13 +448,12 @@ class EmployerBrandingAgent:
            prompt += "If the query is ambiguous or requires clarification, ask for it instead of making assumptions. If the query cannot be answered with the given data, state that clearly.\n"
            prompt += "If the query is not about data analysis or code generation (e.g. 'hello', 'how are you?'), respond politely and briefly, do not attempt to generate code.\n"
            prompt += "Structure your code clearly. Add comments (#) to explain each step of your logic.\n"
-        else: # Textual response mode
+        else:
            prompt += "Your task is to analyze the data and provide a comprehensive textual answer to the user query. You can explain your reasoning step-by-step.\n"
 
        prompt += "\n--- AVAILABLE DATA AND SCHEMAS ---\n"
        prompt += self.schemas_representation
 
-        # RAG Context (only add if relevant context is found)
        rag_context = self.rag_system.retrieve_relevant_info(user_query)
        if rag_context and "[RAG Context]" in rag_context and "No specific pre-defined context found" not in rag_context and "No highly relevant passages found" not in rag_context:
            prompt += f"\n--- ADDITIONAL CONTEXT (from internal knowledge base, consider this information) ---\n{rag_context}\n"
@@ -497,7 +473,7 @@ class EmployerBrandingAgent:
            prompt += "5. Ensure output: Use `print()` for all results that should be displayed. For DataFrames, you can print the DataFrame directly, or `df.to_string()` if it's large.\n"
            prompt += "6. Review: Check for correctness, efficiency, and adherence to the prompt (especially the `print()` requirement).\n"
            prompt += "7. Generate ONLY the Python code block starting with ```python and ending with ```. No explanations outside the code block's comments.\n"
-        else: # Textual CoT
+        else:
            prompt += "\n--- INSTRUCTIONS FOR RESPONSE (Chain of Thought) ---\n"
            prompt += "1. Understand the query fully.\n"
            prompt += "2. Identify the relevant data sources (DataFrames and columns).\n"
@@ -512,22 +488,15 @@ class EmployerBrandingAgent:
        logging.info(f"\n=== Processing Query for Role: {role} ===")
        logging.info(f"User Query: {user_query}")
 
-        # Add user query to chat history
        self.chat_history.append({"role": "user", "content": user_query})
 
        full_prompt = self._build_prompt(user_query, role, task_decomposition_hint, cot_hint)
 
-        # Pass relevant parts of chat history to pandas_llm.query if needed for context
-        # For now, PandasLLM's _call_gemini_api_async is set up for single turn for code gen,
-        # but can be adapted if conversational context for code gen becomes important.
-        # The full_prompt itself is rebuilt each time, incorporating the latest user_query.
-        response_text = await self.pandas_llm.query(full_prompt, self.all_dataframes, history=self.chat_history[:-1]) # Pass history excluding current query
+        response_text = await self.pandas_llm.query(full_prompt, self.all_dataframes, history=self.chat_history[:-1])
 
-        # Add assistant response to chat history
        self.chat_history.append({"role": "assistant", "content": response_text})
 
-        # Limit history size to avoid overly long prompts in future turns (e.g., last 10 messages)
-        MAX_HISTORY_TURNS = 5 # 5 pairs of user/assistant messages
+        MAX_HISTORY_TURNS = 5
        if len(self.chat_history) > MAX_HISTORY_TURNS * 2:
            self.chat_history = self.chat_history[-(MAX_HISTORY_TURNS * 2):]
 
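The trimming at the end keeps the prompt from growing without bound: with `MAX_HISTORY_TURNS = 5`, only the last five user/assistant pairs (ten messages) survive. A minimal sketch of the same bound (names are illustrative):

```python
MAX_HISTORY_TURNS = 5  # five user/assistant pairs, i.e. ten messages

def trim_history(chat_history: list) -> list:
    limit = MAX_HISTORY_TURNS * 2
    return chat_history[-limit:] if len(chat_history) > limit else chat_history

history = [{"role": "user", "content": f"q{i}"} for i in range(14)]
print(len(trim_history(history)))  # 10
```

A `collections.deque(maxlen=MAX_HISTORY_TURNS * 2)` would enforce the same bound without explicit slicing.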