GuglielmoTor committed (verified) · Commit a5ee064 · Parent(s): 56bc649

Update eb_agent_module.py

Files changed (1):
  1. eb_agent_module.py +651 -230

eb_agent_module.py CHANGED
@@ -9,395 +9,693 @@ import textwrap
 
 # Attempt to import Google Generative AI and related types
 try:
-    from google import genai
-    from google.genai import types as genai_types
 except ImportError:
     print("Google Generative AI library not found. Please install it: pip install google-generativeai")
     # Define dummy classes/functions if the import fails, to allow the rest of the script to be parsed
     class genai: # type: ignore
         @staticmethod
-        def configure(api_key): pass
-
-        @staticmethod
-        def Client(api_key=None):
-            class DummyModels:
                 @staticmethod
-                def generate_content(model=None, contents=None, config=None, safety_settings=None): # Added config, kept safety_settings for older dummy
-                    print(f"Dummy genai.Client.models.generate_content called for model: {model} with config: {config}, safety_settings: {safety_settings}")
                     class DummyPart:
                         def __init__(self, text): self.text = text
                     class DummyContent:
-                        def __init__(self): self.parts = [DummyPart("# Dummy response from dummy client")]
                     class DummyCandidate:
                         def __init__(self):
                             self.content = DummyContent()
-                            self.finish_reason = "DUMMY"
-                            self.safety_ratings = [] # Ensure this attribute exists
                     class DummyResponse:
                         def __init__(self):
                             self.candidates = [DummyCandidate()]
-                            self.prompt_feedback = None # Ensure this attribute exists
-                        @property
-                        def text(self):
-                            if self.candidates and self.candidates[0].content and self.candidates[0].content.parts:
-                                return "".join(p.text for p in self.candidates[0].content.parts)
-                            return ""
                     return DummyResponse()
 
-            class DummyClient:
-                def __init__(self): self.models = DummyModels()
-
-            if api_key: return DummyClient()
-            return None
 
         @staticmethod
-        def GenerativeModel(model_name):
-            print(f"Dummy genai.GenerativeModel called for model: {model_name}")
-            return None
-
         @staticmethod
-        def embed_content(model, content, task_type, title=None):
-            print(f"Dummy genai.embed_content called for model: {model}")
-            return {"embedding": [0.1] * 768}
 
     class genai_types: # type: ignore
         @staticmethod
-        def GenerateContentConfig(**kwargs): # The dummy now just returns the kwargs
-            print(f"Dummy genai_types.GenerateContentConfig called with: {kwargs}")
-            return kwargs
-
-        # Dummy SafetySetting to allow instantiation if real genai_types is missing
         @staticmethod
         def SafetySetting(category, threshold):
             print(f"Dummy SafetySetting created: category={category}, threshold={threshold}")
             return {"category": category, "threshold": threshold} # Return a dict for dummy
 
-        class BlockReason:
-            SAFETY = "SAFETY"
-        class HarmCategory:
             HARM_CATEGORY_UNSPECIFIED = "HARM_CATEGORY_UNSPECIFIED"
             HARM_CATEGORY_HARASSMENT = "HARM_CATEGORY_HARASSMENT"
             HARM_CATEGORY_HATE_SPEECH = "HARM_CATEGORY_HATE_SPEECH"
             HARM_CATEGORY_SEXUALLY_EXPLICIT = "HARM_CATEGORY_SEXUALLY_EXPLICIT"
             HARM_CATEGORY_DANGEROUS_CONTENT = "HARM_CATEGORY_DANGEROUS_CONTENT"
         class HarmBlockThreshold:
             BLOCK_NONE = "BLOCK_NONE"
             BLOCK_LOW_AND_ABOVE = "BLOCK_LOW_AND_ABOVE"
             BLOCK_MEDIUM_AND_ABOVE = "BLOCK_MEDIUM_AND_ABOVE"
             BLOCK_ONLY_HIGH = "BLOCK_ONLY_HIGH"
 
  # --- Configuration ---
-GEMINI_API_KEY = os.getenv('GEMINI_API_KEY', "")
 LLM_MODEL_NAME = "gemini-2.0-flash"
 GEMINI_EMBEDDING_MODEL_NAME = "gemini-embedding-exp-03-07"
 
-# Base generation configuration for the LLM (without safety settings here)
 GENERATION_CONFIG_PARAMS = {
-    "temperature": 0.2,
     "top_p": 1.0,
     "top_k": 32,
-    "max_output_tokens": 4096,
 }
 
 # Default safety settings list for Gemini
-# This is now a list of SafetySetting objects (or dicts if using dummy)
 try:
-    DEFAULT_SAFETY_SETTINGS = [
         genai_types.SafetySetting(
             category=genai_types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
-            threshold=genai_types.HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
         ),
         genai_types.SafetySetting(
             category=genai_types.HarmCategory.HARM_CATEGORY_HARASSMENT,
-            threshold=genai_types.HarmBlockThreshold.BLOCK_NONE,
         ),
         genai_types.SafetySetting(
             category=genai_types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
-            threshold=genai_types.HarmBlockThreshold.BLOCK_NONE,
         ),
         genai_types.SafetySetting(
             category=genai_types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
-            threshold=genai_types.HarmBlockThreshold.BLOCK_NONE,
         ),
     ]
-except AttributeError as e:
     logging.warning(f"Could not define DEFAULT_SAFETY_SETTINGS using real genai_types: {e}. Using placeholder list of dicts.")
     DEFAULT_SAFETY_SETTINGS = [
-        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_LOW_AND_ABOVE"},
-        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
-        {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
-        {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
     ]
 
 
 # Logging setup
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(module)s - %(message)s')
 
 if GEMINI_API_KEY:
     try:
         genai.configure(api_key=GEMINI_API_KEY)
-        logging.info(f"Gemini API key configured globally...")
     except Exception as e:
         logging.error(f"Failed to configure Gemini API globally: {e}", exc_info=True)
 else:
-    logging.warning("GEMINI_API_KEY environment variable not set.")
 
-# --- RAG Documents Definition ---
 rag_documents_data = {
-    'Title': ["Employer Branding Best Practices 2024", "Attracting Tech Talent"],
-    'Text': ["Focus on authentic employee stories...", "Tech candidates value challenging projects..."]
-}
 df_rag_documents = pd.DataFrame(rag_documents_data)
 
 # --- Schema Representation ---
 def get_schema_representation(df_name: str, df: pd.DataFrame) -> str:
-    if df.empty: return f"Schema for DataFrame '{df_name}': Empty.\n"
-    # Truncated for brevity in example, keep your full version
-    return f"Schema for DataFrame 'df_{df_name}': {df.columns.tolist()[:5]}...\nSample:\n{df.head(1).to_string()}\n"
 def get_all_schemas_representation(dataframes_dict: dict) -> str:
-    # Truncated for brevity in example, keep your full version
-    return "".join(get_schema_representation(name, df) for name, df in dataframes_dict.items() if isinstance(df, pd.DataFrame))
 
164
  # --- Advanced RAG System ---
165
- class AdvancedRAGSystem: # Truncated for brevity, assume correct from previous versions
166
  def __init__(self, documents_df: pd.DataFrame, embedding_model_name: str):
167
  self.embedding_model_name = embedding_model_name
168
  self.documents_df = documents_df.copy()
169
- self.embeddings_generated = False
170
- if GEMINI_API_KEY and hasattr(genai, 'embed_content') and not (hasattr(genai.embed_content, '__func__') and genai.embed_content.__func__.__qualname__.startswith('genai.embed_content')):
 
 
171
  try:
172
- self._precompute_embeddings()
173
  self.embeddings_generated = True
174
- logging.info("RAG embeddings precomputed.")
175
- except Exception as e: logging.error(f"RAG precomputation error: {e}")
 
176
  else:
177
- logging.warning("RAG embeddings not precomputed (API key or genai.embed_content issue).")
178
 
179
- def _embed_fn(self, title: str, text: str) -> list[float]:
180
- if not self.embeddings_generated: return [0.0] * 768
 
 
181
  try:
182
- return genai.embed_content(model=self.embedding_model_name, content=text, task_type="retrieval_document", title=title)["embedding"]
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  except Exception as e:
184
- logging.error(f"Error in _embed_fn for '{title}': {e}")
185
  return [0.0] * 768
186
- def _precompute_embeddings(self):
 
187
  if 'Embeddings' not in self.documents_df.columns:
188
  self.documents_df['Embeddings'] = pd.Series(dtype='object')
189
- self.documents_df['Embeddings'] = self.documents_df.apply(lambda row: self._embed_fn(row['Title'], row['Text']), axis=1)
190
- def retrieve_relevant_info(self, query_text: str, top_k: int = 1) -> str:
191
- if not self.embeddings_generated or self.documents_df['Embeddings'].isnull().all():
192
- return "\n[RAG Context]\nEmbeddings not generated or all are null.\n"
193
- # Simplified retrieval logic for brevity
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  try:
195
- query_embedding = np.array(genai.embed_content(model=self.embedding_model_name, content=query_text, task_type="retrieval_query")["embedding"])
196
- # Filter out rows with invalid embeddings before stacking
 
 
 
 
 
197
  valid_embeddings_df = self.documents_df.dropna(subset=['Embeddings'])
198
- valid_embeddings_df = valid_embeddings_df[valid_embeddings_df['Embeddings'].apply(lambda x: isinstance(x, list) and len(x) > 0)]
 
 
 
199
 
200
- if valid_embeddings_df.empty: return "\n[RAG Context]\nNo valid document embeddings for RAG.\n"
201
-
202
  document_embeddings = np.stack(valid_embeddings_df['Embeddings'].apply(np.array).values)
203
- if query_embedding.shape[0] != document_embeddings.shape[1]: return "\n[RAG Context]\nEmbedding dimension mismatch.\n"
 
 
 
204
 
205
  dot_products = np.dot(document_embeddings, query_embedding)
206
- idx = np.argsort(dot_products)[-min(top_k, len(valid_embeddings_df)):][::-1]
207
-
208
- relevant_passages = "".join([f"\n[RAG Context from: '{valid_embeddings_df.iloc[i]['Title']}']\n{valid_embeddings_df.iloc[i]['Text']}\n" for i in idx])
209
- return relevant_passages if relevant_passages else "\n[RAG Context]\nNo relevant passages found.\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  except Exception as e:
211
- logging.error(f"Error in RAG retrieve_relevant_info: {e}")
212
- return f"\n[RAG Context]\nError during RAG retrieval: {e}\n"
213
 
214
 
 # --- PandasLLM Class (Gemini-Powered) ---
 class PandasLLM:
-    def __init__(self, llm_model_name: str,
-                 generation_config_dict: dict,
-                 safety_settings_list: list,
                  data_privacy=True, force_sandbox=True):
         self.llm_model_name = llm_model_name
-        self.generation_config_dict = generation_config_dict
-        self.safety_settings_list = safety_settings_list
         self.data_privacy = data_privacy
         self.force_sandbox = force_sandbox
-        self.client = None
-        self.generative_model_service = None
 
         if not GEMINI_API_KEY:
-            logging.warning("PandasLLM: GEMINI_API_KEY not set.")
-        else:
             try:
-                self.client = genai.Client(api_key=GEMINI_API_KEY)
-                if self.client and hasattr(self.client, 'models') and hasattr(self.client.models, 'generate_content'):
-                    self.generative_model_service = self.client.models
-                    logging.info(f"PandasLLM: Using client.models for '{self.llm_model_name}'.")
-                elif self.client and hasattr(self.client, 'generate_content'):
-                    self.generative_model_service = self.client
-                    logging.info(f"PandasLLM: Using client.generate_content for '{self.llm_model_name}'.")
-                else:
-                    logging.warning(f"PandasLLM: genai.Client suitable 'generate_content' not found.")
             except Exception as e:
-                logging.error(f"Failed to initialize PandasLLM with genai.Client: {e}", exc_info=True)
 
-    async def _call_gemini_api_async(self, prompt_text: str, history: list = None) -> str:
-        if not self.generative_model_service:
-            return "# Error: Gemini client/service not available."
 
-        contents_for_api = []
         if history:
             for entry in history:
                 role = "model" if entry.get("role") == "assistant" else entry.get("role", "user")
-                contents_for_api.append({"role": role, "parts": [{"text": entry.get("content", "")}]})
-        contents_for_api.append({"role": "user", "parts": [{"text": prompt_text}]})
 
-        api_config_object = None
-        try:
-            api_config_object = genai_types.GenerateContentConfig(
-                **self.generation_config_dict,
-                safety_settings=self.safety_settings_list
-            )
-        except Exception as e_cfg:
-            logging.error(f"Error creating GenerateContentConfig object: {e_cfg}.")
-            api_config_object = {**self.generation_config_dict, "safety_settings": self.safety_settings_list}
-
-        logging.info(f"\n--- Calling Gemini API via Client (model: {self.llm_model_name}) with config: {api_config_object} ---\n")
 
         try:
-            model_id_for_api = self.llm_model_name
-            if not model_id_for_api.startswith("models/"):
-                model_id_for_api = f"models/{model_id_for_api}"
-
-            response = await asyncio.to_thread(
-                self.generative_model_service.generate_content,
-                model=model_id_for_api,
                 contents=contents_for_api,
-                config=api_config_object
             )
 
-            if hasattr(response, 'prompt_feedback') and response.prompt_feedback and response.prompt_feedback.block_reason:
-                return f"# Error: Prompt blocked by API: {response.prompt_feedback.block_reason}."
 
             llm_output = ""
-            if hasattr(response, 'text') and response.text:
                 llm_output = response.text
-            elif hasattr(response, 'candidates') and response.candidates:
                 candidate = response.candidates[0]
-                if hasattr(candidate, 'content') and candidate.content and hasattr(candidate.content, 'parts') and candidate.content.parts:
                     llm_output = "".join(part.text for part in candidate.content.parts if hasattr(part, 'text'))
-                if not llm_output and hasattr(candidate, 'finish_reason'):
-                    return f"# Error: Empty response. Finish reason: {candidate.finish_reason}."
             else:
                 return f"# Error: Unexpected API response structure: {str(response)[:200]}"
             return llm_output
         except Exception as e:
-            logging.error(f"Error calling Gemini API via Client: {e}", exc_info=True)
             return f"# Error during API call: {type(e).__name__} - {str(e)[:100]}."
 
     async def query(self, prompt_with_query_and_context: str, dataframes_dict: dict, history: list = None) -> str:
         llm_response_text = await self._call_gemini_api_async(prompt_with_query_and_context, history)
         if self.force_sandbox:
             code_to_execute = ""
             if "```python" in llm_response_text:
                 try:
-                    code_to_execute = llm_response_text.split("```python\n", 1)[1].split("\n```", 1)[0]
-                except IndexError:
-                    try:
-                        code_to_execute = llm_response_text.split("```python", 1)[1].split("```", 1)[0]
-                        if code_to_execute.startswith("\n"): code_to_execute = code_to_execute[1:]
-                        if code_to_execute.endswith("\n"): code_to_execute = code_to_execute[:-1]
-                    except IndexError: code_to_execute = ""
-
-            if llm_response_text.startswith("# Error:") or not code_to_execute:
                 # If LLM returns an error or no code, pass that through directly.
-                # The user will see the LLM's error message or its non-code response.
-                logging.warning(f"LLM response is an error or not code: {llm_response_text}")
-                return llm_response_text
-
-            logging.info(f"\n--- Code to Execute: ---\n{code_to_execute}\n----------------------\n")
             from io import StringIO
             import sys
-            old_stdout = sys.stdout; sys.stdout = captured_output = StringIO()
-            # Ensure dataframes_dict is correctly populated for exec_globals
-            exec_globals = {'pd': pd, 'np': np}
-            for name, df_instance in dataframes_dict.items():
-                if isinstance(df_instance, pd.DataFrame):
-                    exec_globals[f"df_{name}"] = df_instance
-                else:
-                    logging.warning(f"Item '{name}' in dataframes_dict is not a DataFrame. Skipping for exec_globals.")
-
             try:
-                exec(code_to_execute, exec_globals, {})
                 final_output_str = captured_output.getvalue()
-                # Check if the output is just whitespace or truly empty
-                if not final_output_str.strip(): # If only whitespace or empty
-                    # This is where the "no print output" message originates.
-                    # We can now add a more informative message if the code itself ran without error.
                     logging.info("Code executed successfully, but no explicit print() output was generated by the LLM's code.")
-                    return "# Code executed successfully, but it did not produce any printed output. Please ensure the LLM's Python code includes print() statements for the desired results."
                 return final_output_str
             except Exception as e:
-                logging.error(f"Sandbox Execution Error: {e}\nCode was:\n{code_to_execute}", exc_info=False)
-                return f"# Sandbox Execution Error: {type(e).__name__}: {e}\n# --- Code that caused error: ---\n{textwrap.indent(code_to_execute, '# ')}"
-            finally:
                 sys.stdout = old_stdout
-        else:
             return llm_response_text
 
 # --- Employer Branding Agent ---
 class EmployerBrandingAgent:
-    def __init__(self, llm_model_name: str,
-                 generation_config_dict: dict,
-                 safety_settings_list: list,
-                 all_dataframes: dict,
-                 rag_documents_df: pd.DataFrame,
                  embedding_model_name: str,
                  data_privacy=True, force_sandbox=True):
 
         self.pandas_llm = PandasLLM(
-            llm_model_name,
-            generation_config_dict,
-            safety_settings_list,
-            data_privacy,
             force_sandbox
         )
         self.rag_system = AdvancedRAGSystem(rag_documents_df, embedding_model_name)
-        self.all_dataframes = all_dataframes
         self.schemas_representation = get_all_schemas_representation(self.all_dataframes)
-        self.chat_history = []
-        logging.info("EmployerBrandingAgent Initialized with updated safety settings handling.")
 
-    def _build_prompt(self, user_query: str, role="Employer Branding Analyst", task_decomposition_hint=None, cot_hint=True) -> str:
-        prompt = f"You are a helpful and expert '{role}'. Your primary goal is to assist with analyzing LinkedIn-related data using Pandas DataFrames.\n"
         prompt += "You will be provided with schemas for available Pandas DataFrames and a user query.\n"
 
         if self.pandas_llm.data_privacy:
-            prompt += "IMPORTANT: Be mindful of data privacy. Do not output raw Personally Identifiable Information (PII) like names or specific user details unless explicitly asked and absolutely necessary for the query. Summarize or aggregate data where possible.\n"
 
         if self.pandas_llm.force_sandbox:
-            prompt += "Your main task is to GENERATE PYTHON CODE using the Pandas library to answer the user query based on the provided DataFrames. Output ONLY the Python code block.\n"
             prompt += "The available DataFrames are already loaded and can be accessed by their dictionary keys prefixed with 'df_' (e.g., df_follower_stats, df_posts) within the execution environment.\n"
             prompt += "Example of accessing a DataFrame: `df_follower_stats['country']`.\n"
-            prompt += "CRITICAL INSTRUCTION: Your Python code MUST include `print()` statements for ANY results, DataFrames, or values that should be displayed as the answer to the user's query. The output of these `print()` statements will be the final answer shown to the user.\n"
-            prompt += "If you define a function to perform the analysis, you MUST call this function with the appropriate DataFrame(s) and `print()` its returned value. Do not just define functions without executing them and printing their results.\n"
-            prompt += "If the query is simple and the result is a single value or a small piece of information, compute it and `print()` it directly.\n"
-            prompt += "For example, if asked for 'total followers', your code should end with something like `print(total_followers)` or `print(df_result.to_string())`.\n"
-
-            prompt += "If a column contains lists (e.g., 'skills' in a hypothetical 'df_employees'), you might need to use methods like `.explode()` or `.apply(pd.Series)` or `.apply(lambda x: ...)` for analysis.\n"
-            prompt += "If the query is ambiguous or requires clarification, ask for it instead of making assumptions. If the query cannot be answered with the given data, state that clearly in a comment within the code block (e.g. `# Cannot answer: data not available`).\n"
-            prompt += "If the query is not about data analysis or code generation (e.g. 'hello', 'how are you?'), respond politely and briefly in a comment, do not attempt to generate code (e.g. `# Hello there! How can I help you with data analysis today?`).\n"
-            prompt += "Structure your code clearly. Add comments (#) to explain each step of your logic.\n"
-        else:
-            prompt += "Your task is to analyze the data and provide a comprehensive textual answer to the user query. You can explain your reasoning step-by-step.\n"
 
         prompt += "\n--- AVAILABLE DATA AND SCHEMAS ---\n"
-        prompt += self.schemas_representation
 
         rag_context = self.rag_system.retrieve_relevant_info(user_query)
-        if rag_context and "[RAG Context]" in rag_context and "No specific pre-defined context found" not in rag_context and "No highly relevant passages found" not in rag_context and "Embeddings not generated" not in rag_context:
-            prompt += f"\n--- ADDITIONAL CONTEXT (from internal knowledge base, consider this information) ---\n{rag_context}\n"
 
         prompt += f"\n--- USER QUERY ---\n{user_query}\n"
 
@@ -406,40 +704,163 @@ class EmployerBrandingAgent:
 
         if cot_hint:
             if self.pandas_llm.force_sandbox:
-                prompt += "\n--- INSTRUCTIONS FOR PYTHON CODE GENERATION (Chain of Thought & Output) ---\n"
-                prompt += "1. Understand the query: What specific information is requested?\n"
-                prompt += "2. Identify relevant DataFrame(s) and column(s) from the schemas provided.\n"
-                prompt += "3. Plan the steps: Outline the Pandas operations needed (filtering, grouping, aggregation, merging, etc.) as comments in your code.\n"
-                prompt += "4. Write the code: Implement the steps using Pandas. Remember to use `df_name_of_dataframe` (e.g. `df_follower_stats`).\n"
-                prompt += "5. CRITICAL - Ensure output: Call any functions you define and use `print()` for ALL results that should be displayed. For DataFrames, you can print the DataFrame directly (e.g., `print(my_result_df)`), or `print(df.to_string())` if it might be large. For single values, `print(my_value)`.\n"
-                prompt += "6. Review: Check for correctness, efficiency, and adherence to the prompt (especially the CRITICAL `print()` requirement for the final answer).\n"
-                prompt += "7. Generate ONLY the Python code block starting with ```python and ending with ```. No explanations outside the code block's comments.\n"
-            else:
-                prompt += "\n--- INSTRUCTIONS FOR RESPONSE (Chain of Thought) ---\n"
-                prompt += "Please provide a step-by-step explanation of your analysis before giving the final answer.\n"
 
         return prompt
 
-    async def process_query(self, user_query: str, role="Employer Branding Analyst", task_decomposition_hint=None, cot_hint=True) -> str:
-        self.chat_history.append({"role": "user", "content": user_query})
         full_prompt = self._build_prompt(user_query, role, task_decomposition_hint, cot_hint)
 
-        logging.info(f"Full prompt to LLM (last 300 chars of user query part for brevity in log): ... {full_prompt[-500:]}") # Log end of prompt
 
-        response_text = await self.pandas_llm.query(full_prompt, self.all_dataframes, history=self.chat_history[:-1])
-        self.chat_history.append({"role": "assistant", "content": response_text})
 
-        MAX_HISTORY_TURNS = 5
         if len(self.chat_history) > MAX_HISTORY_TURNS * 2:
             self.chat_history = self.chat_history[-(MAX_HISTORY_TURNS * 2):]
 
         return response_text
 
-    def update_dataframes(self, new_dataframes: dict):
-        self.all_dataframes = new_dataframes
         self.schemas_representation = get_all_schemas_representation(self.all_dataframes)
-        logging.info("EmployerBrandingAgent DataFrames updated.")
-    def clear_chat_history(self):
         self.chat_history = []
         logging.info("EmployerBrandingAgent chat history cleared.")
 
 
 # Attempt to import Google Generative AI and related types
 try:
+    from google import generativeai as genai # Renamed for clarity to avoid conflict
+    from google.generativeai import types as genai_types
+    # from google.generativeai import GenerationConfig # For direct use if needed
+    # from google.generativeai.types import HarmCategory, HarmBlockThreshold, SafetySetting # For direct use
+
 except ImportError:
     print("Google Generative AI library not found. Please install it: pip install google-generativeai")
     # Define dummy classes/functions if the import fails, to allow the rest of the script to be parsed
     class genai: # type: ignore
         @staticmethod
+        def configure(api_key):
+            print(f"Dummy genai.configure called with API key: {'SET' if api_key else 'NOT SET'}")
+
+        # Dummy Client and related structures
+        class Client:
+            def __init__(self, api_key=None):
+                self.api_key = api_key
+                self.models = self._Models()
+                print(f"Dummy genai.Client initialized {'with' if api_key else 'without'} API key.")
+
+            class _Models:
                 @staticmethod
+                async def generate_content_async(model=None, contents=None, generation_config=None, safety_settings=None, stream=False): # Matched real signature better
+                    print(f"Dummy genai.Client.models.generate_content_async called for model: {model} with config: {generation_config}, safety_settings: {safety_settings}, stream: {stream}")
                     class DummyPart:
                         def __init__(self, text): self.text = text
                     class DummyContent:
+                        def __init__(self): self.parts = [DummyPart("# Dummy response from dummy client's async generate_content")]
                     class DummyCandidate:
                         def __init__(self):
                             self.content = DummyContent()
+                            self.finish_reason = genai_types.FinishReason.STOP # Use dummy FinishReason
+                            self.safety_ratings = []
+                            self.token_count = 0 # Added
+                            self.index = 0 # Added
                     class DummyResponse:
                         def __init__(self):
                             self.candidates = [DummyCandidate()]
+                            self.prompt_feedback = self._PromptFeedback() # Use dummy PromptFeedback
+                            self.text = "# Dummy response text from dummy client's async generate_content" # for easier access
+                        class _PromptFeedback: # Nested dummy class
+                            def __init__(self):
+                                self.block_reason = None
+                                self.safety_ratings = []
                     return DummyResponse()
 
+                def generate_content(self, model=None, contents=None, generation_config=None, safety_settings=None, stream=False): # Matched real signature better
+                    print(f"Dummy genai.Client.models.generate_content called for model: {model} with config: {generation_config}, safety_settings: {safety_settings}, stream: {stream}")
+                    # Re-using the async dummy structure for simplicity
+                    class DummyPart:
+                        def __init__(self, text): self.text = text
+                    class DummyContent:
+                        def __init__(self): self.parts = [DummyPart("# Dummy response from dummy client's generate_content")]
+                    class DummyCandidate:
+                        def __init__(self):
+                            self.content = DummyContent()
+                            self.finish_reason = genai_types.FinishReason.STOP # Use dummy FinishReason
+                            self.safety_ratings = []
+                            self.token_count = 0
+                            self.index = 0
+                    class DummyResponse:
+                        def __init__(self):
+                            self.candidates = [DummyCandidate()]
+                            self.prompt_feedback = self._PromptFeedback() # Use dummy PromptFeedback
+                            self.text = "# Dummy response text from dummy client's generate_content"
+                        class _PromptFeedback:
+                            def __init__(self):
+                                self.block_reason = None
+                                self.safety_ratings = []
+                    return DummyResponse()
 
         @staticmethod
+        def GenerativeModel(model_name, generation_config=None, safety_settings=None, system_instruction=None): # Matched real signature
+            print(f"Dummy genai.GenerativeModel called for model: {model_name} with config: {generation_config}, safety: {safety_settings}, system_instruction: {system_instruction}")
+            class DummyGenerativeModel:
+                def __init__(self, model_name_in, generation_config_in, safety_settings_in, system_instruction_in):
+                    self.model_name = model_name_in
+                    self.generation_config = generation_config_in
+                    self.safety_settings = safety_settings_in
+                    self.system_instruction = system_instruction_in
+                async def generate_content_async(self, contents, stream=False): # Matched real signature
+                    print(f"Dummy GenerativeModel.generate_content_async called for {self.model_name}")
+                    # Simplified response, similar to Client's dummy
+                    model_name = self.model_name # captured for the nested classes below, where `self` is not the model
+                    class DummyPart:
+                        def __init__(self, text): self.text = text
+                    class DummyContent:
+                        def __init__(self): self.parts = [DummyPart(f"# Dummy response from dummy GenerativeModel ({model_name})")]
+                    class DummyCandidate:
+                        def __init__(self):
+                            self.content = DummyContent()
+                            self.finish_reason = genai_types.FinishReason.STOP
+                            self.safety_ratings = []
+                    class DummyResponse:
+                        def __init__(self):
+                            self.candidates = [DummyCandidate()]
+                            self.prompt_feedback = None
+                            self.text = f"# Dummy response text from dummy GenerativeModel ({model_name})"
+                    return DummyResponse()
+
+                def generate_content(self, contents, stream=False): # Matched real signature
+                    print(f"Dummy GenerativeModel.generate_content called for {self.model_name}")
+                    # Simplified response, similar to Client's dummy
+                    model_name = self.model_name # captured for the nested classes below
+                    class DummyPart:
+                        def __init__(self, text): self.text = text
+                    class DummyContent:
+                        def __init__(self): self.parts = [DummyPart(f"# Dummy response from dummy GenerativeModel ({model_name})")]
+                    class DummyCandidate:
+                        def __init__(self):
+                            self.content = DummyContent()
+                            self.finish_reason = genai_types.FinishReason.STOP
+                            self.safety_ratings = []
+                    class DummyResponse:
+                        def __init__(self):
+                            self.candidates = [DummyCandidate()]
+                            self.prompt_feedback = None
+                            self.text = f"# Dummy response text from dummy GenerativeModel ({model_name})"
+                    return DummyResponse()
+
+            return DummyGenerativeModel(model_name, generation_config, safety_settings, system_instruction)
+
         @staticmethod
+        def embed_content(model, content, task_type, title=None):
+            print(f"Dummy genai.embed_content called for model: {model}, task_type: {task_type}, title: {title}")
+            # Ensure the dummy embedding matches typical dimensions (e.g., 768 for many models)
+            return {"embedding": [0.1] * 768}
 
  class genai_types: # type: ignore
+        # Using dicts for dummy GenerationConfig and SafetySetting for simplicity
         @staticmethod
+        def GenerationConfig(**kwargs): # The dummy now just returns the kwargs as a dict
+            print(f"Dummy genai_types.GenerationConfig created with: {kwargs}")
+            return dict(kwargs)
+
         @staticmethod
         def SafetySetting(category, threshold):
             print(f"Dummy SafetySetting created: category={category}, threshold={threshold}")
             return {"category": category, "threshold": threshold} # Return a dict for dummy
 
+        # Dummy Enums (can be simple string attributes)
+        class HarmCategory:
             HARM_CATEGORY_UNSPECIFIED = "HARM_CATEGORY_UNSPECIFIED"
             HARM_CATEGORY_HARASSMENT = "HARM_CATEGORY_HARASSMENT"
             HARM_CATEGORY_HATE_SPEECH = "HARM_CATEGORY_HATE_SPEECH"
             HARM_CATEGORY_SEXUALLY_EXPLICIT = "HARM_CATEGORY_SEXUALLY_EXPLICIT"
             HARM_CATEGORY_DANGEROUS_CONTENT = "HARM_CATEGORY_DANGEROUS_CONTENT"
+
         class HarmBlockThreshold:
             BLOCK_NONE = "BLOCK_NONE"
             BLOCK_LOW_AND_ABOVE = "BLOCK_LOW_AND_ABOVE"
             BLOCK_MEDIUM_AND_ABOVE = "BLOCK_MEDIUM_AND_ABOVE"
             BLOCK_ONLY_HIGH = "BLOCK_ONLY_HIGH"
 
+        class FinishReason: # Added dummy FinishReason
+            FINISH_REASON_UNSPECIFIED = "FINISH_REASON_UNSPECIFIED"
+            STOP = "STOP"
+            MAX_TOKENS = "MAX_TOKENS"
+            SAFETY = "SAFETY"
+            RECITATION = "RECITATION"
+            OTHER = "OTHER"
+
+        # Placeholder for other types if needed by the script
+        # class BlockReason:
+        #     SAFETY = "SAFETY"
+
 
 # --- Configuration ---
+GEMINI_API_KEY = os.getenv('GEMINI_API_KEY', "")
+# Recommended: use a standard, publicly available model name.
+# LLM_MODEL_NAME = "gemini-2.0-flash" # Original
 LLM_MODEL_NAME = "gemini-2.0-flash"
 GEMINI_EMBEDDING_MODEL_NAME = "gemini-embedding-exp-03-07"
 
+# Base generation configuration for the LLM
 GENERATION_CONFIG_PARAMS = {
+    "temperature": 0.3, # Slightly increased for more varied insights, adjust as needed
     "top_p": 1.0,
     "top_k": 32,
+    "max_output_tokens": 8192, # Increased for potentially longer code with comments and insights
+    # "candidate_count": 1, # Default is 1, explicitly setting it
 }
 
 # Default safety settings list for Gemini
 try:
+    DEFAULT_SAFETY_SETTINGS = [
         genai_types.SafetySetting(
             category=genai_types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
+            threshold=genai_types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE, # Adjusted slightly
         ),
         genai_types.SafetySetting(
             category=genai_types.HarmCategory.HARM_CATEGORY_HARASSMENT,
+            threshold=genai_types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE, # Adjusted slightly
         ),
         genai_types.SafetySetting(
             category=genai_types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
+            threshold=genai_types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE, # Adjusted slightly
         ),
         genai_types.SafetySetting(
             category=genai_types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
+            threshold=genai_types.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE, # Adjusted slightly
         ),
     ]
+except AttributeError as e:
     logging.warning(f"Could not define DEFAULT_SAFETY_SETTINGS using real genai_types: {e}. Using placeholder list of dicts.")
     DEFAULT_SAFETY_SETTINGS = [
+        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+        {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+        {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
     ]
 
 
 # Logging setup
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(module)s - %(filename)s:%(lineno)d - %(message)s')
 
 if GEMINI_API_KEY:
     try:
         genai.configure(api_key=GEMINI_API_KEY)
+        logging.info("Gemini API key configured globally.")
     except Exception as e:
         logging.error(f"Failed to configure Gemini API globally: {e}", exc_info=True)
 else:
+    logging.warning("GEMINI_API_KEY environment variable not set. Agent will use dummy responses if real genai library is not fully mocked or if API calls fail.")
 
 
+# --- RAG Documents Definition (Example) ---
 rag_documents_data = {
+    'Title': [
+        "Employer Branding Best Practices 2024",
+        "Attracting Tech Talent in Competitive Markets",
+        "The Power of Employee Advocacy",
+        "Understanding Gen Z Workforce Expectations"
+    ],
+    'Text': [
+        "Focus on authentic employee stories and showcase company culture. Highlight diversity and inclusion initiatives. Use video content for higher engagement. Clearly articulate your Employee Value Proposition (EVP).",
+        "Tech candidates value challenging projects, continuous learning opportunities, and a flexible work environment. Competitive compensation and modern tech stacks are crucial. Highlight your company's innovation and impact.",
+        "Encourage employees to share their positive experiences on social media. Provide them with shareable content and guidelines. Employee-generated content is often perceived as more trustworthy than corporate messaging.",
+        "Gen Z values purpose-driven work, transparency, mental health support, and opportunities for growth. They are digital natives and expect seamless online application processes. They also care deeply about social responsibility."
+    ]
+}
 df_rag_documents = pd.DataFrame(rag_documents_data)
 
  # --- Schema Representation ---
 def get_schema_representation(df_name: str, df: pd.DataFrame) -> str:
+    if not isinstance(df, pd.DataFrame):
+        return f"Schema for item '{df_name}': Not a DataFrame.\n"
+    if df.empty:
+        return f"Schema for DataFrame 'df_{df_name}': Empty (no columns or rows).\n"
+
+    schema_str = f"DataFrame 'df_{df_name}':\n"
+    schema_str += f"  Columns: {df.columns.tolist()}\n"
+    schema_str += f"  Shape: {df.shape}\n"
+    # Add dtypes for more clarity
+    # schema_str += "  Data Types:\n"
+    # for col in df.columns:
+    #     schema_str += f"    {col}: {df[col].dtype}\n"
+
+    # Sample data (first 2 rows)
+    if not df.empty:
+        sample_str = df.head(2).to_string()
+        # Indent sample string for better readability in the prompt
+        indented_sample = "\n".join(["    " + line for line in sample_str.splitlines()])
+        schema_str += f"  Sample Data (first 2 rows):\n{indented_sample}\n"
+    else:
+        schema_str += "  Sample Data: DataFrame is empty.\n"
+    return schema_str
+
 def get_all_schemas_representation(dataframes_dict: dict) -> str:
+    if not dataframes_dict:
+        return "No DataFrames provided.\n"
+    return "".join(get_schema_representation(name, df) for name, df in dataframes_dict.items())
 
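For reference, this is roughly what the new schema helper emits for a small DataFrame (a minimal sketch; `df_demo` is a hypothetical example, and the exact spacing of the sample rows comes from pandas' to_string):

    import pandas as pd

    df_demo = pd.DataFrame({"country": ["DE", "US"], "followers": [120, 340]})
    print(get_schema_representation("demo", df_demo))
    # DataFrame 'df_demo':
    #   Columns: ['country', 'followers']
    #   Shape: (2, 2)
    #   Sample Data (first 2 rows):
    #       country  followers
    #     0      DE        120
    #     1      US        340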
 # --- Advanced RAG System ---
+class AdvancedRAGSystem:
     def __init__(self, documents_df: pd.DataFrame, embedding_model_name: str):
         self.embedding_model_name = embedding_model_name
         self.documents_df = documents_df.copy()
+        self.embeddings_generated = False
+        self.client_available = hasattr(genai, 'embed_content') and not (hasattr(genai.embed_content, '__func__') and genai.embed_content.__func__.__qualname__.startswith('genai.embed_content')) # Check if it's not the dummy
+
+        if GEMINI_API_KEY and self.client_available:
             try:
+                self._precompute_embeddings()
                 self.embeddings_generated = True
+                logging.info(f"RAG embeddings precomputed using '{self.embedding_model_name}'.")
+            except Exception as e:
+                logging.error(f"RAG precomputation error: {e}", exc_info=True)
         else:
+            logging.warning(f"RAG embeddings not precomputed. GEMINI_API_KEY set: {bool(GEMINI_API_KEY)}, genai.embed_content available: {self.client_available}.")
 
+    def _embed_fn(self, title: str, text: str) -> list[float]:
+        if not self.embeddings_generated: # Should rely on self.client_available too
+            # logging.debug(f"Skipping embedding for '{title}' as embeddings are not active.")
+            return [0.0] * 768 # Default dimension, adjust if your model differs
         try:
+            # logging.debug(f"Embedding '{title}' with model '{self.embedding_model_name}'")
+            # Ensure content is not empty
+            content_to_embed = text if text else title
+            if not content_to_embed:
+                logging.warning(f"Cannot embed '{title}' because both title and text are empty.")
+                return [0.0] * 768
+
+            embedding_result = genai.embed_content(
+                model=self.embedding_model_name,
+                content=content_to_embed,
+                task_type="retrieval_document",
+                title=title if title else None # Pass title only if it exists
+            )
+            return embedding_result["embedding"]
         except Exception as e:
+            logging.error(f"Error in _embed_fn for '{title}': {e}", exc_info=True)
             return [0.0] * 768
+
+    def _precompute_embeddings(self):
         if 'Embeddings' not in self.documents_df.columns:
             self.documents_df['Embeddings'] = pd.Series(dtype='object')
+
+        # Ensure there's text to embed
+        mask = (self.documents_df['Text'].notna() & (self.documents_df['Text'] != '')) | \
+               (self.documents_df['Title'].notna() & (self.documents_df['Title'] != ''))
+
+        if not mask.any():
+            logging.warning("No content found in 'Text' or 'Title' columns to generate embeddings.")
+            return
+
+        self.documents_df.loc[mask, 'Embeddings'] = self.documents_df[mask].apply(
+            lambda row: self._embed_fn(row.get('Title', ''), row.get('Text', '')), axis=1
+        )
+        logging.info(f"Applied embedding function to {mask.sum()} rows.")
+
+    def retrieve_relevant_info(self, query_text: str, top_k: int = 2) -> str: # Increased top_k for more context
+        if not self.client_available:
+            return "\n[RAG Context]\nEmbedding client not available. Cannot retrieve RAG context.\n"
+        if not self.embeddings_generated or 'Embeddings' not in self.documents_df.columns or self.documents_df['Embeddings'].isnull().all():
+            return "\n[RAG Context]\nEmbeddings not generated or all are null. No RAG context available.\n"
+
         try:
+            query_embedding_response = genai.embed_content(
+                model=self.embedding_model_name,
+                content=query_text,
+                task_type="retrieval_query"
+            )
+            query_embedding = np.array(query_embedding_response["embedding"])
+
             valid_embeddings_df = self.documents_df.dropna(subset=['Embeddings'])
+            valid_embeddings_df = valid_embeddings_df[valid_embeddings_df['Embeddings'].apply(lambda x: isinstance(x, (list, np.ndarray)) and len(x) > 0)]
+
+            if valid_embeddings_df.empty:
+                return "\n[RAG Context]\nNo valid document embeddings for RAG.\n"
 
             document_embeddings = np.stack(valid_embeddings_df['Embeddings'].apply(np.array).values)
+
+            if query_embedding.shape[0] != document_embeddings.shape[1]:
+                logging.error(f"Embedding dimension mismatch. Query: {query_embedding.shape[0]}, Docs: {document_embeddings.shape[1]}")
+                return "\n[RAG Context]\nEmbedding dimension mismatch. Cannot calculate similarity.\n"
 
             dot_products = np.dot(document_embeddings, query_embedding)
+            # Get the indices of the top_k largest dot products.
+            # If there are fewer valid documents than top_k, take all of them.
+            num_to_retrieve = min(top_k, len(valid_embeddings_df))
+            if num_to_retrieve == 0: # Should be caught by valid_embeddings_df.empty earlier
+                return "\n[RAG Context]\nNo relevant passages found (num_to_retrieve is 0).\n"
+
+            # Ensure indices are within bounds
+            idx = np.argsort(dot_products)[-num_to_retrieve:][::-1] # Top N, descending order
+
+            relevant_passages = ""
+            for i in idx:
+                if i < len(valid_embeddings_df): # Defensive check
+                    doc = valid_embeddings_df.iloc[i]
+                    relevant_passages += f"\n[RAG Context from: '{doc['Title']}']\n{doc['Text']}\n"
+                else:
+                    logging.warning(f"Index {i} out of bounds for valid_embeddings_df (len {len(valid_embeddings_df)})")
+
+            return relevant_passages if relevant_passages else "\n[RAG Context]\nNo relevant passages found after similarity search.\n"
         except Exception as e:
+            logging.error(f"Error in RAG retrieve_relevant_info: {e}", exc_info=True)
+            return f"\n[RAG Context]\nError during RAG retrieval: {type(e).__name__} - {e}\n"
 
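One caveat worth keeping in mind: ranking by raw dot product weights documents with larger embedding norms more heavily. If the embeddings are not already unit-length, normalizing both sides turns the same score into cosine similarity. A minimal sketch (the helper name is hypothetical, not part of this commit):

    import numpy as np

    def rank_by_cosine(query_embedding: np.ndarray, document_embeddings: np.ndarray, top_k: int) -> np.ndarray:
        # Normalize each document row and the query so the dot product equals cosine similarity;
        # the epsilon guards against division by zero for all-zero placeholder embeddings.
        doc_norms = document_embeddings / (np.linalg.norm(document_embeddings, axis=1, keepdims=True) + 1e-10)
        query_norm = query_embedding / (np.linalg.norm(query_embedding) + 1e-10)
        scores = doc_norms @ query_norm
        return np.argsort(scores)[-top_k:][::-1]  # indices of the top_k best matches, best first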
 # --- PandasLLM Class (Gemini-Powered) ---
 class PandasLLM:
+    def __init__(self, llm_model_name: str,
+                 generation_config_dict: dict,
+                 safety_settings_list: list,
                  data_privacy=True, force_sandbox=True):
         self.llm_model_name = llm_model_name
+        self.generation_config_dict = generation_config_dict
+        self.safety_settings_list = safety_settings_list
         self.data_privacy = data_privacy
         self.force_sandbox = force_sandbox
+        self.generative_model = None # Will hold the GenerativeModel instance
 
         if not GEMINI_API_KEY:
+            logging.warning("PandasLLM: GEMINI_API_KEY not set. Using dummy model if real 'genai' is not fully mocked.")
+            # Even without an API key we might be running against the dummy genai,
+            # so initialize the dummy model if genai.GenerativeModel is the dummy one.
+            if hasattr(genai, 'GenerativeModel') and hasattr(genai.GenerativeModel, '__func__') and genai.GenerativeModel.__func__.__qualname__.startswith('genai.GenerativeModel'): # Heuristic for dummy
+                self.generative_model = genai.GenerativeModel(
+                    model_name=self.llm_model_name,
+                    generation_config=genai_types.GenerationConfig(**self.generation_config_dict) if self.generation_config_dict else None,
+                    safety_settings=self.safety_settings_list
+                )
+                logging.info(f"PandasLLM: Initialized with DUMMY genai.GenerativeModel for '{self.llm_model_name}'.")
+
+        else: # GEMINI_API_KEY is set
             try:
+                # Use genai_types.GenerationConfig for the real API
+                config_for_model = genai_types.GenerationConfig(**self.generation_config_dict) if self.generation_config_dict else None
+
+                self.generative_model = genai.GenerativeModel(
+                    model_name=self.llm_model_name, # The SDK handles the "models/" prefix
+                    generation_config=config_for_model,
+                    safety_settings=self.safety_settings_list
+                    # system_instruction can be added here if needed globally for this model
+                )
+                logging.info(f"PandasLLM: Initialized with REAL genai.GenerativeModel for '{self.llm_model_name}'.")
             except Exception as e:
+                logging.error(f"Failed to initialize PandasLLM with genai.GenerativeModel: {e}", exc_info=True)
+                # Fall back to the dummy if real initialization fails, to prevent crashes
+                if hasattr(genai, 'GenerativeModel') and hasattr(genai.GenerativeModel, '__func__') and genai.GenerativeModel.__func__.__qualname__.startswith('genai.GenerativeModel'):
+                    self.generative_model = genai.GenerativeModel(model_name=self.llm_model_name) # Basic dummy
+                    logging.warning("PandasLLM: Falling back to DUMMY genai.GenerativeModel due to real initialization error.")
 
+    async def _call_gemini_api_async(self, prompt_text: str, history: list = None) -> str:
+        if not self.generative_model:
+            logging.error("PandasLLM: GenerativeModel not available (or not initialized). Cannot call API.")
+            return "# Error: Gemini model not available for API call."
+
+        # The Gemini API expects chat history in a specific format: 'contents' is a
+        # list of Content objects (dicts) alternating between 'user' and 'model'
+        # roles, with the current user prompt as the final entry.
+
+        # Convert simple history to Gemini's expected format
+        gemini_history = []
         if history:
             for entry in history:
                 role = "model" if entry.get("role") == "assistant" else entry.get("role", "user")
+                gemini_history.append({"role": role, "parts": [{"text": entry.get("content", "")}]})
 
+        # Add the current prompt as the last user message
+        current_content = [{"role": "user", "parts": [{"text": prompt_text}]}]
+
+        # For a one-off generate_content call, 'contents' may be just the current
+        # turn; for chat-like behavior it must represent the whole conversation.
+        # The `history` parameter here is our internal tracking; the SDK also
+        # offers `start_chat` for managed history, which we deliberately avoid.
+        contents_for_api = gemini_history + current_content # This forms the conversation
+
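For illustration, with one prior exchange in the history the list built above takes this shape (the texts are placeholders):

    contents_for_api = [
        {"role": "user", "parts": [{"text": "How many followers do we have?"}]},
        {"role": "model", "parts": [{"text": "```python\nprint(df_follower_stats['followers'].sum())\n```"}]},
        {"role": "user", "parts": [{"text": "<the full prompt built for the current turn>"}]},
    ]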
+        logging.info(f"\n--- Calling Gemini API (model: {self.llm_model_name}) ---\nContent (last part): {contents_for_api[-1]['parts'][0]['text'][:200]}...\n")
 
         try:
+            # The GenerativeModel instance already carries the generation config and
+            # safety settings, so we only pass the 'contents'.
+            response = await self.generative_model.generate_content_async(
                 contents=contents_for_api,
             )
 
+            if hasattr(response, 'prompt_feedback') and response.prompt_feedback and \
+               hasattr(response.prompt_feedback, 'block_reason') and response.prompt_feedback.block_reason:
+                block_reason_val = response.prompt_feedback.block_reason
+                # Try to get the enum name if available
+                try:
+                    block_reason_str = genai_types.BlockedReason(block_reason_val).name
+                except Exception:
+                    block_reason_str = str(block_reason_val)
+                logging.warning(f"Prompt blocked by API. Reason: {block_reason_str}. Ratings: {response.prompt_feedback.safety_ratings}")
+                return f"# Error: Prompt blocked by API. Reason: {block_reason_str}."
 
             llm_output = ""
+            # Standard way to get text from a Gemini response
+            if hasattr(response, 'text') and isinstance(response.text, str):
                 llm_output = response.text
+            elif response.candidates:
                 candidate = response.candidates[0]
+                if candidate.content and candidate.content.parts:
                     llm_output = "".join(part.text for part in candidate.content.parts if hasattr(part, 'text'))
+
+                if not llm_output and candidate.finish_reason:
+                    finish_reason_val = candidate.finish_reason
+                    try:
+                        # For the real API, finish_reason is an enum member, so .name works;
+                        # for the dummy it may already be a string, so fall back to str().
+                        finish_reason_str = str(finish_reason_val)
+                        if hasattr(genai_types.FinishReason, '_enum_map_') and finish_reason_val in genai_types.FinishReason._enum_map_: # Check if it's a valid enum value
+                            finish_reason_str = genai_types.FinishReason(finish_reason_val).name
+                    except Exception as fre:
+                        logging.debug(f"Could not get FinishReason name: {fre}")
+                        finish_reason_str = str(finish_reason_val)
+
+                    # Check whether generation was blocked due to safety
+                    if finish_reason_str == "SAFETY": # or candidate.finish_reason == genai_types.FinishReason.SAFETY
+                        safety_messages = []
+                        if candidate.safety_ratings:
+                            for rating in candidate.safety_ratings:
+                                cat_name = rating.category.name if hasattr(rating.category, 'name') else str(rating.category)
+                                prob_name = rating.probability.name if hasattr(rating.probability, 'name') else str(rating.probability)
+                                safety_messages.append(f"Category: {cat_name}, Probability: {prob_name}")
+                        logging.warning(f"Content generation stopped due to safety. Finish reason: {finish_reason_str}. Details: {'; '.join(safety_messages)}")
+                        return f"# Error: Content generation stopped by API due to safety. Finish Reason: {finish_reason_str}. Details: {'; '.join(safety_messages)}"
+
+                    logging.warning(f"Empty response from LLM. Finish reason: {finish_reason_str}.")
+                    return f"# Error: LLM returned an empty response. Finish reason: {finish_reason_str}."
             else:
+                logging.error(f"Unexpected API response structure: {str(response)[:500]}")
                 return f"# Error: Unexpected API response structure: {str(response)[:200]}"
+
+            # logging.debug(f"LLM Raw Output:\n{llm_output}")
             return llm_output
+
+        except genai_types.BlockedPromptException as bpe: # Specific exception for blocked prompts
+            logging.error(f"Prompt was blocked by the API (BlockedPromptException): {bpe}", exc_info=True)
+            return f"# Error: Your prompt was blocked by the API. Please revise. Details: {bpe.prompt_feedback}"
+        except genai_types.StopCandidateException as sce: # Specific exception for stopped candidates
+            logging.error(f"Candidate generation stopped (StopCandidateException): {sce}", exc_info=True)
+            return f"# Error: Content generation was stopped. Details: {sce.candidate}"
         except Exception as e:
+            logging.error(f"Error calling Gemini API: {e}", exc_info=True)
             return f"# Error during API call: {type(e).__name__} - {str(e)[:100]}."
 
+
     async def query(self, prompt_with_query_and_context: str, dataframes_dict: dict, history: list = None) -> str:
         llm_response_text = await self._call_gemini_api_async(prompt_with_query_and_context, history)
+
         if self.force_sandbox:
             code_to_execute = ""
+            # Robust code extraction
             if "```python" in llm_response_text:
                 try:
+                    # Standard ```python\nCODE\n```
+                    code_block_match = llm_response_text.split("```python\n", 1)
+                    if len(code_block_match) > 1:
+                        code_to_execute = code_block_match[1].split("\n```", 1)[0]
+                    else: # Try without a newline after ```python
+                        code_block_match = llm_response_text.split("```python", 1)
+                        if len(code_block_match) > 1:
+                            code_to_execute = code_block_match[1].split("```", 1)[0]
+                            if code_to_execute.startswith("\n"): # Remove leading newline if present
+                                code_to_execute = code_to_execute[1:]
+
+                except IndexError:
+                    code_to_execute = "" # Should not happen with proper split logic
+
+            if llm_response_text.startswith("# Error:") or not code_to_execute.strip():
+                logging.warning(f"LLM response is an error, or no valid Python code block found. Raw LLM response: {llm_response_text}")
                 # If LLM returns an error or no code, pass that through directly.
+                # This also covers a polite non-code refusal (e.g. "# Hello there! ...").
+                if not code_to_execute.strip() and not llm_response_text.startswith("# Error:"):
+                    # The LLM may have responded with natural language instead of code.
+                    # In sandbox mode we expect code, but the prompt explicitly allows
+                    # comment-only replies, so comments are fine; substantial text
+                    # without code delimiters is a deviation worth logging.
+                    if "```" not in llm_response_text and len(llm_response_text.strip()) > 0: # Heuristic for non-code text
+                        logging.info(f"LLM produced text output instead of Python code in sandbox mode. Passing through: {llm_response_text}")
+                        # This may be desired if the LLM is explaining why it cannot generate code.
+                        return llm_response_text # Pass through the LLM's direct response
+                return llm_response_text # Pass through the LLM's error or comment-only response
+
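The split-based extraction above can also be written as a single regular expression; a minimal sketch under the same assumptions (one fenced block per response; the helper name is hypothetical):

    import re

    def extract_python_block(text: str) -> str:
        # Matches ```python ... ``` with or without a newline after the opening fence.
        match = re.search(r"```python\s*(.*?)```", text, re.DOTALL)
        return match.group(1).strip() if match else ""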
590
+ logging.info(f"\n--- Code to Execute (extracted from LLM response): ---\n{code_to_execute}\n----------------------\n")
591
+
592
  from io import StringIO
593
  import sys
594
+ old_stdout = sys.stdout
595
+ sys.stdout = captured_output = StringIO()
596
+
597
+ # Prepare globals for exec. Prefix DataFrames with 'df_' as per prompt.
598
+ exec_globals = {'pd': pd, 'np': np}
599
+ if dataframes_dict:
600
+ for name, df_instance in dataframes_dict.items():
601
+ if isinstance(df_instance, pd.DataFrame):
602
+ exec_globals[f"df_{name}"] = df_instance
603
+ else:
604
+ logging.warning(f"Item '{name}' in dataframes_dict is not a DataFrame. Skipping for exec_globals.")
605
+
             try:
+                # Execute with exec_globals as the single namespace: passing a separate
+                # empty locals dict would stop functions defined by the LLM's code from
+                # seeing top-level names assigned in that same code.
+                exec(code_to_execute, exec_globals)
                 final_output_str = captured_output.getvalue()
+
+                if not final_output_str.strip():
                     logging.info("Code executed successfully, but no explicit print() output was generated by the LLM's code.")
+                    # Check whether the code was only comments or an empty block
+                    if not any(line.strip() and not line.strip().startswith("#") for line in code_to_execute.splitlines()):
+                        return "# LLM generated only comments or an empty code block. No output produced."
+                    return "# Code executed successfully, but it did not produce any printed output. Please ensure the LLM's Python code includes print() statements for the desired results, insights, or answers."
                 return final_output_str
             except Exception as e:
+                logging.error(f"Sandbox Execution Error: {e}\nCode was:\n{code_to_execute}", exc_info=True)  # Log the full traceback for the sandbox error
+                # Comment out the offending code so it displays cleanly inside the error message
+                indented_code = textwrap.indent(code_to_execute, '# ', predicate=lambda line: True)
+                return f"# Sandbox Execution Error: {type(e).__name__}: {e}\n# --- Code that caused error: ---\n{indented_code}"
+            finally:
                 sys.stdout = old_stdout
+        else:  # Not force_sandbox: return the LLM's text directly
             return llm_response_text
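+    # Caveat: exec() with a curated globals dict is a convenience sandbox, not a
+    # security boundary. A slightly tighter (still unsafe against hostile code)
+    # variant would also pin down builtins before executing, e.g.:
+    #   exec_globals['__builtins__'] = {'print': print, 'len': len, 'range': range}
+    #   exec(code_to_execute, exec_globals)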
 
 # --- Employer Branding Agent ---
 class EmployerBrandingAgent:
+    def __init__(self, llm_model_name: str,
+                 generation_config_dict: dict,
+                 safety_settings_list: list,
+                 all_dataframes: dict,
+                 rag_documents_df: pd.DataFrame,
                  embedding_model_name: str,
                  data_privacy=True, force_sandbox=True):
 
         self.pandas_llm = PandasLLM(
+            llm_model_name,
+            generation_config_dict,
+            safety_settings_list,
+            data_privacy,
             force_sandbox
         )
         self.rag_system = AdvancedRAGSystem(rag_documents_df, embedding_model_name)
+        self.all_dataframes = all_dataframes if all_dataframes else {}
         self.schemas_representation = get_all_schemas_representation(self.all_dataframes)
+        self.chat_history = []
+        logging.info("EmployerBrandingAgent Initialized.")
 
+    def _build_prompt(self, user_query: str, role="Employer Branding Analyst & Strategist", task_decomposition_hint=None, cot_hint=True) -> str:
+        # The opening instruction acts as the system prompt: it sets the persona and
+        # overall goal (it could also be passed to the model as a system instruction
+        # if the API supports that well).
+        prompt = f"You are a highly skilled '{role}'. Your primary goal is to provide actionable employer branding insights and strategic recommendations by analyzing provided data (Pandas DataFrames) and contextual information (RAG documents).\n"
         prompt += "You will be provided with schemas for available Pandas DataFrames and a user query.\n"
 
         if self.pandas_llm.data_privacy:
+            prompt += "IMPORTANT: Adhere to data privacy. Do not output raw Personally Identifiable Information (PII) like individual names or specific user contact details. Summarize, aggregate, or anonymize data in your insights.\n"
 
         if self.pandas_llm.force_sandbox:
+            prompt += "\n--- TASK: PYTHON CODE GENERATION FOR INSIGHTS ---\n"
+            prompt += "Your main task is to GENERATE PYTHON CODE. This code should use the Pandas library to analyze the provided DataFrames and incorporate insights from any RAG context. The code's `print()` statements MUST output the final textual insights, analyses, or answers to the user's query.\n"
+            prompt += "Output ONLY the Python code block, starting with ```python and ending with ```.\n"
             prompt += "The available DataFrames are already loaded and can be accessed by their dictionary keys prefixed with 'df_' (e.g., df_follower_stats, df_posts) within the execution environment.\n"
             prompt += "Example of accessing a DataFrame: `df_follower_stats['country']`.\n"
+
+            prompt += "\n--- CRITICAL INSTRUCTIONS FOR PYTHON CODE OUTPUT ---\n"
+            prompt += "1. **Print Insights, Not Just Data:** Your Python code's `print()` statements are the agent's final response. These prints MUST articulate clear, actionable insights or answers. Do NOT just print raw DataFrames or intermediate variables unless the query *specifically* asks for a table of data (e.g., 'Show me the first 5 posts').\n"
+            prompt += "   Example of a good insight print: `print(f'Key Insight: Content related to {top_theme} receives {avg_engagement_increase}% higher engagement, suggesting a focus on this area.')`\n"
+            prompt += "   Example of what to AVOID for insight queries: `print(df_analysis_result)` (unless df_analysis_result is the specific table requested).\n"
+            prompt += "2. **Synthesize with RAG Context:** If RAG context is provided, weave takeaways from it into your printed insights. Example: `print(f'Data shows X. Combined with RAG best practice Y, we recommend Z.')`\n"
+            prompt += "3. **Structure and Comments:** Write clean, commented Python code. Explain your logic for each step.\n"
+            prompt += "4. **Handle Ambiguity/Errors in Code:**\n"
+            prompt += "   - If the query is ambiguous, `print()` a clarifying question as a string. Do not generate analysis code.\n"
+            prompt += "   - If the query cannot be answered with the given data/schemas, `print()` a statement explaining this. Example: `print('Insight: Cannot determine X as the required data Y is not available in the provided DataFrames.')`\n"
+            prompt += "   - For non-analytical queries (e.g., 'hello'), respond politely with a `print()` statement. Example: `print('Hello! How can I assist with your employer branding data analysis today?')`\n"
+            prompt += "5. **Function Usage:** If you define functions, ENSURE they are called and their results (or insights derived from them) are `print()`ed.\n"
+            prompt += "6. **DataFrame Naming:** Remember to use the `df_` prefix for DataFrame names in your code (e.g., `df_your_data`).\n"
+
+        else:  # Not force_sandbox: the LLM provides a direct textual answer
+            prompt += "\n--- TASK: DIRECT TEXTUAL INSIGHT GENERATION ---\n"
+            prompt += "Your task is to analyze the data (described by schemas) and RAG context, then provide a comprehensive textual answer with actionable insights and strategic recommendations. Explain your reasoning step-by-step.\n"
 
         prompt += "\n--- AVAILABLE DATA AND SCHEMAS ---\n"
+        if self.schemas_representation.strip() == "No DataFrames provided.":
+            prompt += "No specific DataFrames are currently loaded. Please rely on general knowledge and any provided RAG context for your response, or ask for data to be loaded.\n"
+        else:
+            prompt += self.schemas_representation
 
         rag_context = self.rag_system.retrieve_relevant_info(user_query)
+        # Only append the RAG context if it is meaningful; these substrings indicate
+        # an empty or failed retrieval rather than real content.
+        rag_error_indicators = ["Error", "No valid", "No relevant", "Cannot retrieve", "not available", "not generated"]
+        is_meaningful_rag = bool(rag_context.strip()) and not any(indicator in rag_context for indicator in rag_error_indicators)
+
+        if is_meaningful_rag:
+            prompt += f"\n--- ADDITIONAL CONTEXT (from Employer Branding Knowledge Base - consider this for your insights) ---\n{rag_context}\n"
+        else:
+            prompt += "\n--- ADDITIONAL CONTEXT (from Employer Branding Knowledge Base) ---\nNo specific pre-defined context found highly relevant to this query, or RAG system encountered an issue. Rely on general knowledge and DataFrame analysis.\n"
+
 
         prompt += f"\n--- USER QUERY ---\n{user_query}\n"
 
+        # Append the optional task-decomposition hint when provided.
+        if task_decomposition_hint:
+            prompt += f"\n--- GUIDANCE FOR ANALYSIS ---\n{task_decomposition_hint}\n"
 
         if cot_hint:
             if self.pandas_llm.force_sandbox:
+                prompt += "\n--- THOUGHT PROCESS FOR PYTHON CODE GENERATION (Follow these steps) ---\n"
+                prompt += "1. **Understand Query & Goal:** What specific employer branding insight or answer is the user seeking?\n"
+                prompt += "2. **Identify Data Sources:** Which DataFrame(s) and column(s) are relevant? Is there RAG context to incorporate?\n"
+                prompt += "3. **Plan Analysis (Mental Outline / Code Comments):**\n"
+                prompt += "   a. What calculations, aggregations, or transformations are needed?\n"
+                prompt += "   b. How will RAG context be integrated into the final printed insight?\n"
+                prompt += "   c. What is the exact textual insight/answer to be `print()`ed?\n"
+                prompt += "4. **Write Python Code:** Implement the plan. Use `df_name_of_dataframe`.\n"
+                prompt += "5. **CRITICAL - Formulate and `print()` Insights:** Construct the final textual insight(s) as strings and use `print()` statements for them. These prints are the agent's entire response. Ensure they are clear, actionable, and directly address the user's query, incorporating RAG if applicable.\n"
+                prompt += "6. **Review Code:** Check for correctness, clarity, and adherence to ALL instructions, especially the `print()` requirements for insightful text.\n"
+                prompt += "7. **Final Output:** Ensure ONLY the Python code block (```python...```) is generated.\n"
+            else:  # Not force_sandbox
+                prompt += "\n--- THOUGHT PROCESS FOR DIRECT TEXTUAL RESPONSE (Follow these steps) ---\n"
+                prompt += "1. **Understand Query & Goal:** What specific employer branding insight or answer is the user seeking?\n"
+                prompt += "2. **Identify Data Sources:** Analyze the DataFrame schemas. Consider relevant RAG context.\n"
+                prompt += "3. **Formulate Insights:** Synthesize information from data and RAG to derive key insights and recommendations.\n"
+                prompt += "4. **Structure Response:** Provide a step-by-step explanation of your analysis, followed by the clear, actionable insights and strategic advice.\n"
 
         return prompt
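+    # For reference, the assembled prompt reads in this order: persona/goal,
+    # privacy note, task instructions, DataFrame schemas, RAG context, user
+    # query, optional task-decomposition hint, and chain-of-thought steps.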
 
+    async def process_query(self, user_query: str, role="Employer Branding Analyst & Strategist", task_decomposition_hint=None, cot_hint=True) -> str:
+        # Add the user query to history before building the prompt so RAG can use the
+        # latest query, but give the LLM call only the history *excluding* this turn.
+        current_turn_history_for_llm = self.chat_history[:]  # History *before* this turn
+
+        self.chat_history.append({"role": "user", "parts": [{"text": user_query}]})  # Gemini expects 'parts'
+
         full_prompt = self._build_prompt(user_query, role, task_decomposition_hint, cot_hint)
 
+        # Log only part of the prompt to keep logs readable; uncomment for more detail:
+        # logging.info(f"Full prompt to LLM (first/last 300 chars):\n{full_prompt[:300]}...\n...\n{full_prompt[-300:]}")
+        logging.info(f"Built prompt for user query: {user_query[:100]}...")
+
+        # Pass the history *before* the current user query to the LLM
+        response_text = await self.pandas_llm.query(full_prompt, self.all_dataframes, history=current_turn_history_for_llm)
+
+        self.chat_history.append({"role": "model", "parts": [{"text": response_text}]})  # Gemini expects 'parts'
 
+        MAX_HISTORY_TURNS = 5  # Each turn holds one user and one model message
         if len(self.chat_history) > MAX_HISTORY_TURNS * 2:
+            # Keep the most recent turns; history is [user1, model1, user2, model2, ...]
             self.chat_history = self.chat_history[-(MAX_HISTORY_TURNS * 2):]
+            logging.info(f"Chat history truncated to last {MAX_HISTORY_TURNS} turns.")
 
         return response_text
 
+    def update_dataframes(self, new_dataframes: dict):
+        self.all_dataframes = new_dataframes if new_dataframes else {}
         self.schemas_representation = get_all_schemas_representation(self.all_dataframes)
+        logging.info(f"EmployerBrandingAgent DataFrames updated. New schemas: {self.schemas_representation[:200]}...")
+        # RAG embeddings are currently independent of these DataFrames; if that ever
+        # changes, they would need to be cleared or recomputed here.
+
+    def clear_chat_history(self):
         self.chat_history = []
         logging.info("EmployerBrandingAgent chat history cleared.")
 
+# --- Example Usage (conceptual, for testing the module structure) ---
+async def main_test():
+    logging.info("Starting main_test for EmployerBrandingAgent...")
+
+    # Dummy DataFrames for testing
+    followers_data = {
+        'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-01', '2023-01-03']),
+        'country': ['USA', 'USA', 'Canada', 'UK'],
+        'new_followers': [10, 12, 5, 8]
+    }
+    df_follower_stats = pd.DataFrame(followers_data)
+
+    posts_data = {
+        'post_id': [1, 2, 3, 4],
+        'post_date': pd.to_datetime(['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-03']),
+        'theme': ['Culture', 'Tech', 'Culture', 'Jobs'],
+        'impressions': [1000, 1500, 1200, 2000],
+        'engagements': [50, 90, 60, 120]
+    }
+    df_posts = pd.DataFrame(posts_data)
+    df_posts['engagement_rate'] = df_posts['engagements'] / df_posts['impressions']
+
+    test_dataframes = {
+        "follower_stats": df_follower_stats,
+        "posts": df_posts,
+        "empty_df": pd.DataFrame(),  # Tests the empty-DataFrame schema representation
+        "non_df_item": "This is not a dataframe"  # Tests handling of non-DataFrame items
+    }
+
+    # Initialize the agent.
+    # Ensure GEMINI_API_KEY is set in your environment for real calls.
+    if not GEMINI_API_KEY:
+        logging.warning("GEMINI_API_KEY not found in environment. Testing with dummy/mocked functionality.")
+
+    agent = EmployerBrandingAgent(
+        llm_model_name=LLM_MODEL_NAME,
+        generation_config_dict=GENERATION_CONFIG_PARAMS,
+        safety_settings_list=DEFAULT_SAFETY_SETTINGS,
+        all_dataframes=test_dataframes,
+        rag_documents_df=df_rag_documents,  # Using the example RAG data
+        embedding_model_name=GEMINI_EMBEDDING_MODEL_NAME,
+        force_sandbox=True  # True tests code generation; False tests direct LLM text
+    )
+
+    logging.info(f"Schema representation:\n{agent.schemas_representation}")
+
+    queries = [
+        "What are the key trends in follower growth by country based on the first few days of January 2023?",
+        "Which post theme has the highest average engagement rate? Provide an insight.",
+        "Hello there!",
+        "Can you tell me the average salary for software engineers? (This should state data is not available)",
+        "Summarize the best practices for attracting tech talent and combine it with an analysis of our top performing post themes."
+    ]
+
+    for query in queries:
+        logging.info(f"\n\n--- Processing Query: {query} ---")
+        response = await agent.process_query(user_query=query)
+        logging.info(f"--- Agent Response for '{query}': ---\n{response}\n---------------------------\n")
+        # Small delay between real API calls to avoid rate limits during testing
+        if GEMINI_API_KEY: await asyncio.sleep(1)
+
+    # Test updating dataframes
+    new_posts_data = {
+        'post_id': [5, 6], 'post_date': pd.to_datetime(['2023-01-04', '2023-01-05']),
+        'theme': ['Innovation', 'Team'], 'impressions': [2500, 1800], 'engagements': [150, 100]
+    }
+    df_new_posts = pd.DataFrame(new_posts_data)
+    df_new_posts['engagement_rate'] = df_new_posts['engagements'] / df_new_posts['impressions']
+
+    updated_dataframes = {
+        "follower_stats": df_follower_stats,  # unchanged
+        "posts": pd.concat([df_posts, df_new_posts]),  # updated
+        "company_values": pd.DataFrame({'value': ['Innovation', 'Collaboration'], 'description': ['...', '...']})  # new df
+    }
+    agent.update_dataframes(updated_dataframes)
+    logging.info("\n--- Processing Query after DataFrame Update ---")
+    response_after_update = await agent.process_query("What's the latest top performing post theme now?")
+    logging.info(f"--- Agent Response for 'What's the latest top performing post theme now?': ---\n{response_after_update}\n---------------------------\n")
+
 
+if __name__ == "__main__":
+    # This allows running the test when the script is executed directly.
+    # For real API calls, ensure GEMINI_API_KEY is set in your environment,
+    # e.g.: export GEMINI_API_KEY="your_api_key_here"
+    #
+    # To run the async main_test:
+    #   asyncio.run(main_test())
+    # Or, in a Jupyter-style environment that already has its own event loop:
+    #   await main_test()
+
+    # For simplicity in a standard Python script:
+    if GEMINI_API_KEY:  # Only run the full async test if an API key is likely present
+        try:
+            asyncio.run(main_test())
+        except RuntimeError as e:
+            if "asyncio.run() cannot be called from a running event loop" in str(e):
+                print("Skipping asyncio.run(main_test()) as it seems to be inside an existing event loop (e.g., Jupyter). Call 'await main_test()' instead if appropriate.")
+            else:
+                raise
+    else:
+        print("GEMINI_API_KEY not set. Skipping main_test(), which might make real API calls. The module can still be imported and used elsewhere.")