GuglielmoTor committed on
Commit e03d275 · verified · 1 Parent(s): c679528

Create eb_agent_module.py

Files changed (1)
  1. eb_agent_module.py +545 -0
eb_agent_module.py ADDED
@@ -0,0 +1,545 @@
+ # eb_agent_module.py
+ import pandas as pd
+ import json
+ import os
+ import asyncio
+ import logging
+ import numpy as np
+ import textwrap
+
+ # Attempt to import Google Generative AI and related types.
+ # Note: the module uses the google-generativeai SDK API (genai.configure,
+ # genai.GenerativeModel, genai.embed_content), so import it accordingly.
+ try:
+     import google.generativeai as genai
+     from google.generativeai import types as genai_types
+     # from google.api_core import retry_async  # For async retries if needed
+ except ImportError:
+     print("Google Generative AI library not found. Please install it: pip install google-generativeai")
+     # Define dummy classes/functions if the import fails, so the rest of the module still parses
+     class genai:  # type: ignore
+         @staticmethod
+         def configure(api_key): pass
+         @staticmethod
+         def GenerativeModel(model_name, **kwargs): return None  # type: ignore
+         @staticmethod
+         def embed_content(model, content, task_type, title=None): return {"embedding": [0.1] * 768}  # type: ignore
+
+     class genai_types:  # type: ignore
+         @staticmethod
+         def GenerationConfig(**kwargs): return None  # type: ignore
+         class BlockReason:  # type: ignore
+             SAFETY = "SAFETY"
+         class HarmCategory:  # type: ignore
+             HARM_CATEGORY_UNSPECIFIED = "HARM_CATEGORY_UNSPECIFIED"
+             HARM_CATEGORY_HARASSMENT = "HARM_CATEGORY_HARASSMENT"
+             HARM_CATEGORY_HATE_SPEECH = "HARM_CATEGORY_HATE_SPEECH"
+             HARM_CATEGORY_SEXUALLY_EXPLICIT = "HARM_CATEGORY_SEXUALLY_EXPLICIT"
+             HARM_CATEGORY_DANGEROUS_CONTENT = "HARM_CATEGORY_DANGEROUS_CONTENT"
+         class HarmBlockThreshold:
+             BLOCK_NONE = "BLOCK_NONE"
+
+
+ # --- Configuration ---
+ GEMINI_API_KEY = os.getenv('GEMINI_API_KEY', "")  # Read from the environment; never hard-code keys
+ LLM_MODEL_NAME = "gemini-1.5-flash-latest"
+ GEMINI_EMBEDDING_MODEL_NAME = "models/embedding-001"  # Standard embedding model
+
+ # Generation configuration for the LLM
+ GENERATION_CONFIG_PARAMS = {
+     "temperature": 0.2,
+     "top_p": 1.0,
+     "top_k": 32,
+     "max_output_tokens": 4096,  # Increased for potentially longer code/explanations
+ }
+
+ # Safety settings for Gemini
+ DEFAULT_SAFETY_SETTINGS = {
+     genai_types.HarmCategory.HARM_CATEGORY_HARASSMENT: genai_types.HarmBlockThreshold.BLOCK_NONE,
+     genai_types.HarmCategory.HARM_CATEGORY_HATE_SPEECH: genai_types.HarmBlockThreshold.BLOCK_NONE,
+     genai_types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: genai_types.HarmBlockThreshold.BLOCK_NONE,
+     genai_types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: genai_types.HarmBlockThreshold.BLOCK_NONE,
+ }
+
+
+ # Logging setup
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(module)s - %(message)s')
+
+ # Configure the Gemini SDK globally if an API key is available
+ if GEMINI_API_KEY:
+     try:
+         genai.configure(api_key=GEMINI_API_KEY)
+         logging.info(f"Gemini API key configured. Target model for generation: '{LLM_MODEL_NAME}', Embedding model: '{GEMINI_EMBEDDING_MODEL_NAME}'")
+     except Exception as e:
+         logging.error(f"Failed to configure Gemini API: {e}", exc_info=True)
+ else:
+     logging.warning("GEMINI_API_KEY environment variable not set. LLM and Embedding functionalities will be limited.")
+
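+ # Supplying the key is environment-specific; a typical shell setup (illustrative):
+ #   export GEMINI_API_KEY="<your-key>"
+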
+
+ # --- RAG Documents Definition (as in your example) ---
+ # This will be used by the AdvancedRAGSystem.
+ # You can replace this with more relevant documents for your LinkedIn dashboard context if needed.
+ rag_documents_data = {
+     'Title': [
+         "Employer Branding Best Practices 2024",
+         "Attracting Tech Talent in Competitive Markets",
+         "Understanding Company Culture for Talent Acquisition",
+         "Diversity and Inclusion in Modern Hiring Processes",
+         "Leveraging LinkedIn Data for Recruitment Insights",
+         "Analyzing Employee Engagement Metrics",
+         "Content Strategies for LinkedIn Company Pages"
+     ],
+     'Text': [
+         "Focus on authentic employee stories and showcase your company's mission. Transparency in compensation and benefits is key. Leverage social media, especially LinkedIn, to highlight your work environment and values. Regularly share updates about company achievements and employee successes.",
+         "Tech candidates value challenging projects, opportunities for learning new technologies, and a flexible work culture. Highlight your tech stack, innovation efforts, and career development paths. Competitive salaries and benefits are standard expectations.",
+         "Company culture is defined by shared values, beliefs, and behaviors. It's crucial for attracting and retaining talent that aligns with the organization. Assess culture through employee surveys, feedback sessions, and by observing daily interactions. Promote a positive culture actively.",
+         "Promote diversity and inclusion by using inclusive language in job descriptions, ensuring diverse interview panels, and highlighting D&I initiatives. Track diversity metrics and be transparent about your goals and progress. An inclusive culture boosts innovation.",
+         "LinkedIn data provides rich insights into talent pools, competitor strategies, and industry trends. Analyze follower demographics, content engagement, and employee advocacy to refine your employer branding and recruitment efforts. Use LinkedIn Analytics effectively.",
+         "High employee engagement correlates with better retention and productivity. Key metrics include employee Net Promoter Score (eNPS), satisfaction surveys, and participation in company initiatives. Address feedback promptly to foster a positive work environment.",
+         "Develop a content calendar for your LinkedIn Company Page that includes a mix of thought leadership, company news, employee spotlights, job postings, and industry insights. Use visuals and videos to increase engagement. Analyze post performance to optimize your strategy."
+     ]
+ }
+ df_rag_documents = pd.DataFrame(rag_documents_data)
+
+
+ # --- Schema Representation ---
+ def get_schema_representation(df_name: str, df: pd.DataFrame) -> str:
+     """Generates a string representation of a DataFrame's schema and a sample of its data."""
+     if df.empty:
+         return f"Schema for DataFrame 'df_{df_name}':\n  - DataFrame is empty.\n"
+
+     cols = df.columns.tolist()
+     dtypes = df.dtypes.to_dict()
+     schema_str = f"Schema for DataFrame 'df_{df_name}':\n"  # Note: using df_ prefix for LLM
+     for col in cols:
+         schema_str += f"  - Column '{col}': {dtypes[col]}\n"
+
+     # Add notes for complex data types or common pitfalls
+     for col in cols:
+         if 'date' in col.lower() or 'time' in col.lower():
+             schema_str += f"  - Note: Column '{col}' seems to be date/time related. Ensure it is in datetime format for time-series analysis (e.g., using pd.to_datetime(df_{df_name}['{col}'])).\n"
+         if df[col].apply(type).eq(list).any() or df[col].apply(type).eq(dict).any():
+             schema_str += f"  - Note: Column '{col}' may contain list-like or dict-like data (e.g., skills: ['Python', 'SQL']). Use operations like .explode() or .apply(pd.Series) or .apply(lambda x: ...) for lists/dicts.\n"
+         if df[col].dtype == 'object' and df[col].nunique() < 20 and df.shape[0] > 20:  # Potential categorical
+             schema_str += f"  - Note: Column '{col}' is of type object and has few unique values; it might be categorical. Use .value_counts() for distribution.\n"
+
+     schema_str += f"Sample of first 2 rows of 'df_{df_name}':\n{df.head(2).to_string()}\n"
+     return schema_str
+
+ def get_all_schemas_representation(dataframes_dict: dict) -> str:
+     """Generates a combined string representation of schemas for all DataFrames."""
+     full_schema_str = "You have access to the following Pandas DataFrames. In your Python code, refer to them with the 'df_' prefix (e.g., df_follower_stats, df_posts).\n\n"
+     for name, df_instance in dataframes_dict.items():
+         full_schema_str += get_schema_representation(name, df_instance) + "\n"
+     return full_schema_str
+
+
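+ # A minimal usage sketch of the schema helpers (the 'posts' DataFrame below is a
+ # hypothetical stand-in, not part of the module's data):
+ #
+ #   df_demo = pd.DataFrame({"post_date": ["2024-01-01"], "likes": [10]})
+ #   print(get_all_schemas_representation({"posts": df_demo}))
+ #   # Emits "Schema for DataFrame 'df_posts': ..." plus notes and a 2-row sample.
+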
+ # --- Advanced RAG System ---
+ class AdvancedRAGSystem:
+     def __init__(self, documents_df: pd.DataFrame, embedding_model_name: str):
+         if not GEMINI_API_KEY:
+             logging.warning("RAG System: GEMINI_API_KEY not set. Embeddings will not be generated.")
+             self.documents_df = documents_df.copy()  # Still keep the df for potential non-embedding use
+             if 'Embeddings' not in self.documents_df.columns:
+                 self.documents_df['Embeddings'] = pd.Series(dtype='object')
+             self.embedding_model_name = embedding_model_name
+             self.embeddings_generated = False
+             return
+
+         self.documents_df = documents_df.copy()
+         self.embedding_model_name = embedding_model_name
+         self.embeddings_generated = False
+         try:
+             self._precompute_embeddings()
+             self.embeddings_generated = True
+             logging.info("AdvancedRAGSystem initialized and embeddings precomputed.")
+         except Exception as e:
+             logging.error(f"Error during RAG embedding precomputation: {e}", exc_info=True)
+             if 'Embeddings' not in self.documents_df.columns:
+                 self.documents_df['Embeddings'] = pd.Series(dtype='object')
+
+
+     def _embed_fn(self, title: str, text: str) -> list[float]:
+         try:
+             # Guard on API availability, not on self.embeddings_generated: this method
+             # runs during precomputation, before that flag is ever set to True.
+             if not GEMINI_API_KEY or not hasattr(genai, 'embed_content'):
+                 logging.warning("genai.embed_content not available (missing API key or library). Returning zero vector.")
+                 return [0.0] * 768
+
+             embedding_result = genai.embed_content(
+                 model=self.embedding_model_name,
+                 content=text,
+                 task_type="retrieval_document",
+                 title=title
+             )
+             return embedding_result["embedding"]
+         except Exception as e:
+             logging.error(f"Error embedding content '{title}': {e}", exc_info=True)
+             return [0.0] * 768
+
+     def _precompute_embeddings(self):
+         if 'Embeddings' not in self.documents_df.columns:
+             self.documents_df['Embeddings'] = pd.Series(dtype='object')
+
+         # Only compute for rows where 'Embeddings' is missing, not a list, or a zero vector
+         for index, row in self.documents_df.iterrows():
+             current_embedding = row['Embeddings']
+             is_valid_embedding = isinstance(current_embedding, list) and len(current_embedding) > 0 and sum(abs(x) for x in current_embedding) > 1e-6  # Not a zero vector
+
+             if not is_valid_embedding:
+                 self.documents_df.at[index, 'Embeddings'] = self._embed_fn(row['Title'], row['Text'])
+         logging.info("Embeddings precomputation finished.")
+
+
+     def retrieve_relevant_info(self, query_text: str, top_k: int = 2) -> str:
+         if not self.embeddings_generated or not hasattr(genai, 'embed_content') or 'Embeddings' not in self.documents_df.columns or self.documents_df['Embeddings'].isnull().all():
+             logging.warning("RAG System: Cannot retrieve info due to missing API key, genai functions, or empty/missing embeddings.")
+             return "\n[RAG Context]\nNo specific pre-defined context found (RAG system inactive or no embeddings).\n"
+
+         try:
+             query_embedding_result = genai.embed_content(
+                 model=self.embedding_model_name,
+                 content=query_text,
+                 task_type="retrieval_query"
+             )
+             query_embedding = np.array(query_embedding_result["embedding"])
+
+             valid_embeddings_df = self.documents_df.dropna(subset=['Embeddings'])
+             valid_embeddings_df = valid_embeddings_df[valid_embeddings_df['Embeddings'].apply(lambda x: isinstance(x, list) and len(x) > 0 and sum(abs(val) for val in x) > 1e-6)]
+
+             if valid_embeddings_df.empty:
+                 logging.warning("No valid document embeddings found for RAG.")
+                 return "\n[RAG Context]\nNo valid document embeddings available for retrieval.\n"
+
+             document_embeddings = np.stack(valid_embeddings_df['Embeddings'].apply(np.array).values)
+
+             if query_embedding.shape[0] != document_embeddings.shape[1]:
+                 logging.error(f"Query embedding dim ({query_embedding.shape[0]}) != Document embedding dim ({document_embeddings.shape[1]})")
+                 return "\n[RAG Context]\nEmbedding dimension mismatch.\n"
+
+             # Rank documents by dot-product similarity against the query embedding
+             dot_products = np.dot(document_embeddings, query_embedding)
+
+             # Ensure top_k does not exceed the number of available docs
+             num_available_docs = len(valid_embeddings_df)
+             actual_top_k = min(top_k, num_available_docs)
+
+             if actual_top_k == 0:
+                 return "\n[RAG Context]\nNo documents to retrieve from.\n"
+
+             if actual_top_k == 1:
+                 idx = [int(np.argmax(dot_products))]
+             else:
+                 idx = np.argsort(dot_products)[-actual_top_k:][::-1]
+
+             relevant_passages = ""
+             for i in idx:
+                 passage_title = valid_embeddings_df.iloc[i]['Title']
+                 passage_text = valid_embeddings_df.iloc[i]['Text']
+                 relevant_passages += f"\n[RAG Context from: '{passage_title}']\n{passage_text}\n"
+
+             logging.info(f"RAG System retrieved: {relevant_passages[:200]}...")
+             return relevant_passages if relevant_passages else "\n[RAG Context]\nNo highly relevant passages found.\n"
+
+         except Exception as e:
+             logging.error(f"Error retrieving relevant info from RAG: {e}", exc_info=True)
+             return f"\n[RAG Context]\nError during RAG retrieval: {str(e)}\n"
+
+
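+ # Retrieval usage sketch (assumes GEMINI_API_KEY is set; output is illustrative):
+ #
+ #   rag = AdvancedRAGSystem(df_rag_documents, GEMINI_EMBEDDING_MODEL_NAME)
+ #   print(rag.retrieve_relevant_info("How do we attract tech talent?", top_k=2))
+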
+ # --- PandasLLM Class (Gemini-Powered) ---
+ class PandasLLM:
+     def __init__(self, llm_model_name: str, generation_config_params: dict, safety_settings: dict, data_privacy=True, force_sandbox=True):
+         self.llm_model_name = llm_model_name
+         self.generation_config_params = generation_config_params
+         self.safety_settings = safety_settings
+         self.data_privacy = data_privacy
+         self.force_sandbox = force_sandbox  # If True, the LLM must output Python code, which is then exec'd
+
+         if not GEMINI_API_KEY:
+             logging.warning("PandasLLM: GEMINI_API_KEY not set. LLM functionalities will be limited.")
+             self.model = None
+         else:
+             try:
+                 self.model = genai.GenerativeModel(
+                     self.llm_model_name,
+                     safety_settings=self.safety_settings
+                 )
+                 logging.info(f"PandasLLM initialized with Gemini model '{self.llm_model_name}'. data_privacy={data_privacy}, force_sandbox={force_sandbox}")
+             except Exception as e:
+                 logging.error(f"Failed to initialize GenerativeModel '{self.llm_model_name}': {e}", exc_info=True)
+                 self.model = None
+
+     async def _call_gemini_api_async(self, prompt_text: str, history: list = None) -> str:
+         if not self.model:
+             logging.error("PandasLLM: Gemini model not initialized. Cannot call API.")
+             return "# Error: Gemini model not available. Check API key and configuration."
+
+         # The API expects a list of role-tagged content dicts. PandasLLM is used
+         # single-turn for code generation, but prior history can be prepended
+         # when the calling agent supplies it.
+         contents_for_api = [{"role": "user", "parts": [{"text": prompt_text}]}]
+         if history:
+             # Normalize history to [{'role': 'user'|'model', 'parts': [{'text': ...}]}]
+             formatted_history = []
+             for entry in history:
+                 role = entry.get("role", "user")  # Default to user if role not specified
+                 if role == "assistant": role = "model"  # Gemini uses "model" for the assistant role
+                 formatted_history.append({"role": role, "parts": [{"text": entry.get("content", "")}]})
+             contents_for_api = formatted_history + contents_for_api
+
+         try:
+             gen_config_obj = genai_types.GenerationConfig(**self.generation_config_params)
+         except Exception as e:
+             logging.error(f"Error creating GenerationConfig: {e}. Using dict directly.")
+             gen_config_obj = self.generation_config_params
+
+         logging.info(f"\n--- Calling Gemini API with prompt (first 500 chars of last message): ---\n{contents_for_api[-1]['parts'][0]['text'][:500]}...\n-------------------------------------------------------\n")
+
+         try:
+             # Run the blocking SDK call in a worker thread so it does not block the event loop
+             response = await asyncio.to_thread(
+                 self.model.generate_content,
+                 contents=contents_for_api,
+                 generation_config=gen_config_obj,
+                 # stream=False  # Ensure non-streaming for this setup
+             )
+
+             if response.prompt_feedback and response.prompt_feedback.block_reason:
+                 reason = response.prompt_feedback.block_reason
+                 reason_name = getattr(reason, 'name', str(reason))  # Handle enum or string
+                 logging.warning(f"Gemini API call blocked by prompt feedback: {reason_name}")
+                 return f"# Error: Prompt blocked due to content policy: {reason_name}."
+
+             # Try to extract text, accounting for different response structures
+             llm_output = ""
+             if hasattr(response, 'text') and response.text:
+                 llm_output = response.text
+             elif response.candidates:
+                 candidate = response.candidates[0]
+                 if candidate.content and candidate.content.parts:
+                     llm_output = "".join(part.text for part in candidate.content.parts if hasattr(part, 'text'))
+
+                 # Check the finish reason if output is empty
+                 if not llm_output:
+                     finish_reason_val = candidate.finish_reason
+                     finish_reason = getattr(finish_reason_val, 'name', str(finish_reason_val))  # Handle enum or string
+                     logging.warning(f"No text content in response candidate. Finish reason: {finish_reason}")
+                     if finish_reason == "SAFETY":
+                         return f"# Error: Response generation stopped due to safety reasons ({finish_reason})."
+                     elif finish_reason == "RECITATION":
+                         return f"# Error: Response generation stopped due to recitation policy ({finish_reason})."
+                     return f"# Error: The AI model returned an empty response. Finish reason: {finish_reason}."
+             else:
+                 logging.warning("Gemini API response structure not recognized or empty.")
+                 return "# Error: The AI model returned an unexpected or empty response structure."
+
+             logging.info(f"--- Gemini API Response (first 300 chars): ---\n{llm_output[:300]}...\n--------------------------------------------------\n")
+             return llm_output
+
+         except AttributeError as ae:  # Catches issues with the dummy genai objects when the import failed
+             logging.error(f"AttributeError during Gemini call (likely due to missing API key/dummy objects): {ae}", exc_info=True)
+             return f"# Error (Attribute): {type(ae).__name__} - {ae}. Check if GEMINI_API_KEY is set and the google-generativeai library is correctly installed."
+         except Exception as e:
+             logging.error(f"Error calling Gemini API: {e}", exc_info=True)
+             # Map common failure modes to actionable messages
+             if "API_KEY_INVALID" in str(e) or "API key not valid" in str(e):
+                 return "# Error: Gemini API key is not valid. Please check your GEMINI_API_KEY environment variable."
+             if "400" in str(e) and "model" in str(e).lower() and ("not found" in str(e).lower() or "does not exist" in str(e).lower()):
+                 return f"# Error: Gemini model '{self.llm_model_name}' not found or not accessible with your API key. Check the model name and permissions."
+             if "DeadlineExceeded" in str(e) or "504" in str(e):
+                 return "# Error: The request to the Gemini API timed out. Please try again later."
+             if "PermissionDenied" in str(e) or "403" in str(e):
+                 return "# Error: Permission denied. Check if your API key has access to the model or required services."
+             return f"# Error: An unexpected error occurred while contacting the AI model: {type(e).__name__} - {str(e)[:100]}..."
+
+
+     async def query(self, prompt_with_query_and_context: str, dataframes_dict: dict, history: list = None) -> str:
+         """
+         Sends a prompt to the LLM and optionally executes the returned Python code in a sandbox.
+         dataframes_dict: Keys are 'base_name' (e.g., 'profiles'), values are pd.DataFrame.
+                          Inside exec, they are available as 'df_base_name'.
+         history: Optional chat history for conversational context.
+         """
+         llm_response_text = await self._call_gemini_api_async(prompt_with_query_and_context, history)
+
+         if self.force_sandbox:
+             # Attempt to extract a Python code block
+             code_to_execute = ""
+             if "```python" in llm_response_text:
+                 try:
+                     code_to_execute = llm_response_text.split("```python\n", 1)[1].split("\n```", 1)[0]
+                 except IndexError:
+                     # The format may be slightly off, e.g. no newline after ```python
+                     try:
+                         code_to_execute = llm_response_text.split("```python", 1)[1].split("```", 1)[0]
+                         code_to_execute = code_to_execute.strip("\n")  # remove leading/trailing newlines
+                     except IndexError:
+                         code_to_execute = ""  # Fallback, code not extracted
+                         logging.warning("Could not extract Python code using primary or secondary split method.")
+
+             llm_response_text_for_sandbox_error = ""
+             if llm_response_text.startswith("# Error:") or not code_to_execute:
+                 error_prefix = "LLM did not return a valid Python code block or an error occurred."
+                 if llm_response_text.startswith("# Error:"): error_prefix = "An error occurred during LLM call."
+                 elif not code_to_execute: error_prefix = "Could not extract Python code from LLM response."
+
+                 # Build a plain print() statement via repr() so braces or quotes in the
+                 # raw response cannot break the generated fallback code
+                 error_message = f"{error_prefix}\nRaw LLM Response (may be truncated):\n{str(llm_response_text)[:1000]}"
+                 llm_response_text_for_sandbox_error = f"print({error_message!r})"
+                 logging.warning(f"Problem with LLM response for sandbox: {error_prefix}")
+
+             logging.info(f"\n--- Code to Execute (from LLM, if sandbox): ---\n{code_to_execute}\n------------------------------------------------\n")
+
+             # Define a restricted set of built-ins. NOTE: this is a best-effort guard,
+             # not a real sandbox; for a web app, use a dedicated sandboxing solution.
+             # (vars(builtins) works whether or not __builtins__ is a module or a dict.)
+             import builtins
+             safe_builtins = {name: obj for name, obj in vars(builtins).items() if not name.startswith('_')}
+             unsafe_builtins = ['eval', 'exec', 'open', 'compile', 'input', 'memoryview', 'vars', 'globals', 'locals', '__import__']
+             for ub in unsafe_builtins:
+                 safe_builtins.pop(ub, None)
+
+             # Prepare globals for exec: pandas, numpy, DataFrames, and restricted builtins
+             exec_globals = {'pd': pd, 'np': np, '__builtins__': safe_builtins}
+             for name, df_instance in dataframes_dict.items():
+                 exec_globals[f"df_{name}"] = df_instance  # e.g. df_follower_stats, df_posts
+
+             from io import StringIO
+             import sys
+             old_stdout = sys.stdout
+             sys.stdout = captured_output = StringIO()
+
+             final_output_str = ""
+             try:
+                 if code_to_execute:  # Only execute if code was extracted
+                     # Use exec_globals as both globals and locals so that names the
+                     # generated code defines at top level stay visible to its own functions
+                     exec(code_to_execute, exec_globals, exec_globals)
+                     output_val = captured_output.getvalue()
+                     final_output_str = output_val if output_val else "# Code executed successfully, but no explicit print() output was generated by the code."
+                     logging.info(f"--- Sandbox Execution Output: ---\n{final_output_str}\n-------------------------\n")
+                 else:  # No code to execute; run the prepared error message instead
+                     exec(llm_response_text_for_sandbox_error, exec_globals, exec_globals)
+                     final_output_str = captured_output.getvalue()
+                     logging.warning(f"--- Sandbox Fallback Output (No Code Executed): ---\n{final_output_str}\n-------------------------\n")
+
+             except Exception as e:
+                 error_msg = f"# Error executing LLM-generated code:\n# {type(e).__name__}: {str(e)}\n# --- Code that caused error: ---\n{textwrap.indent(code_to_execute, '# ')}"
+                 final_output_str = error_msg
+                 logging.error(error_msg, exc_info=False)  # exc_info=False to avoid a huge traceback in the Gradio UI
+             finally:
+                 sys.stdout = old_stdout  # Reset stdout
+             return final_output_str
+         else:  # Not force_sandbox: return the LLM text directly
+             return llm_response_text
+
+
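+ # Direct usage sketch (hypothetical df_posts and prompt; requires a valid API key):
+ #
+ #   llm = PandasLLM(LLM_MODEL_NAME, GENERATION_CONFIG_PARAMS, DEFAULT_SAFETY_SETTINGS)
+ #   out = asyncio.run(llm.query("Print df_posts.shape", {"posts": df_posts}))
+ #   print(out)
+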
+ # --- Employer Branding Agent ---
+ class EmployerBrandingAgent:
+     def __init__(self, llm_model_name: str, generation_config_params: dict, safety_settings: dict,
+                  all_dataframes: dict, rag_documents_df: pd.DataFrame, embedding_model_name: str,
+                  data_privacy=True, force_sandbox=True):
+         self.pandas_llm = PandasLLM(llm_model_name, generation_config_params, safety_settings, data_privacy, force_sandbox)
+         self.rag_system = AdvancedRAGSystem(rag_documents_df, embedding_model_name)
+         self.all_dataframes = all_dataframes  # Keys are 'base_name', values are pd.DataFrame
+         self.schemas_representation = get_all_schemas_representation(self.all_dataframes)
+         self.chat_history = []  # Stores conversation history for this agent instance
+         logging.info("EmployerBrandingAgent initialized.")
+
+     def _build_prompt(self, user_query: str, role="Employer Branding Analyst", task_decomposition_hint=None, cot_hint=True) -> str:
+         # Base prompt
+         prompt = f"You are a helpful and expert '{role}'. Your primary goal is to assist with analyzing LinkedIn-related data using Pandas DataFrames.\n"
+         prompt += "You will be provided with schemas for available Pandas DataFrames and a user query.\n"
+
+         if self.pandas_llm.data_privacy:
+             prompt += "IMPORTANT: Be mindful of data privacy. Do not output raw Personally Identifiable Information (PII) like names or specific user details unless explicitly asked and absolutely necessary for the query. Summarize or aggregate data where possible.\n"
+
+         if self.pandas_llm.force_sandbox:
+             prompt += "Your main task is to GENERATE PYTHON CODE using the Pandas library to answer the user query based on the provided DataFrames. Output ONLY the Python code block.\n"
+             prompt += "The available DataFrames are already loaded and can be accessed by their dictionary keys prefixed with 'df_' (e.g., df_follower_stats, df_posts) within the execution environment.\n"
+             prompt += "Example of accessing a DataFrame: `df_follower_stats['country']`.\n"
+             prompt += "Your Python code MUST include `print()` statements for any results, DataFrames, or values you want to display. The output of these print statements will be the final answer.\n"
+             prompt += "If a column contains lists (e.g., 'skills' in a hypothetical 'df_employees'), you might need methods like `.explode()` or `.apply(pd.Series)` or `.apply(lambda x: ...)` for analysis.\n"
+             prompt += "If the query is ambiguous or requires clarification, ask for it instead of making assumptions. If the query cannot be answered with the given data, state that clearly.\n"
+             prompt += "If the query is not about data analysis or code generation (e.g. 'hello', 'how are you?'), respond politely and briefly; do not attempt to generate code.\n"
+             prompt += "Structure your code clearly. Add comments (#) to explain each step of your logic.\n"
+         else:  # Textual response mode
+             prompt += "Your task is to analyze the data and provide a comprehensive textual answer to the user query. You can explain your reasoning step-by-step.\n"
+
+         prompt += "\n--- AVAILABLE DATA AND SCHEMAS ---\n"
+         prompt += self.schemas_representation
+
+         # RAG context (only added if relevant context is found)
+         rag_context = self.rag_system.retrieve_relevant_info(user_query)
+         if rag_context and "[RAG Context]" in rag_context and "No specific pre-defined context found" not in rag_context and "No highly relevant passages found" not in rag_context:
+             prompt += f"\n--- ADDITIONAL CONTEXT (from internal knowledge base, consider this information) ---\n{rag_context}\n"
+
+         prompt += f"\n--- USER QUERY ---\n{user_query}\n"
+
+         if task_decomposition_hint:
+             prompt += f"\n--- GUIDANCE FOR ANALYSIS (Task Decomposition) ---\n{task_decomposition_hint}\n"
+
+         if cot_hint:
+             if self.pandas_llm.force_sandbox:
+                 prompt += "\n--- INSTRUCTIONS FOR PYTHON CODE GENERATION (Chain of Thought) ---\n"
+                 prompt += "1. Understand the query: What specific information is requested?\n"
+                 prompt += "2. Identify relevant DataFrame(s) and column(s) from the schemas provided.\n"
+                 prompt += "3. Plan the steps: Outline the Pandas operations needed (filtering, grouping, aggregation, merging, etc.) as comments in your code.\n"
+                 prompt += "4. Write the code: Implement the steps using Pandas. Remember to use `df_name_of_dataframe` (e.g. `df_follower_stats`).\n"
+                 prompt += "5. Ensure output: Use `print()` for all results that should be displayed. For DataFrames, you can print the DataFrame directly, or `df.to_string()` if it's large.\n"
+                 prompt += "6. Review: Check for correctness, efficiency, and adherence to the prompt (especially the `print()` requirement).\n"
+                 prompt += "7. Generate ONLY the Python code block starting with ```python and ending with ```. No explanations outside the code block's comments.\n"
+             else:  # Textual CoT
+                 prompt += "\n--- INSTRUCTIONS FOR RESPONSE (Chain of Thought) ---\n"
+                 prompt += "1. Understand the query fully.\n"
+                 prompt += "2. Identify the relevant data sources (DataFrames and columns).\n"
+                 prompt += "3. Explain your analytical approach step-by-step.\n"
+                 prompt += "4. Perform the analysis (mentally or by outlining the steps).\n"
+                 prompt += "5. Present the findings clearly and concisely. If you performed calculations, show or describe them.\n"
+                 prompt += "6. If applicable, incorporate insights from the 'ADDITIONAL CONTEXT' (RAG system).\n"
+
+         return prompt
+
+     async def process_query(self, user_query: str, role="Employer Branding Analyst", task_decomposition_hint=None, cot_hint=True) -> str:
+         logging.info(f"\n=== Processing Query for Role: {role} ===")
+         logging.info(f"User Query: {user_query}")
+
+         # Add the user query to the chat history
+         self.chat_history.append({"role": "user", "content": user_query})
+
+         full_prompt = self._build_prompt(user_query, role, task_decomposition_hint, cot_hint)
+
+         # The full prompt is rebuilt each turn with the latest user query; prior
+         # chat history (excluding the current query) is passed along for context.
+         response_text = await self.pandas_llm.query(full_prompt, self.all_dataframes, history=self.chat_history[:-1])
+
+         # Add the assistant response to the chat history
+         self.chat_history.append({"role": "assistant", "content": response_text})
+
+         # Limit history size to avoid overly long prompts in future turns
+         MAX_HISTORY_TURNS = 5  # 5 pairs of user/assistant messages
+         if len(self.chat_history) > MAX_HISTORY_TURNS * 2:
+             self.chat_history = self.chat_history[-(MAX_HISTORY_TURNS * 2):]
+
+         return response_text
+
+     def update_dataframes(self, new_dataframes: dict):
+         """Updates the agent's DataFrames and their schema representation."""
+         self.all_dataframes = new_dataframes
+         self.schemas_representation = get_all_schemas_representation(self.all_dataframes)
+         logging.info("EmployerBrandingAgent DataFrames updated.")
+
+     def clear_chat_history(self):
+         self.chat_history = []
+         logging.info("EmployerBrandingAgent chat history cleared.")
+
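+
+ # Minimal end-to-end smoke-test sketch. The DataFrames below are hypothetical
+ # stand-ins for real LinkedIn exports; a valid GEMINI_API_KEY is required for
+ # actual LLM calls (without it, the agent logs warnings and returns error strings).
+ if __name__ == "__main__":
+     sample_dataframes = {
+         "follower_stats": pd.DataFrame({
+             "date": pd.to_datetime(["2024-01-01", "2024-02-01"]),
+             "follower_count": [1200, 1350],
+             "country": ["US", "DE"],
+         }),
+         "posts": pd.DataFrame({
+             "post_date": pd.to_datetime(["2024-01-15", "2024-02-10"]),
+             "impressions": [5000, 7200],
+             "likes": [150, 210],
+         }),
+     }
+
+     agent = EmployerBrandingAgent(
+         LLM_MODEL_NAME, GENERATION_CONFIG_PARAMS, DEFAULT_SAFETY_SETTINGS,
+         sample_dataframes, df_rag_documents, GEMINI_EMBEDDING_MODEL_NAME,
+     )
+     answer = asyncio.run(agent.process_query("How did the follower count change between January and February?"))
+     print(answer)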