Spaces:

GuglielmoTor
/

LinkedinMonitor

Running

App Files Community

GuglielmoTor committed on May 27

Commit

09757d6

verified ·

1 Parent(s): 25e22b8

Update eb_agent_module.py

Browse files

Files changed (1) hide show

eb_agent_module.py +4 -216

eb_agent_module.py CHANGED Viewed

@@ -80,18 +80,6 @@ GENERATION_CONFIG_PARAMS = {
 DEFAULT_SAFETY_SETTINGS = []
-# Default RAG documents
-DEFAULT_RAG_DOCUMENTS = pd.DataFrame({
-    'text': [
-        "Employer branding focuses on how an organization is perceived as an employer by potential and current employees.",
-        "Key metrics for employer branding include employee engagement, candidate quality, and retention rates.",
-        "LinkedIn is a crucial platform for showcasing company culture and attracting talent.",
-        "Analyzing follower demographics and post engagement helps refine employer branding strategies.",
-        "Content strategy should align with company values to attract the right talent.",
-        "Employee advocacy programs can significantly boost employer brand reach and authenticity."
-    ]
-})
 # --- Client Initialization ---
 client = None
 if GEMINI_API_KEY and GENAI_AVAILABLE:
@@ -163,189 +151,11 @@ def get_all_schemas_representation(dataframes: Dict[str, pd.DataFrame]) -> str:
         full_representation.append(get_df_schema_representation(df_instance, name))
     return "\n".join(full_representation)
-class AdvancedRAGSystem:
-    def __init__(self, documents_df: pd.DataFrame, embedding_model_name: str):
-        self.documents_df = documents_df.copy() if not documents_df.empty else DEFAULT_RAG_DOCUMENTS.copy()
-        # Ensure 'text' column exists
-        if 'text' not in self.documents_df.columns and not self.documents_df.empty:
-            logging.warning("'text' column not found in RAG documents. RAG might not work.")
-            self.documents_df['text'] = ""
-        self.embedding_model_name = embedding_model_name
-        self.embeddings: Optional[np.ndarray] = None
-        self.is_initialized = False
-        logging.info(f"AdvancedRAGSystem initialized with {len(self.documents_df)} documents. Model: {self.embedding_model_name}")
-    def _embed_single_document_sync(self, text: str) -> Optional[np.ndarray]:
-        if not client:
-            raise ConnectionError("GenAI client not initialized for RAG embedding.")
-        if not text or not isinstance(text, str):
-            logging.warning("Cannot embed empty or non-string text for RAG.")
-            return None
-        try:
-            embed_config_payload = None
-            if GENAI_AVAILABLE and hasattr(types, 'EmbedContentConfig'):
-                embed_config_payload = types.EmbedContentConfig(task_type="RETRIEVAL_DOCUMENT")
-            response = client.models.embed_content(
-                model=f"models/{self.embedding_model_name}" if not self.embedding_model_name.startswith("models/") else self.embedding_model_name,
-                contents=text,
-                config=embed_config_payload
-            )
-            # Fix: Handle ContentEmbedding objects properly
-            if hasattr(response, 'embeddings') and isinstance(response.embeddings, list) and len(response.embeddings) > 0:
-                embedding_obj = response.embeddings[0]
-                # Extract values from ContentEmbedding object
-                if hasattr(embedding_obj, 'values'):
-                    embedding_values = embedding_obj.values
-                elif hasattr(embedding_obj, 'embedding'):
-                    embedding_values = embedding_obj.embedding
-                elif isinstance(embedding_obj, (list, tuple)):
-                    embedding_values = embedding_obj
-                else:
-                    # Try to convert to list/array if it's a different object type
-                    try:
-                        embedding_values = list(embedding_obj)
-                    except:
-                        logging.error(f"Cannot extract embedding values from object type: {type(embedding_obj)}")
-                        return None
-                return np.array(embedding_values, dtype=np.float32)
-            else:
-                logging.error(f"Unexpected response structure")
-                return None
-        except Exception as e:
-            logging.error(f"Error in _embed_single_document_sync for text '{text[:50]}...': {e}", exc_info=True)
-            raise
-    async def initialize_embeddings(self):
-        if self.documents_df.empty or 'text' not in self.documents_df.columns:
-            logging.warning("RAG documents DataFrame is empty or lacks 'text' column. Skipping embedding.")
-            self.embeddings = np.array([])
-            self.is_initialized = True
-            return
-        if not client and not (GENAI_AVAILABLE and os.getenv('GEMINI_API_KEY')):
-            logging.error("GenAI client not available for RAG embedding initialization.")
-            self.embeddings = np.array([])
-            return
-        logging.info(f"Starting RAG document embedding for {len(self.documents_df)} documents...")
-        embedded_docs_list = []
-        for index, row in self.documents_df.iterrows():
-            text_to_embed = row.get('text', '')
-            if not text_to_embed or not isinstance(text_to_embed, str):
-                logging.warning(f"Skipping RAG document at index {index} due to invalid/empty text.")
-                continue
-            try:
-                embedding_array = await asyncio.to_thread(self._embed_single_document_sync, text_to_embed)
-                if embedding_array is not None and embedding_array.size > 0:
-                    embedded_docs_list.append(embedding_array)
-                else:
-                    logging.warning(f"Empty or failed embedding for RAG document at index {index}.")
-            except Exception as e:
-                logging.error(f"Error embedding RAG document at index {index}: {e}")
-                continue
-        if not embedded_docs_list:
-            self.embeddings = np.array([])
-            logging.warning("No RAG documents were successfully embedded.")
-        else:
-            try:
-                first_shape = embedded_docs_list[0].shape
-                if not all(emb.shape == first_shape for emb in embedded_docs_list):
-                    logging.error("Inconsistent embedding shapes found. Cannot stack for RAG.")
-                    self.embeddings = np.array([])
-                    return
-                self.embeddings = np.vstack(embedded_docs_list)
-                logging.info(f"Successfully embedded {len(embedded_docs_list)} RAG documents. Embeddings shape: {self.embeddings.shape}")
-            except ValueError as ve:
-                logging.error(f"Error stacking embeddings: {ve}")
-                self.embeddings = np.array([])
-        self.is_initialized = True
-    def _calculate_cosine_similarity(self, embeddings_matrix: np.ndarray, query_vector: np.ndarray) -> np.ndarray:
-        # Ensure inputs are numpy arrays with proper dtype
-        embeddings_matrix = np.asarray(embeddings_matrix, dtype=np.float32)
-        query_vector = np.asarray(query_vector, dtype=np.float32)
-        if embeddings_matrix.ndim == 1:
-            embeddings_matrix = embeddings_matrix.reshape(1, -1)
-        if query_vector.ndim == 1:
-            query_vector = query_vector.reshape(1, -1)
-        if embeddings_matrix.size == 0 or query_vector.size == 0:
-            return np.array([])
-        norm_matrix = np.linalg.norm(embeddings_matrix, axis=1, keepdims=True)
-        normalized_embeddings_matrix = np.divide(embeddings_matrix, norm_matrix + 1e-8, where=norm_matrix!=0)
-        norm_query = np.linalg.norm(query_vector, axis=1, keepdims=True)
-        normalized_query_vector = np.divide(query_vector, norm_query + 1e-8, where=norm_query!=0)
-        return np.dot(normalized_embeddings_matrix, normalized_query_vector.T).flatten()
-    async def retrieve_relevant_info(self, query: str, top_k: int = 3, min_similarity: float = 0.3) -> str:
-        if not self.is_initialized:
-            logging.debug("RAG system not initialized. Cannot retrieve info.")
-            return ""
-        if self.embeddings is None or self.embeddings.size == 0:
-            logging.debug("RAG embeddings not available. Cannot retrieve info.")
-            return ""
-        if not query or not isinstance(query, str):
-            logging.debug("Empty or invalid query for RAG retrieval.")
-            return ""
-        if not client and not (GENAI_AVAILABLE and os.getenv('GEMINI_API_KEY')):
-            logging.error("GenAI client not available for RAG query embedding.")
-            return ""
-        try:
-            query_vector = await asyncio.to_thread(self._embed_single_document_sync, query)
-            if query_vector is None or query_vector.size == 0:
-                logging.warning("Query vector embedding failed or is empty for RAG.")
-                return ""
-            similarity_scores = self._calculate_cosine_similarity(self.embeddings, query_vector)
-            if similarity_scores.size == 0:
-                return ""
-            relevant_indices = np.where(similarity_scores >= min_similarity)[0]
-            if len(relevant_indices) == 0:
-                logging.debug(f"No RAG documents met minimum similarity threshold of {min_similarity} for query: '{query[:50]}...'")
-                return ""
-            relevant_scores = similarity_scores[relevant_indices]
-            sorted_relevant_indices_of_original = relevant_indices[np.argsort(relevant_scores)[::-1]]
-            top_indices = sorted_relevant_indices_of_original[:top_k]
-            context_parts = []
-            if 'text' in self.documents_df.columns:
-                for i in top_indices:
-                    if 0 <= i < len(self.documents_df):
-                        context_parts.append(self.documents_df.iloc[i]['text'])
-            context = "\n\n---\n\n".join(context_parts)
-            logging.debug(f"Retrieved RAG context with {len(context_parts)} documents for query: '{query[:50]}...'")
-            return context
-        except Exception as e:
-            logging.error(f"Error during RAG retrieval for query '{query[:50]}...': {e}", exc_info=True)
-            return ""
 class EmployerBrandingAgent:
     def __init__(self,
                  all_dataframes: Optional[Dict[str, pd.DataFrame]] = None,
-                 rag_documents_df: Optional[pd.DataFrame] = None,
                  llm_model_name: str = LLM_MODEL_NAME,
                  embedding_model_name: str = GEMINI_EMBEDDING_MODEL_NAME,
                  generation_config_dict: Optional[Dict] = None,
@@ -353,9 +163,6 @@ class EmployerBrandingAgent:
         self.all_dataframes = {k: v.copy() for k, v in (all_dataframes or {}).items()}
-        _rag_docs_df = rag_documents_df if rag_documents_df is not None else DEFAULT_RAG_DOCUMENTS.copy()
-        self.rag_system = AdvancedRAGSystem(_rag_docs_df, embedding_model_name)
         self.llm_model_name = llm_model_name
         self.generation_config_dict = generation_config_dict or GENERATION_CONFIG_PARAMS
         self.safety_settings_list = safety_settings_list or DEFAULT_SAFETY_SETTINGS
@@ -371,8 +178,6 @@ class EmployerBrandingAgent:
         self.pandas_agent = None
         self._initialize_pandas_agent()
-        logging.info(f"EnhancedEmployerBrandingAgent initialized. LLM: {self.llm_model_name}. RAG docs: {len(self.rag_system.documents_df)}. DataFrames: {list(self.all_dataframes.keys())}")
     def _initialize_pandas_agent(self):
         """Initialize PandasAI with enhanced configuration for chart generation"""
         if not self.all_dataframes or not GEMINI_API_KEY:
@@ -475,8 +280,7 @@ class EmployerBrandingAgent:
             if not client:  # Fix: Remove reference to llm_model_instance
                 logging.error("Cannot initialize agent: GenAI client not available/configured.")
                 return False
-            await self.rag_system.initialize_embeddings()
             # Verify PandasAI agent is ready
             pandas_ready = self.pandas_agent is not None
@@ -485,8 +289,6 @@ class EmployerBrandingAgent:
                 self._initialize_pandas_agent()
                 pandas_ready = self.pandas_agent is not None
-            self.is_ready = self.rag_system.is_initialized and pandas_ready
-            logging.info(f"EnhancedEmployerBrandingAgent.initialize completed. RAG: {self.rag_system.is_initialized}, PandasAI: {pandas_ready}, Agent ready: {self.is_ready}")
             return self.is_ready
         except Exception as e:
@@ -814,7 +616,6 @@ class EmployerBrandingAgent:
         try:
             system_prompt = self._build_system_prompt()
             data_summary = self._get_dataframes_summary()
-            rag_context = await self.rag_system.retrieve_relevant_info(query, top_k=2, min_similarity=0.25)
             # Build enhanced prompt based on query type and available results
             if query_type == "data" and pandas_result:
@@ -828,7 +629,6 @@ class EmployerBrandingAgent:
                 {pandas_result}
                 ## Additional Knowledge Context:
-                {rag_context if rag_context else 'No additional context retrieved.'}
                 ## User Query:
                 {query}
@@ -844,7 +644,6 @@ class EmployerBrandingAgent:
                 {data_summary}
                 ## Knowledge Base Context:
-                {rag_context if rag_context else 'No specific background information retrieved.'}
                 ## User Query:
                 {query}
@@ -1005,14 +804,7 @@ class EmployerBrandingAgent:
         # Reinitialize PandasAI agent with new data
         self._initialize_pandas_agent()
-        # Note: RAG system uses static documents and doesn't need reinitialization
-    def update_rag_documents(self, new_rag_df: pd.DataFrame):
-        """Updates RAG documents and reinitializes embeddings"""
-        self.rag_system.documents_df = new_rag_df.copy()
-        logging.info(f"RAG documents updated. Count: {len(new_rag_df)}")
-        # Note: Embeddings will need to be reinitialized - call initialize() after this
     def clear_chat_history(self):
         """Clears the agent's internal chat history"""
@@ -1026,13 +818,10 @@ class EmployerBrandingAgent:
             "has_api_key": bool(GEMINI_API_KEY),
             "genai_available": GENAI_AVAILABLE,
             "client_type": "genai.Client" if client else "None",  # Fix: Remove reference to llm_model_instance
-            "rag_initialized": self.rag_system.is_initialized,
             "pandas_agent_ready": self.pandas_agent is not None,
             "num_dataframes": len(self.all_dataframes),
             "dataframe_keys": list(self.all_dataframes.keys()),
-            "num_rag_documents": len(self.rag_system.documents_df) if self.rag_system.documents_df is not None else 0,
             "llm_model_name": self.llm_model_name,
-            "embedding_model_name": self.rag_system.embedding_model_name,
             "chat_history_length": len(self.chat_history),
             "charts_save_path_pandasai": pai.config.save_charts_path if pai.config.llm else "PandasAI not configured"
         }
@@ -1074,11 +863,10 @@ class EmployerBrandingAgent:
         return suggestions[:10]  # Limit to top 10 suggestions
 # --- Helper Functions for External Integration ---
-def create_agent_instance(dataframes: Optional[Dict[str, pd.DataFrame]] = None,
-                          rag_docs: Optional[pd.DataFrame] = None) -> EmployerBrandingAgent:
     """Factory function to create a new agent instance"""
     logging.info("Creating new EnhancedEmployerBrandingAgent instance via helper function.")
-    return EmployerBrandingAgent(all_dataframes=dataframes, rag_documents_df=rag_docs)
 async def initialize_agent_async(agent: EmployerBrandingAgent) -> bool:
     """Async helper to initialize an agent instance"""

 DEFAULT_SAFETY_SETTINGS = []
 # --- Client Initialization ---
 client = None
 if GEMINI_API_KEY and GENAI_AVAILABLE:
         full_representation.append(get_df_schema_representation(df_instance, name))
     return "\n".join(full_representation)
 class EmployerBrandingAgent:
     def __init__(self,
                  all_dataframes: Optional[Dict[str, pd.DataFrame]] = None,
                  llm_model_name: str = LLM_MODEL_NAME,
                  embedding_model_name: str = GEMINI_EMBEDDING_MODEL_NAME,
                  generation_config_dict: Optional[Dict] = None,
         self.all_dataframes = {k: v.copy() for k, v in (all_dataframes or {}).items()}
         self.llm_model_name = llm_model_name
         self.generation_config_dict = generation_config_dict or GENERATION_CONFIG_PARAMS
         self.safety_settings_list = safety_settings_list or DEFAULT_SAFETY_SETTINGS
         self.pandas_agent = None
         self._initialize_pandas_agent()
     def _initialize_pandas_agent(self):
         """Initialize PandasAI with enhanced configuration for chart generation"""
         if not self.all_dataframes or not GEMINI_API_KEY:
             if not client:  # Fix: Remove reference to llm_model_instance
                 logging.error("Cannot initialize agent: GenAI client not available/configured.")
                 return False
             # Verify PandasAI agent is ready
             pandas_ready = self.pandas_agent is not None
                 self._initialize_pandas_agent()
                 pandas_ready = self.pandas_agent is not None
             return self.is_ready
         except Exception as e:
         try:
             system_prompt = self._build_system_prompt()
             data_summary = self._get_dataframes_summary()
             # Build enhanced prompt based on query type and available results
             if query_type == "data" and pandas_result:
                 {pandas_result}
                 ## Additional Knowledge Context:
                 ## User Query:
                 {query}
                 {data_summary}
                 ## Knowledge Base Context:
                 ## User Query:
                 {query}
         # Reinitialize PandasAI agent with new data
         self._initialize_pandas_agent()
     def clear_chat_history(self):
         """Clears the agent's internal chat history"""
             "has_api_key": bool(GEMINI_API_KEY),
             "genai_available": GENAI_AVAILABLE,
             "client_type": "genai.Client" if client else "None",  # Fix: Remove reference to llm_model_instance
             "pandas_agent_ready": self.pandas_agent is not None,
             "num_dataframes": len(self.all_dataframes),
             "dataframe_keys": list(self.all_dataframes.keys()),
             "llm_model_name": self.llm_model_name,
             "chat_history_length": len(self.chat_history),
             "charts_save_path_pandasai": pai.config.save_charts_path if pai.config.llm else "PandasAI not configured"
         }
         return suggestions[:10]  # Limit to top 10 suggestions
 # --- Helper Functions for External Integration ---
+def create_agent_instance(dataframes: Optional[Dict[str, pd.DataFrame]] = None) -> EmployerBrandingAgent:
     """Factory function to create a new agent instance"""
     logging.info("Creating new EnhancedEmployerBrandingAgent instance via helper function.")
+    return EmployerBrandingAgent(all_dataframes=dataframes)
 async def initialize_agent_async(agent: EmployerBrandingAgent) -> bool:
     """Async helper to initialize an agent instance"""