# Spaces: Aedelon / GAIA_Agent (status: Sleeping)
# File size: 3,969 Bytes

import os
import logging

from llama_index.core.agent.workflow import FunctionAgent
from llama_index.llms.google_genai import GoogleGenAI

# Module-level logger named after this module; handlers and levels are left to
# whoever configures logging (see the __main__ section for the direct-run setup).
logger = logging.getLogger(__name__)

# Helper function to load prompt from file
def load_prompt_from_file(filename="../prompts/image_analyzer_prompt.txt") -> str:
    """Load the agent's system prompt from a text file.

    ``filename`` is resolved relative to the directory containing this script;
    the default therefore points into a ``prompts`` directory one level above it.

    Args:
        filename: Path of the prompt file, relative to this script's directory.

    Returns:
        The full text of the prompt file. If the file is missing or cannot be
        read, the failure is logged and a minimal built-in fallback prompt is
        returned instead, so callers always receive a usable prompt.
    """
    # Single definition of the fallback so both failure paths stay in sync.
    fallback_prompt = "You are an image analyzer. Describe the image factually."
    try:
        script_dir = os.path.dirname(__file__)
        prompt_path = os.path.join(script_dir, filename)
        # Decode explicitly as UTF-8: without it open() uses the platform's
        # locale encoding, which can mis-decode or reject non-ASCII prompt text.
        with open(prompt_path, "r", encoding="utf-8") as f:
            prompt = f.read()
        logger.info(f"Successfully loaded system prompt from {prompt_path}")
        return prompt
    except FileNotFoundError:
        # prompt_path is always bound here: only open() can raise this error.
        logger.error(f"Prompt file {filename} not found at {prompt_path}. Using fallback prompt.")
        return fallback_prompt
    except Exception as e:
        # Deliberate best-effort: any other failure (permissions, decoding, ...)
        # is logged with its traceback and degrades to the fallback prompt.
        logger.error(f"Error loading prompt file {filename}: {e}", exc_info=True)
        return fallback_prompt

def initialize_image_analyzer_agent() -> FunctionAgent:
    """
    Build and return the FunctionAgent used for image analysis.

    The model name is read from IMAGE_ANALYZER_LLM_MODEL (with a built-in
    default) and the API key from GEMINI_API_KEY. The system prompt comes from
    load_prompt_from_file(). No tools are passed to the agent: it is configured
    only with the LLM, the system prompt and its hand-off targets.

    Raises:
        ValueError: if GEMINI_API_KEY is unset or empty.
        Exception: any error raised while constructing the LLM or the agent is
            logged with its traceback and re-raised unchanged.
    """
    logger.info("Initializing ImageAnalyzerAgent...")

    # Environment-driven configuration
    api_key = os.environ.get("GEMINI_API_KEY")
    model_name = os.environ.get("IMAGE_ANALYZER_LLM_MODEL", "gemini-2.5-pro-preview-03-25")

    # Fail fast: nothing below can work without a key
    if not api_key:
        logger.error("GEMINI_API_KEY not found in environment variables.")
        raise ValueError("GEMINI_API_KEY must be set")

    agent_description = (
        "ImageAnalyzerAgent inspects image files using its multimodal capabilities, "
        "interpreting the visual content according to a detailed factual analysis prompt."
    )
    handoff_targets = ["planner_agent", "research_agent", "reasoning_agent", "figure_interpretation_agent"]

    try:
        gemini_llm = GoogleGenAI(api_key=api_key, model=model_name)
        logger.info(f"Using LLM model: {model_name}")

        # The agent is created without a tools argument; image handling is left
        # to the configured LLM guided by the system prompt.
        image_agent = FunctionAgent(
            name="image_analyzer_agent",
            description=agent_description,
            llm=gemini_llm,
            system_prompt=load_prompt_from_file(),
            can_handoff_to=handoff_targets,
        )
        logger.info("ImageAnalyzerAgent initialized successfully.")
    except Exception as err:
        logger.error(f"Error during ImageAnalyzerAgent initialization: {err}", exc_info=True)
        raise

    return image_agent

# Manual smoke test: executed only when this file is run as a script
if __name__ == "__main__":
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )
    logger.info("Running image_analyzer_agent.py directly for testing...")

    # The agent cannot be built without an API key, so check for it up front
    if os.getenv("GEMINI_API_KEY"):
        try:
            test_agent = initialize_image_analyzer_agent()
            print("Image Analyzer Agent initialized successfully for testing.")
            # A fuller test would build a ChatMessage containing an ImageBlock
            # and send it to the agent.
        except Exception as e:
            print(f"Error during testing: {e}")
    else:
        print("Error: GEMINI_API_KEY environment variable not set. Cannot run test.")