# GAIA_Agent/agents/image_analyzer_agent.py
import os
import logging
from llama_index.core.agent.workflow import FunctionAgent
from llama_index.llms.google_genai import GoogleGenAI
# Setup logging
logger = logging.getLogger(__name__)


# Helper function to load the system prompt from a file
def load_prompt_from_file(filename="../prompts/image_analyzer_prompt.txt") -> str:
    """Loads the system prompt from a text file."""
    try:
        # Resolve the prompt path relative to this script's directory
        # (the default points to the sibling prompts/ directory).
        script_dir = os.path.dirname(__file__)
        prompt_path = os.path.join(script_dir, filename)
        with open(prompt_path, "r") as f:
            prompt = f.read()
        logger.info(f"Successfully loaded system prompt from {prompt_path}")
        return prompt
    except FileNotFoundError:
        logger.error(f"Prompt file {filename} not found at {prompt_path}. Using fallback prompt.")
        # Fallback minimal prompt
        return "You are an image analyzer. Describe the image factually."
    except Exception as e:
        logger.error(f"Error loading prompt file {filename}: {e}", exc_info=True)
        return "You are an image analyzer. Describe the image factually."


def initialize_image_analyzer_agent() -> FunctionAgent:
    """
    Create an agent that orchestrates image analysis.

    Uses the configured Gemini model's multimodal capabilities directly, without
    explicit tools. Configuration and the system prompt are loaded from
    environment variables and a prompt file, respectively.
    """
logger.info("Initializing ImageAnalyzerAgent...")
# Configuration from environment variables
llm_model_name = os.getenv("IMAGE_ANALYZER_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
gemini_api_key = os.getenv("GEMINI_API_KEY")
if not gemini_api_key:
logger.error("GEMINI_API_KEY not found in environment variables.")
raise ValueError("GEMINI_API_KEY must be set")
try:
llm = GoogleGenAI(
api_key=gemini_api_key,
model=llm_model_name,
)
logger.info(f"Using LLM model: {llm_model_name}")
# Load system prompt from file
system_prompt = load_prompt_from_file()

        # Note: this is a FunctionAgent, but it defines no tools. It relies on the
        # configured multimodal Gemini model to follow the system prompt and analyze
        # any image passed in the ChatMessage content blocks.
        agent = FunctionAgent(
            name="image_analyzer_agent",
            description=(
                "ImageAnalyzerAgent inspects image files using its multimodal capabilities, "
                "interpreting the visual content according to a detailed factual analysis prompt."
            ),
            llm=llm,
            system_prompt=system_prompt,
            # No explicit tools needed if relying on direct multimodal LLM call
            # tools=[],
            can_handoff_to=["planner_agent", "research_agent", "reasoning_agent", "figure_interpretation_agent"],
        )
        logger.info("ImageAnalyzerAgent initialized successfully.")
        return agent
    except Exception as e:
        logger.error(f"Error during ImageAnalyzerAgent initialization: {e}", exc_info=True)
        raise
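

# The can_handoff_to list above implies this agent is meant to run inside a
# multi-agent workflow. The helper below is a minimal, hedged sketch (not part of
# the original source) of how the agent could be wrapped in llama_index's
# AgentWorkflow; the other agents it can hand off to would be appended to the
# `agents` list by their own (hypothetical) initializers.
def build_image_analysis_workflow():
    """Sketch: wrap the image analyzer in a single-agent AgentWorkflow."""
    from llama_index.core.agent.workflow import AgentWorkflow

    image_agent = initialize_image_analyzer_agent()
    # With only one agent registered, it also serves as the workflow's root agent.
    return AgentWorkflow(agents=[image_agent], root_agent=image_agent.name)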


# Example usage (for testing if run directly)
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.info("Running image_analyzer_agent.py directly for testing...")

    # Ensure the API key is set before testing
    if not os.getenv("GEMINI_API_KEY"):
        print("Error: GEMINI_API_KEY environment variable not set. Cannot run test.")
    else:
        try:
            test_agent = initialize_image_analyzer_agent()
            print("Image Analyzer Agent initialized successfully for testing.")
            # To test further, construct a ChatMessage containing an ImageBlock
            # and run it through the agent, e.g. via the sketch below.
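            # Hedged sketch (not in the original source): build such a message and
            # call the agent's underlying LLM directly, exercising the same multimodal
            # path the agent relies on. Assumes the installed llama_index version
            # exposes ImageBlock/TextBlock content blocks and that the hypothetical
            # sample file below exists locally.
            from llama_index.core.llms import ChatMessage, ImageBlock, TextBlock

            sample_image_path = "test_image.png"  # hypothetical sample image
            if os.path.exists(sample_image_path):
                message = ChatMessage(
                    role="user",
                    blocks=[
                        ImageBlock(path=sample_image_path),
                        TextBlock(text="Describe this image factually."),
                    ],
                )
                response = test_agent.llm.chat([message])
                print(f"Sample analysis:\n{response}")
            else:
                print(f"No sample image found at {sample_image_path}; skipping multimodal call.")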
        except Exception as e:
            print(f"Error during testing: {e}")