# GAIA_Agent/agents/figure_interpretation_agent.py
# Agent that interprets figures (charts, graphs, diagrams, tables) found in
# image files using a multi-modal LLM.
import os
import logging
from llama_index.core.agent.workflow import ReActAgent
from llama_index.core.schema import ImageDocument
from llama_index.core.tools import FunctionTool
from llama_index.llms.google_genai import GoogleGenAI
# Setup logging
logger = logging.getLogger(__name__)
# Helper function to load prompt from file
def load_prompt_from_file(filename: str, default_prompt: str) -> str:
    """Load a prompt from a text file, falling back to a default on failure.

    Args:
        filename (str): Prompt file path, resolved relative to this script's
            directory (an absolute path is honored as-is by os.path.join).
        default_prompt (str): Fallback prompt returned when the file cannot
            be read for any reason.

    Returns:
        str: The file's full contents, or default_prompt on any failure.
    """
    # Resolve before the try so both except branches can log the path.
    prompt_path = os.path.join(os.path.dirname(__file__), filename)
    try:
        # Explicit encoding avoids platform-dependent default codecs.
        with open(prompt_path, "r", encoding="utf-8") as f:
            prompt = f.read()
        logger.info(f"Successfully loaded prompt from {prompt_path}")
        return prompt
    except FileNotFoundError:
        # Fix: log the actual filename instead of the "(unknown)" placeholder.
        logger.warning(f"Prompt file {filename} not found at {prompt_path}. Using default.")
        return default_prompt
    except Exception as e:
        logger.error(f"Error loading prompt file {filename}: {e}", exc_info=True)
        return default_prompt
# --- Core Figure Interpretation Logic (using Multi-Modal LLM) ---
def interpret_figure_with_llm(image_path: str, request: str) -> str:
    """Interprets a figure in an image based on a specific request using a multi-modal LLM.

    Args:
        image_path (str): Path to the image file containing the figure.
        request (str): The specific question or interpretation task (e.g., "Describe this chart",
            "Extract sales for Q3", "Identify the main trend").

    Returns:
        str: The interpretation text on success; otherwise a string starting
        with "Error:" describing the failure (missing file, missing API key,
        unsupported model, load failure). Errors are returned rather than
        raised so tool callers always receive a string.
    """
    logger.info(f"Interpreting figure in image: {image_path} with request: {request}")
    # Fail fast if the image is missing — avoids a pointless LLM round-trip.
    if not os.path.exists(image_path):
        logger.error(f"Image file not found: {image_path}")
        return f"Error: Image file not found at {image_path}"
    # LLM configuration (must be a multi-modal model).
    # Ensure the selected model supports image input (e.g., gemini-1.5-pro).
    llm_model_name = os.getenv("FIGURE_INTERPRETATION_LLM_MODEL", "models/gemini-1.5-pro")
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        logger.error("GEMINI_API_KEY not found for figure interpretation LLM.")
        return "Error: GEMINI_API_KEY not set."
    try:
        # Initialize the multi-modal LLM.
        llm = GoogleGenAI(api_key=gemini_api_key, model=llm_model_name)
        logger.info(f"Using figure interpretation LLM: {llm_model_name}")
        # The prompt constrains the model to the visual content only, then
        # relays the caller's specific request verbatim.
        prompt = (
            f"You are an expert figure interpreter. Analyze the provided image containing a chart, graph, diagram, or table. "
            f"Focus *only* on the visual information present in the image. "
            f"Fulfill the following request accurately and concisely:\n\n"
            f"REQUEST: {request}\n\n"
            f"Based *only* on the image, provide the answer:"
        )
        # NOTE(review): the exact multi-modal completion API varies between
        # LlamaIndex versions (ImageBlock vs. ImageDocument); this path assumes
        # llm.complete accepts image_documents — confirm against the pinned
        # llama_index version.
        # Local import keeps the reader dependency off the module import path.
        from llama_index.core import SimpleDirectoryReader # Example import
        # Load the image as a document; SimpleDirectoryReader is expected to
        # yield an ImageDocument for image files.
        reader = SimpleDirectoryReader(input_files=[image_path])
        image_documents = reader.load_data()
        if not image_documents or not isinstance(image_documents[0], ImageDocument):
            logger.error(f"Failed to load image as ImageDocument: {image_path}")
            return f"Error: Could not load image file {image_path} for analysis."
        # Make the multi-modal completion call.
        response = llm.complete(
            prompt=prompt,
            image_documents=image_documents # Pass the loaded image document(s)
        )
        interpretation = response.text.strip()
        logger.info("Figure interpretation successful.")
        return interpretation
    except FileNotFoundError:
        # Redundant with the upfront os.path.exists check, but guards against
        # the file disappearing between the check and the read.
        logger.error(f"Image file not found during LLM call: {image_path}")
        return f"Error: Image file not found at {image_path}"
    except ImportError as ie:
        # Raised by the local SimpleDirectoryReader import or its dependencies.
        logger.error(f"Missing library for multi-modal processing: {ie}")
        return f"Error: Missing required library for image processing ({ie})."
    except Exception as e:
        # Catch potential API errors or other issues.
        logger.error(f"LLM call failed during figure interpretation: {e}", exc_info=True)
        # Heuristic string match: surface a clearer message when the error
        # indicates the configured model lacks image support.
        if "does not support image input" in str(e).lower():
            logger.error(f"The configured model {llm_model_name} does not support image input.")
            return f"Error: The configured LLM ({llm_model_name}) does not support image input. Please configure a multi-modal model."
        return f"Error during figure interpretation: {e}"
# --- Tool Definitions (Wrapping the core logic) ---
# These tools essentially pass the request to the core LLM function.
def describe_figure_tool_fn(image_path: str) -> str:
    """Give a general description of the figure in the image (type, elements, topic)."""
    request = (
        "Describe this figure, including its type, main elements "
        "(axes, labels, legend), and overall topic."
    )
    return interpret_figure_with_llm(image_path, request)
def extract_data_points_tool_fn(image_path: str, data_request: str) -> str:
    """Pull specific data points or values out of the figure in the image."""
    request = (
        f"Extract the following data points/values from the figure: {data_request}. "
        "If exact values are not clear, provide the closest estimate based on the visual."
    )
    return interpret_figure_with_llm(image_path, request)
def identify_trends_tool_fn(image_path: str) -> str:
    """Identify and describe trends or patterns visible in the figure."""
    request = "Analyze and describe the main trends or patterns shown in this figure."
    return interpret_figure_with_llm(image_path, request)
def compare_elements_tool_fn(image_path: str, comparison_request: str) -> str:
    """Compare distinct elements within the figure in the image."""
    request = (
        f"Compare the following elements within the figure: {comparison_request}. "
        "Be specific about the comparison based on the visual data."
    )
    return interpret_figure_with_llm(image_path, request)
def summarize_figure_insights_tool_fn(image_path: str) -> str:
    """Summarize the key insights or main message conveyed by the figure."""
    request = "Summarize the key insights or the main message conveyed by this figure."
    return interpret_figure_with_llm(image_path, request)
# --- Tool Definitions for Agent ---
# Each FunctionTool wraps one *_tool_fn helper above so the ReAct agent can
# invoke it by name; the description strings tell the agent's LLM what inputs
# each tool expects.
describe_figure_tool = FunctionTool.from_defaults(
    fn=describe_figure_tool_fn,
    name="describe_figure",
    description="Provides a general description of the figure in the image (type, elements, topic). Input: image_path (str)."
)
extract_data_points_tool = FunctionTool.from_defaults(
    fn=extract_data_points_tool_fn,
    name="extract_data_points",
    description="Extracts specific data points/values from the figure. Input: image_path (str), data_request (str)."
)
identify_trends_tool = FunctionTool.from_defaults(
    fn=identify_trends_tool_fn,
    name="identify_trends",
    description="Identifies and describes trends/patterns in the figure. Input: image_path (str)."
)
compare_elements_tool = FunctionTool.from_defaults(
    fn=compare_elements_tool_fn,
    name="compare_elements",
    description="Compares different elements within the figure. Input: image_path (str), comparison_request (str)."
)
summarize_figure_insights_tool = FunctionTool.from_defaults(
    fn=summarize_figure_insights_tool_fn,
    name="summarize_figure_insights",
    description="Summarizes the key insights/main message of the figure. Input: image_path (str)."
)
# --- Agent Initialization ---
def initialize_figure_interpretation_agent() -> ReActAgent:
    """Initializes the Figure Interpretation Agent.

    Builds a ReActAgent wired with the five figure-analysis tools defined in
    this module and a system prompt loaded from ../prompts/.

    Returns:
        ReActAgent: Agent named "figure_interpretation_agent".

    Raises:
        ValueError: If GEMINI_API_KEY is not set.
        Exception: Any error from LLM or agent construction is logged and
            re-raised.
    """
    logger.info("Initializing FigureInterpretationAgent...")
    # Configuration for the agent's main LLM (can be the same multi-modal one).
    agent_llm_model = os.getenv("FIGURE_INTERPRETATION_AGENT_LLM_MODEL", "models/gemini-1.5-pro")
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        logger.error("GEMINI_API_KEY not found for FigureInterpretationAgent.")
        raise ValueError("GEMINI_API_KEY must be set for FigureInterpretationAgent")
    try:
        # The agent's own LLM doesn't need to be multi-modal: the tools make
        # the multi-modal calls. A multi-modal model here would merely allow
        # more direct interaction patterns later.
        llm = GoogleGenAI(api_key=gemini_api_key, model=agent_llm_model)
        logger.info(f"Using agent LLM: {agent_llm_model}")
        # Load system prompt from file, falling back to this placeholder.
        default_system_prompt = ("You are FigureInterpretationAgent... [Default prompt content - replace with actual]" # Placeholder
        )
        system_prompt = load_prompt_from_file("../prompts/figure_interpretation_agent_prompt.txt", default_system_prompt)
        if system_prompt == default_system_prompt:
            logger.warning("Using default/fallback system prompt for FigureInterpretationAgent.")
        # Tools exposed to the ReAct loop (all defined at module level above).
        tools = [
            describe_figure_tool,
            extract_data_points_tool,
            identify_trends_tool,
            compare_elements_tool,
            summarize_figure_insights_tool
        ]
        # Agents this one is allowed to hand control off to.
        valid_handoffs = [
            "planner_agent", # To return results
            "research_agent", # If context from figure needs further research
            "reasoning_agent" # If interpretation needs logical analysis
        ]
        agent = ReActAgent(
            name="figure_interpretation_agent",
            description=(
                "Analyzes and interprets visual data representations (charts, graphs, tables) from image files. "
                "Can describe figures, extract data, identify trends, compare elements, and summarize insights."
            ),
            tools=tools,
            llm=llm,
            system_prompt=system_prompt,
            can_handoff_to=valid_handoffs,
            # Note: This agent inherently requires multi-modal input capabilities,
            # which are handled within its tools via a multi-modal LLM.
        )
        logger.info("FigureInterpretationAgent initialized successfully.")
        return agent
    except Exception as e:
        logger.error(f"Error during FigureInterpretationAgent initialization: {e}", exc_info=True)
        raise
# Example usage (for testing if run directly)
# Manual smoke test: draws a tiny bar chart with Pillow and runs the tools
# against it (each tool performs a live Gemini call, so GEMINI_API_KEY must
# be set).
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.info("Running figure_interpretation_agent.py directly for testing...")
    # Check required keys before doing any work.
    required_keys = ["GEMINI_API_KEY"]
    missing_keys = [key for key in required_keys if not os.getenv(key)]
    if missing_keys:
        print(f"Error: Required environment variable(s) not set: {', '.join(missing_keys)}. Cannot run test.")
    else:
        # Heuristic only: model names containing "pro" or "vision" are assumed
        # to accept image input; others get a warning but are still attempted.
        model_name = os.getenv("FIGURE_INTERPRETATION_LLM_MODEL", "models/gemini-1.5-pro")
        if "pro" not in model_name.lower() and "vision" not in model_name.lower():
            print(f"Warning: Configured LLM {model_name} might not support image input. Tests may fail.")
        # Create a dummy image file for testing (requires Pillow).
        dummy_image_path = "dummy_figure.png"
        try:
            from PIL import Image, ImageDraw, ImageFont
            # Draw a minimal two-bar chart so the LLM has something concrete
            # to describe/compare.
            img = Image.new('RGB', (400, 200), color = (255, 255, 255))
            d = ImageDraw.Draw(img)
            # Try to load a common TrueType font; fall back to PIL's default.
            try:
                font = ImageFont.truetype("arial.ttf", 15) # Common font, might not exist
            except IOError:
                font = ImageFont.load_default()
                print("Arial font not found, using default PIL font.")
            d.text((10,10), "Simple Bar Chart", fill=(0,0,0), font=font)
            d.rectangle([50, 50, 100, 150], fill=(255,0,0)) # Bar 1 ("A", taller)
            d.text((60, 160), "A", fill=(0,0,0), font=font)
            d.rectangle([150, 80, 200, 150], fill=(0,0,255)) # Bar 2 ("B", shorter)
            d.text((160, 160), "B", fill=(0,0,0), font=font)
            img.save(dummy_image_path)
            print(f"Created dummy image file: {dummy_image_path}")
            # Exercise the tool functions directly (live LLM calls).
            print("\nTesting describe_figure...")
            desc = describe_figure_tool_fn(dummy_image_path)
            print(f"Description: {desc}")
            print("\nTesting extract_data_points (qualitative)...")
            extract_req = "Height of bar A vs Bar B" # Qualitative request
            extract_res = extract_data_points_tool_fn(dummy_image_path, extract_req)
            print(f"Extraction Result: {extract_res}")
            print("\nTesting compare_elements...")
            compare_req = "Compare bar A and bar B"
            compare_res = compare_elements_tool_fn(dummy_image_path, compare_req)
            print(f"Comparison Result: {compare_res}")
            # Clean up dummy image on the success path.
            os.remove(dummy_image_path)
        except ImportError:
            print("Pillow library not installed. Skipping direct tool tests that require image creation.")
            # Without Pillow we can still verify that the agent constructs.
            try:
                test_agent = initialize_figure_interpretation_agent()
                print("\nFigure Interpretation Agent initialized successfully (tool tests skipped).")
            except Exception as e:
                print(f"Error initializing agent: {e}")
        except Exception as e:
            print(f"Error during testing: {e}")
            if os.path.exists(dummy_image_path):
                os.remove(dummy_image_path) # Ensure cleanup on error