import os
import re
import asyncio
from typing import Optional

from tavily import AsyncTavilyClient

from llama_index.core.tools import FunctionTool
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
from llama_index.tools.wikipedia import WikipediaToolSpec
from langfuse.llama_index import LlamaIndexInstrumentor
from llama_index.llms.ollama import Ollama
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.core.agent.workflow import (
    AgentOutput,
    AgentWorkflow,
    FunctionAgent,
    ToolCall,
    ToolCallResult,
)

from multimodality_tools import (
    get_image_qa_tool,
    get_transcription_tool,
    get_excel_analysis_tool,
    get_excel_tool,
    get_csv_analysis_tool,
    get_csv_tool,
    _get_file,
    get_read_file_tool,
)
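

# BasicAgent wires up a multi-agent LlamaIndex workflow: a MainAgent delegates to
# Wikipedia, web-search, audio, image, and spreadsheet specialists and compiles the
# result into a "FINAL ANSWER: ..." response.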
class BasicAgent:
    def __init__(self, ollama: bool = False, langfuse: bool = False):
        # Pick the LLM backend: Gemini over the API by default, or a local Ollama model.
        if not ollama:
            llm = GoogleGenAI(model="gemini-2.0-flash", api_key=os.getenv("GEMINI_API_KEY"))
        else:
            llm = Ollama(model="mistral:latest", request_timeout=120.0)

        # Optional Langfuse tracing of the LlamaIndex workflow.
        self.langfuse = langfuse
        if self.langfuse:
            self.instrumentor = LlamaIndexInstrumentor()
            self.instrumentor.start()

        main_agent = FunctionAgent(
            name="MainAgent",
            description=(
                "Can organize and delegate work to different agents and can compile "
                "a final answer to a question from other agents' outputs."
            ),
            system_prompt=(
                "You are a general AI assistant. I will ask you a question. "
                "Report your thoughts, delegate work to other agents if necessary, and "
                "finish your answer with the following template: "
                "FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number "
                "OR as few words as possible OR a comma separated list of numbers and/or "
                "strings. If you are asked for a number, don't use commas to write your "
                "number nor units such as $ or percent signs unless specified otherwise. "
                "If you are asked for a string, don't use articles or abbreviations (e.g. "
                "for cities), and write digits in plain text unless specified otherwise. If "
                "you are asked for a comma separated list, apply the above rules depending on "
                "whether the element to be put in the list is a number or a string."
            ),
            llm=llm,
            tools=[get_read_file_tool()],
            can_handoff_to=["WikiAgent", "WebAgent", "StatsAgent", "AudioAgent", "ImageAgent"],
        )

        wiki_spec = WikipediaToolSpec()
        # to_tool_list() exposes [load_data, search_data]; index 1 should be the search tool.
        wiki_search_tool = wiki_spec.to_tool_list()[1]

        wiki_agent = FunctionAgent(
            name="WikiAgent",
            description="Agent that can access Wikipedia to answer a question. Try using this agent if the WebAgent does not find an answer to a question.",
            system_prompt=(
                "You are a Wikipedia agent that can search Wikipedia for information and extract the relevant information to answer a question. "
                "You only give concise answers and if you don't find an answer to the given query on Wikipedia, "
                "you communicate this clearly. Always hand off your answer to MainAgent."
            ),
            llm=llm,
            tools=[wiki_search_tool],
            can_handoff_to=["MainAgent"],
        )

        # DuckDuckGo search tool (defined here but not attached to any agent; WebAgent uses Tavily below).
        tool_spec = DuckDuckGoSearchToolSpec()
        search_tool = FunctionTool.from_defaults(tool_spec.duckduckgo_full_search)

        async def search_web(query: str) -> str:
            """Searches the web via Tavily to answer questions."""
            client = AsyncTavilyClient(api_key=os.getenv("TAVILY"))
            return str(await client.search(query))

        web_search_agent = FunctionAgent(
            name="WebAgent",
            description="Uses the web to answer a question.",
            system_prompt=(
                "You are a Web agent that can search the Web and extract the relevant information to answer a question. "
                "You only give concise answers and if you don't find an answer to the given query with your tool, "
                "you communicate this clearly. Always hand off your answer to MainAgent."
            ),
            llm=llm,
            tools=[search_web],
            can_handoff_to=["MainAgent"],
        )

        audio_agent = FunctionAgent(
            name="AudioAgent",
            description="Uses transcription tools to analyze audio files. This agent needs a file id and an optional question as input.",
            system_prompt=(
                "You are an audio agent that can transcribe an audio file identified by its id and answer questions about the transcript. "
                "You only give concise answers and if you cannot answer the given query using your tool, "
                "you communicate this clearly. Always hand off your answer to MainAgent."
            ),
            llm=llm,
            tools=[get_transcription_tool()],
            can_handoff_to=["MainAgent"],
        )

        image_agent = FunctionAgent(
            name="ImageAgent",
            description="Can respond to questions involving image understanding. This agent needs a file id and a question as an input.",
            system_prompt=(
                "You are an agent that can read images from a file identified by its id and answer questions about it. "
                "Give concise answers and only include the relevant information in your response. "
                "If you cannot answer the given query using your tool, you communicate this clearly. "
                "Always hand off your answer to MainAgent."
            ),
            llm=llm,
            tools=[get_image_qa_tool()],
            can_handoff_to=["MainAgent"],
        )

        stats_agent = FunctionAgent(
            name="StatsAgent",
            description="Uses statistical tools to read and analyse Excel and CSV files. This agent needs a file id and an optional question as an input.",
            system_prompt=(
                "You are an agent that can read Excel and CSV files and run simple statistical analysis on them. "
                "You can use this information or the loaded file to answer questions about it. "
                "You only give concise answers and if you cannot answer the given query using your tool, "
                "you communicate this clearly. Always hand off your answer to MainAgent."
            ),
            llm=llm,
            tools=[get_csv_analysis_tool(), get_csv_tool(),
                   get_excel_analysis_tool(), get_excel_tool()],
            can_handoff_to=["MainAgent"],
        )

        # MainAgent is the workflow's entry point and coordinates the specialist agents.
        self.agent = AgentWorkflow(
            agents=[main_agent, wiki_agent, web_search_agent,
                    audio_agent, image_agent, stats_agent],
            root_agent=main_agent.name,
        )

    async def __call__(self, question: str, task_id: Optional[str] = None) -> str:
        # If the task has an associated file, tell the agents how to load it by id.
        file_str = ""
        if file_exists(task_id):
            file_str = f'\nIf you need to load a file, do so by providing the id "{task_id}".'

        final_answer = (
            "Remember to always use the template 'FINAL ANSWER: [YOUR FINAL ANSWER]' for your final output. "
            "Always use as few words as possible for your final answer."
        )

        msg = f"{question}{file_str}\n{final_answer}"

        # Run the workflow and stream intermediate events for visibility.
        handler = self.agent.run(user_msg=msg)

        current_agent = None
        current_tool_calls = ""
        async for event in handler.stream_events():
            if (
                hasattr(event, "current_agent_name")
                and event.current_agent_name != current_agent
            ):
                current_agent = event.current_agent_name
                print(f"\n{'='*50}")
                print(f"🤖 Agent: {current_agent}")
                print(f"{'='*50}\n")
            elif isinstance(event, AgentOutput):
                if event.response.content:
                    print("📤 Output:", event.response.content)
                if event.tool_calls:
                    print(
                        "🛠️ Planning to use tools:",
                        [call.tool_name for call in event.tool_calls],
                    )
            elif isinstance(event, ToolCallResult):
                print(f"🔧 Tool Result ({event.tool_name}):")
                print(f"  Arguments: {event.tool_kwargs}")
                print(f"  Output: {event.tool_output}")
            elif isinstance(event, ToolCall):
                print(f"🔨 Calling Tool: {event.tool_name}")
                print(f"  With arguments: {event.tool_kwargs}")

        # Brief pause before wrapping up (presumably to stay under provider rate limits).
        await asyncio.sleep(4.1)

        if self.langfuse:
            self.instrumentor.flush()

        try:
            res = await handler
            res = res.response.content
            # Keep only the text after the "FINAL ANSWER:" marker.
            res = re.sub(r'^.*?FINAL ANSWER:', '', res, flags=re.DOTALL).strip()
            return res
        except Exception:
            return "Error occurred. No valid agent response could be determined."


def file_exists(task_id: Optional[str]) -> bool:
    """Returns True if a file can be retrieved for the given task id."""
    try:
        file = _get_file(task_id)
    except Exception:
        return False
    del file
    return True
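

# Minimal usage sketch: assumes GEMINI_API_KEY (and optionally TAVILY) are set in the
# environment and that multimodality_tools is importable; the question is a placeholder.
if __name__ == "__main__":
    agent = BasicAgent()
    answer = asyncio.run(agent("What is the capital of France?"))
    print(answer)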