Final_Assignment_Agent

Paused

App Files Files Community

Martin Bär committed on May 11

Commit

dbb14b6

1 Parent(s): 8ea3490

Change agent into WorkFlow with sub-agents and use Google Gemini

Browse files

Files changed (4) hide show

app.py +0 -7
basic_agent.py +150 -27
multimodality_tools.py +8 -1
requirements.txt +3 -3

app.py CHANGED Viewed

@@ -12,12 +12,6 @@ from basic_agent import BasicAgent
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-# For Llamaindex's LoadAndSearchTool
-Settings.llm = None # disable LLM for Index Retrieval
-Settings.chunk_size = 512 # Smaller chunk size for retrieval
-Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
@@ -139,7 +133,6 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         return status_message, results_df
 async def handle_agent_input(user_input):
-    # TODO initialize agent at a different place
     agent = BasicAgent()
     response = await agent(user_input)
     return response

 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
         return status_message, results_df
 async def handle_agent_input(user_input):
     """Answer one user prompt with a newly constructed BasicAgent.

     A fresh agent is built on every call; the awaited result of calling
     it with ``user_input`` is returned unchanged.
     """
     return await BasicAgent()(user_input)

basic_agent.py CHANGED Viewed

@@ -1,10 +1,20 @@
 from llama_index.core.tools import FunctionTool
 from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
 from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
 from llama_index.tools.wikipedia import WikipediaToolSpec
 from langfuse.llama_index import LlamaIndexInstrumentor
 from llama_index.llms.ollama import Ollama
-from llama_index.core.agent.workflow import FunctionAgent, ReActAgent
 from multimodality_tools import get_image_qa_tool, get_transcription_tool, \
     get_excel_analysis_tool, get_excel_tool, get_csv_analysis_tool, get_csv_tool
@@ -12,7 +22,8 @@ from multimodality_tools import get_image_qa_tool, get_transcription_tool, \
 class BasicAgent:
     def __init__(self, ollama=False, langfuse=False):
         if not ollama:
-            llm = HuggingFaceInferenceAPI(model_name="Qwen/Qwen3-32B") #"Qwen/Qwen2.5-Coder-32B-Instruct")
         else:
             llm = Ollama(model="mistral:latest", request_timeout=120.0)
@@ -22,29 +33,15 @@ class BasicAgent:
             self.instrumentor = LlamaIndexInstrumentor()
             self.instrumentor.start()
-        # Initialize tools
-        tool_spec = DuckDuckGoSearchToolSpec()
-        search_tool = FunctionTool.from_defaults(tool_spec.duckduckgo_full_search)
-        # Convert into a LoadAndSearchToolSpec because the wikipedia search tool returns
-        # entire Wikipedia pages and this can pollute the context window of the LLM
-        wiki_spec = WikipediaToolSpec()
-        wiki_search_tool = wiki_spec.to_tool_list()[1]
-        # Convert into a LoadAndSearchToolSpec because the wikipedia search tool returns
-        # entire Wikipedia pages and this can pollute the context window of the LLM
-        # TODO this does not work so well. We need to make the retriever return the top 5 chunks or sth.
-        # wiki_search_tool_las = LoadAndSearchToolSpec.from_defaults(wiki_search_tool).to_tool_list()
-        self.agent = ReActAgent(
-            tools=[search_tool, wiki_search_tool, get_image_qa_tool(),
-                   get_transcription_tool(), get_excel_analysis_tool(), get_excel_tool(),
-                   get_csv_analysis_tool(), get_csv_tool()],
-            llm=llm,
-            verbose=True,
-            system_prompt = (
                 "You are a general AI assistant. I will ask you a question. "
-                "Report your thoughts, and finish your answer with the following template: "
                 "FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number "
                 "OR as few words as possible OR a comma separated list of numbers and/or "
                 "strings. If you are asked for a number, don't use comma to write your "
@@ -53,19 +50,145 @@ class BasicAgent:
                 "for cities), and write the digits in plain text unless specified otherwise. If "
                 "you are asked for a comma separated list, apply the above rules depending of "
                 "whether the element to be put in the list is a number or a string."
-            )
         )
-        # self.ctx = Context(self.agent)
     async def __call__(self, question: str, task_id: str = None) -> str:
         file_str = ""
         if task_id:
             file_str = f'\nIf you need to load a file, do so by providing the id "{task_id}".'
-        response = await self.agent.run(user_msg=question + file_str) # ctx=self.ctx)
         if self.langfuse:
             self.instrumentor.flush()
-        return response.response.content.replace("FINAL ANSWER:", "").strip()

+import os
+import re
+from tavily import AsyncTavilyClient
 from llama_index.core.tools import FunctionTool
 from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
 from llama_index.tools.duckduckgo import DuckDuckGoSearchToolSpec
 from llama_index.tools.wikipedia import WikipediaToolSpec
 from langfuse.llama_index import LlamaIndexInstrumentor
 from llama_index.llms.ollama import Ollama
+from llama_index.llms.google_genai import GoogleGenAI
+from llama_index.core.agent.workflow import FunctionAgent, AgentWorkflow
+from llama_index.core.agent.workflow import (
+    AgentOutput,
+    ToolCall,
+    ToolCallResult,
+)
 from multimodality_tools import get_image_qa_tool, get_transcription_tool, \
     get_excel_analysis_tool, get_excel_tool, get_csv_analysis_tool, get_csv_tool
 class BasicAgent:
     def __init__(self, ollama=False, langfuse=False):
         if not ollama:
+            llm = GoogleGenAI(model="gemini-2.0-flash", api_key=os.getenv("GEMINI_API_KEY"))
+            # llm = HuggingFaceInferenceAPI(model_name="Qwen/Qwen3-32B") #"Qwen/Qwen2.5-Coder-32B-Instruct")
         else:
             llm = Ollama(model="mistral:latest", request_timeout=120.0)
             self.instrumentor = LlamaIndexInstrumentor()
             self.instrumentor.start()
+        # Initialize sub-agents
+        main_agent = FunctionAgent(
+            name="MainAgent",
+            description="Can organize and delegate work to different agents and can compile a final answer to a question from other agents' outputs.",
+            system_prompt=(
                 "You are a general AI assistant. I will ask you a question. "
+                "Report your thoughts, delegate work to other agents if necessary, and"
+                "finish your answer with the following template: "
                 "FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number "
                 "OR as few words as possible OR a comma separated list of numbers and/or "
                 "strings. If you are asked for a number, don't use comma to write your "
                 "for cities), and write the digits in plain text unless specified otherwise. If "
                 "you are asked for a comma separated list, apply the above rules depending of "
                 "whether the element to be put in the list is a number or a string."
+            ),
+            llm=llm,
+            tools=[],
+            can_handoff_to=["WikiAgent", "WebAgent", "StatsAgent", "AudioAgent", "ImageAgent"],
         )
+        # Wikipedia tool does not return the tables from the page...
+        wiki_spec = WikipediaToolSpec()
+        wiki_search_tool = wiki_spec.to_tool_list()[1]
+        wiki_agent = FunctionAgent(
+            name="WikiAgent",
+            description="Uses wikipedia to answer a question.",
+            system_prompt=(
+                "You are a Wikipedia agent that can search Wikipedia for information to answer a question. "
+                "You only give concise answers and if you don't find an answer to the given query on Wikipedia, "
+                "you communicate this clearly. Always hand off your answer to MainAgent."
+            ),
+            llm=llm,
+            tools=[wiki_search_tool],
+            can_handoff_to=["MainAgent"],
+        )
+        tool_spec = DuckDuckGoSearchToolSpec()
+        search_tool = FunctionTool.from_defaults(tool_spec.duckduckgo_full_search)
+        # In case DuckDuckGo is not good enough
+        async def search_web(query: str) -> str:
+            """Searches the web to answer questions."""
+            client = AsyncTavilyClient(api_key=os.getenv("TAVILY"))
+            return str(await client.search(query))
+        web_search_agent = FunctionAgent(
+            name="WebAgent",
+            description="Uses the web to answer a question.",
+            system_prompt=(
+                "You are a Web agent that can search the Web for information to answer a question. "
+                "You only give concise answers and if you don't find an answer to the given query with your tool, "
+                "you communicate this clearly. Always hand off your answer to MainAgent."
+            ),
+            llm=llm,
+            tools=[search_web],
+            can_handoff_to=["MainAgent"],
+        )
+        audio_agent = FunctionAgent(
+            name="AudioAgent",
+            description="Uses transcription tools to analyze audio files.",
+            system_prompt=(
+                "You are an audio agent that can transcribe an audio file identified by its id and answer questions about it. "
+                "You only give concise answers and if you cannot answer the given query using your tool, "
+                "you communicate this clearly. Always hand off your answer to MainAgent."
+            ),
+            llm=llm,
+            tools=[get_transcription_tool()],
+            can_handoff_to=["MainAgent"],
+        )
+        image_agent = FunctionAgent(
+            name="ImageAgent",
+            description="Uses image analysis tools to analyze images and respond to questions.",
+            system_prompt=(
+                "You are an agent that can read images from a file identified by its id and answer questions about it. "
+                "You only give concise answers and if you cannot answer the given query using your tool, "
+                "you communicate this clearly. Always hand off your answer to MainAgent."
+            ),
+            llm=llm,
+            tools=[get_image_qa_tool()],
+            can_handoff_to=["MainAgent"],
+        )
+        stats_agent = FunctionAgent(
+            name="ImageAgent",
+            description="Uses statistical tools to read and analyse excel and csv files.",
+            system_prompt=(
+                "You are an agent that can read excel and csv files and run simple statistical analysis on them. "
+                "You can use this information or the loaded file to answer questions about it. "
+                "You only give concise answers and if you cannot answer the given query using your tool, "
+                "you communicate this clearly. Always hand off your answer to MainAgent."
+            ),
+            llm=llm,
+            tools=[get_csv_analysis_tool(), get_csv_tool(),
+                   get_excel_analysis_tool(), get_excel_tool()],
+            can_handoff_to=["MainAgent"],
+        )
+        # Main AgentWorkflow
+        self.agent = AgentWorkflow(
+            agents=[main_agent, wiki_agent, web_search_agent,
+                    audio_agent, image_agent, stats_agent],
+            root_agent=main_agent.name,
+        )
     async def __call__(self, question: str, task_id: str | None = None) -> str:
         """Run the agent workflow on a question and return its final answer.

         Args:
             question: The question text passed to the workflow.
             task_id: Optional id; when given, a hint telling the agents to
                 load a file by this id is appended to the question.

         Returns:
             The stripped response text with everything before the first
             "FINAL ANSWER:" marker removed (the marker itself is kept).
             An empty string is returned when the response has no content.
         """
         file_str = ""
         if task_id:
             file_str = f'\nIf you need to load a file, do so by providing the id "{task_id}".'

         msg = f"{question}{file_str}"

         # Stream events
         handler = self.agent.run(user_msg=msg)

         current_agent = None
         async for event in handler.stream_events():
             # Print a banner whenever the active agent changes.
             if (
                 hasattr(event, "current_agent_name")
                 and event.current_agent_name != current_agent
             ):
                 current_agent = event.current_agent_name
                 print(f"\n{'='*50}")
                 print(f"🤖 Agent: {current_agent}")
                 print(f"{'='*50}\n")

             # Independent of the banner check above, so that an event which
             # triggers the banner is still logged by its type.
             if isinstance(event, AgentOutput):
                 if event.response.content:
                     print("📤 Output:", event.response.content)
                 if event.tool_calls:
                     print(
                         "🛠️  Planning to use tools:",
                         [call.tool_name for call in event.tool_calls],
                     )
             elif isinstance(event, ToolCallResult):
                 print(f"🔧 Tool Result ({event.tool_name}):")
                 print(f"  Arguments: {event.tool_kwargs}")
                 print(f"  Output: {event.tool_output}")
             elif isinstance(event, ToolCall):
                 print(f"🔨 Calling Tool: {event.tool_name}")
                 print(f"  With arguments: {event.tool_kwargs}")

         if self.langfuse:
             self.instrumentor.flush()

         res = await handler
         # The content may be None (e.g. a response without text); avoid
         # crashing on .strip() in that case.
         content = (res.response.content or "").strip()
         return re.sub(r'^.*?FINAL ANSWER:', 'FINAL ANSWER:', content, flags=re.DOTALL)

multimodality_tools.py CHANGED Viewed

@@ -2,6 +2,7 @@
 import os
 import io
 import requests
 import librosa
@@ -86,7 +87,7 @@ def answer_image_question(question: str, file_id: str) -> str:
         max_tokens=512,
     )
-    return completion.choices[0].message.content
 def get_image_qa_tool():
     return FunctionTool.from_defaults(
@@ -153,3 +154,9 @@ def _get_file(task_id: str) -> io.BytesIO:
         raise FileNotFoundError("Invalid file or task id.")
     file_like = io.BytesIO(res.content)
     return file_like

 import os
 import io
+import re
 import requests
 import librosa
         max_tokens=512,
     )
+    return remove_think(completion.choices[0].message.content)
 def get_image_qa_tool():
     return FunctionTool.from_defaults(
         raise FileNotFoundError("Invalid file or task id.")
     file_like = io.BytesIO(res.content)
     return file_like
def remove_think(output: str) -> str:
    """Removes the <think> part of an LLM output.

    Every ``<think>...</think>`` span is deleted, including spans that run
    over several lines, and the remaining text is stripped of surrounding
    whitespace.

    Args:
        output: Raw LLM output; may be empty or ``None``.

    Returns:
        The output without think spans, or ``output`` unchanged when it is
        falsy.
    """
    if not output:
        return output
    # DOTALL lets "." match newlines so multi-line reasoning is removed;
    # the non-greedy quantifier keeps any text between two separate spans.
    return re.sub(r"<think>.*?</think>", "", output, flags=re.DOTALL).strip()

requirements.txt CHANGED Viewed

@@ -4,9 +4,8 @@ llama-index
 llama-index-llms-huggingface-api
 llama_index-tools-duckduckgo
 llama_index-tools-wikipedia
-llama-index-embeddings-huggingface
-llama-index-readers-web
 llama-index-llms-ollama
 langfuse
 tabulate
 soundfile
@@ -14,4 +13,5 @@ librosa
 pillow
 pandas
 huggingface_hub
-transformers

 llama-index-llms-huggingface-api
 llama_index-tools-duckduckgo
 llama_index-tools-wikipedia
 llama-index-llms-ollama
+llama-index-llms-google-genai
 langfuse
 tabulate
 soundfile
 pillow
 pandas
 huggingface_hub
+transformers
+tavily-python