Martin Bär committed
Commit 9f1ce17 · 1 Parent(s): ef24142

Use Gemini's Vision Capabilities directly instead of dedicated image tool

Files changed (2)
  1. app.py +4 -1
  2. basic_agent.py +19 -5
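
At a glance, the change stops routing images through a dedicated image-QA agent and instead attaches them to the user message as LlamaIndex content blocks, so the multimodal Gemini model reads the image directly. A minimal sketch of that pattern, assuming a placeholder Gemini model name, a placeholder image URL, and a configured Google API key (none of these values come from the commit):

from llama_index.core.llms import ChatMessage, ImageBlock, MessageRole, TextBlock
from llama_index.llms.google_genai import GoogleGenAI

# Placeholder model name; the integration expects an API key, e.g. via the
# GOOGLE_API_KEY environment variable or api_key=...
llm = GoogleGenAI(model="gemini-2.0-flash")

message = ChatMessage(
    role=MessageRole.USER,
    blocks=[
        TextBlock(text="What is shown in this image?"),
        ImageBlock(url="https://example.com/some-task-file.png"),  # placeholder URL
    ],
)

response = llm.chat([message])
print(response.message.content)
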
app.py CHANGED
@@ -64,13 +64,16 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
     answers_payload = []
     print(f"Running agent on {len(questions_data)} questions...")
     for item in questions_data:
+        file_type = item.get("file_name")
+        if file_type:
+            file_type = file_type.split(".")[-1]
         task_id = item.get("task_id")
         question_text = item.get("question")
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-            submitted_answer = await agent(question_text, task_id)
+            submitted_answer = await agent(question_text, task_id, file_type=file_type)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:
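
The app.py side of the change only derives a file extension from each question item and forwards it to the agent. A minimal sketch of that call path, with answer_item as a hypothetical wrapper and agent assumed to be a BasicAgent instance:

async def answer_item(agent, item: dict) -> str:
    # Keep only the extension of the attached file, e.g. "chart.png" -> "png";
    # items without an attachment leave file_type as None.
    file_type = item.get("file_name")
    if file_type:
        file_type = file_type.split(".")[-1]
    return await agent(item["question"], item["task_id"], file_type=file_type)
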
basic_agent.py CHANGED
@@ -9,6 +9,7 @@ from llama_index.tools.wikipedia import WikipediaToolSpec
 from langfuse.llama_index import LlamaIndexInstrumentor
 from llama_index.llms.ollama import Ollama
 from llama_index.llms.google_genai import GoogleGenAI
+from llama_index.core.llms import ChatMessage, TextBlock, ImageBlock, MessageRole
 from llama_index.core.agent.workflow import FunctionAgent, AgentWorkflow
 from llama_index.core.agent.workflow import (
     AgentOutput,
@@ -17,7 +18,8 @@ from llama_index.core.agent.workflow import (
 )

 from multimodality_tools import get_image_qa_tool, get_transcription_tool, \
-    get_excel_analysis_tool, get_excel_tool, get_csv_analysis_tool, get_csv_tool, _get_file, get_read_file_tool
+    get_excel_analysis_tool, get_excel_tool, get_csv_analysis_tool, get_csv_tool, _get_file, \
+    get_read_file_tool, _build_file_url
 from web_tools import get_search_web_tool
 from wiki_tool import CustomWikipediaToolSpec

@@ -140,13 +142,17 @@ class BasicAgent:
         # Main AgentWorkflow
         self.agent = AgentWorkflow(
             agents=[main_agent, wiki_agent, web_search_agent,
-                    audio_agent, image_agent, stats_agent],
+                    audio_agent, stats_agent],  # Exclude image agent for now because we are using a multimodal model.
             root_agent=main_agent.name,
         )

-    async def __call__(self, question: str, task_id: str = None) -> str:
+    async def __call__(self, question: str, task_id: str = None, file_type: str = None) -> str:
+        # Prepare Input
         file_str = ""
-        if file_exists(task_id):
+        img_block = None
+        if file_type in ["png", "jpg", "jpeg"]:
+            img_block = ImageBlock(url=_build_file_url(task_id))
+        elif file_exists(task_id):
             file_str = f'\nIf you need to load a file, do so by providing the id "{task_id}".'

         final_answer = (
@@ -155,6 +161,14 @@ class BasicAgent:
         )

         msg = f"{question}{file_str}\n{final_answer}"
+        if img_block:
+            msg = ChatMessage(
+                role=MessageRole.USER,
+                blocks=[
+                    TextBlock(text=msg),
+                    img_block
+                ],
+            )

         # Stream events
         handler = self.agent.run(user_msg=msg)
@@ -206,4 +220,4 @@ def file_exists(task_id: str) -> bool:
     except:
         return False
     del file
-    return True
+    return True
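
Inside BasicAgent.__call__, an attachment with a png/jpg/jpeg extension is wrapped in an ImageBlock pointing at the URL returned by _build_file_url (defined in multimodality_tools, not shown in this diff), and the resulting ChatMessage is handed to AgentWorkflow.run as user_msg just like the plain string was before. A self-contained sketch of that branching, using a hypothetical build_user_message helper rather than the agent class itself:

from typing import Optional, Union

from llama_index.core.llms import ChatMessage, ImageBlock, MessageRole, TextBlock

def build_user_message(question: str, file_url: Optional[str],
                       file_type: Optional[str]) -> Union[str, ChatMessage]:
    # Mirror of the __call__ logic: send plain text unless the attachment is an
    # image, in which case text and image go out together as content blocks.
    if file_type in ("png", "jpg", "jpeg") and file_url:
        return ChatMessage(
            role=MessageRole.USER,
            blocks=[TextBlock(text=question), ImageBlock(url=file_url)],
        )
    return question

As the diff shows, run(user_msg=...) takes the block-based ChatMessage as readily as the previous plain string, so the streaming loop below it is untouched.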