Martin Bär committed
Commit 9f1ce17 · 1 Parent(s): ef24142

Use Gemini's Vision Capabilities directly instead of dedicated image tool

Files changed (2)
  1. app.py +4 -1
  2. basic_agent.py +19 -5
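
At a glance, the change stops routing images through a dedicated image-QA agent and instead attaches them to the user message as LlamaIndex content blocks, so the multimodal Gemini model reads the image directly. A minimal sketch of that pattern, assuming a placeholder Gemini model name, a placeholder image URL, and a configured Google API key (none of these values come from the commit):

from llama_index.core.llms import ChatMessage, ImageBlock, MessageRole, TextBlock
from llama_index.llms.google_genai import GoogleGenAI

# Placeholder model name; the integration expects an API key, e.g. via the
# GOOGLE_API_KEY environment variable or api_key=...
llm = GoogleGenAI(model="gemini-2.0-flash")

message = ChatMessage(
    role=MessageRole.USER,
    blocks=[
        TextBlock(text="What is shown in this image?"),
        ImageBlock(url="https://example.com/some-task-file.png"),  # placeholder URL
    ],
)

response = llm.chat([message])
print(response.message.content)
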
app.py CHANGED
@@ -64,13 +64,16 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
     answers_payload = []
     print(f"Running agent on {len(questions_data)} questions...")
     for item in questions_data:
+        file_type = item.get("file_name")
+        if file_type:
+            file_type = file_type.split(".")[-1]
         task_id = item.get("task_id")
         question_text = item.get("question")
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-            submitted_answer = await agent(question_text, task_id)
+            submitted_answer = await agent(question_text, task_id, file_type=file_type)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:
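
The app.py side of the change only derives a file extension from each question item and forwards it to the agent. A minimal sketch of that call path, with answer_item as a hypothetical wrapper and agent assumed to be a BasicAgent instance:

async def answer_item(agent, item: dict) -> str:
    # Keep only the extension of the attached file, e.g. "chart.png" -> "png";
    # items without an attachment leave file_type as None.
    file_type = item.get("file_name")
    if file_type:
        file_type = file_type.split(".")[-1]
    return await agent(item["question"], item["task_id"], file_type=file_type)
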
basic_agent.py CHANGED
@@ -9,6 +9,7 @@ from llama_index.tools.wikipedia import WikipediaToolSpec
 from langfuse.llama_index import LlamaIndexInstrumentor
 from llama_index.llms.ollama import Ollama
 from llama_index.llms.google_genai import GoogleGenAI
+from llama_index.core.llms import ChatMessage, TextBlock, ImageBlock, MessageRole
 from llama_index.core.agent.workflow import FunctionAgent, AgentWorkflow
 from llama_index.core.agent.workflow import (
     AgentOutput,
@@ -17,7 +18,8 @@ from llama_index.core.agent.workflow import (
 )

 from multimodality_tools import get_image_qa_tool, get_transcription_tool, \
-    get_excel_analysis_tool, get_excel_tool, get_csv_analysis_tool, get_csv_tool, _get_file, get_read_file_tool
+    get_excel_analysis_tool, get_excel_tool, get_csv_analysis_tool, get_csv_tool, _get_file, \
+    get_read_file_tool, _build_file_url
 from web_tools import get_search_web_tool
 from wiki_tool import CustomWikipediaToolSpec

@@ -140,13 +142,17 @@ class BasicAgent:
         # Main AgentWorkflow
         self.agent = AgentWorkflow(
             agents=[main_agent, wiki_agent, web_search_agent,
-                    audio_agent, image_agent, stats_agent],
+                    audio_agent, stats_agent],  # Exclude image agent for now because we are using a multimodal model.
             root_agent=main_agent.name,
         )

-    async def __call__(self, question: str, task_id: str = None) -> str:
+    async def __call__(self, question: str, task_id: str = None, file_type: str = None) -> str:
+        # Prepare Input
         file_str = ""
-        if file_exists(task_id):
+        img_block = None
+        if file_type in ["png", "jpg", "jpeg"]:
+            img_block = ImageBlock(url=_build_file_url(task_id))
+        elif file_exists(task_id):
             file_str = f'\nIf you need to load a file, do so by providing the id "{task_id}".'

         final_answer = (
@@ -155,6 +161,14 @@ class BasicAgent:
         )

         msg = f"{question}{file_str}\n{final_answer}"
+        if img_block:
+            msg = ChatMessage(
+                role=MessageRole.USER,
+                blocks=[
+                    TextBlock(text=msg),
+                    img_block
+                ],
+            )

         # Stream events
         handler = self.agent.run(user_msg=msg)
@@ -206,4 +220,4 @@ def file_exists(task_id: str) -> bool:
     except:
         return False
     del file
-    return True
+    return True
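
Inside BasicAgent.__call__, an attachment with a png/jpg/jpeg extension is wrapped in an ImageBlock pointing at the URL returned by _build_file_url (defined in multimodality_tools, not shown in this diff), and the resulting ChatMessage is handed to AgentWorkflow.run as user_msg just like the plain string was before. A self-contained sketch of that branching, using a hypothetical build_user_message helper rather than the agent class itself:

from typing import Optional, Union

from llama_index.core.llms import ChatMessage, ImageBlock, MessageRole, TextBlock

def build_user_message(question: str, file_url: Optional[str],
                       file_type: Optional[str]) -> Union[str, ChatMessage]:
    # Mirror of the __call__ logic: send plain text unless the attachment is an
    # image, in which case text and image go out together as content blocks.
    if file_type in ("png", "jpg", "jpeg") and file_url:
        return ChatMessage(
            role=MessageRole.USER,
            blocks=[TextBlock(text=question), ImageBlock(url=file_url)],
        )
    return question

As the diff shows, run(user_msg=...) takes the block-based ChatMessage as readily as the previous plain string, so the streaming loop below it is untouched.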