Martin Bär
committed on
Commit
·
9f1ce17
1
Parent(s):
ef24142
Use Gemini's Vision Capabilities directly instead of dedicated image tool
Browse files
- app.py +4 -1
- basic_agent.py +19 -5
app.py
CHANGED
|
@@ -64,13 +64,16 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 64 |
answers_payload = []
|
| 65 |
print(f"Running agent on {len(questions_data)} questions...")
|
| 66 |
for item in questions_data:
|
|
|
|
|
|
|
|
|
|
| 67 |
task_id = item.get("task_id")
|
| 68 |
question_text = item.get("question")
|
| 69 |
if not task_id or question_text is None:
|
| 70 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 71 |
continue
|
| 72 |
try:
|
| 73 |
-
submitted_answer = await agent(question_text, task_id)
|
| 74 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 75 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 76 |
except Exception as e:
|
|
|
|
| 64 |
answers_payload = []
|
| 65 |
print(f"Running agent on {len(questions_data)} questions...")
|
| 66 |
for item in questions_data:
|
| 67 |
+
file_type = item.get("file_name")
|
| 68 |
+
if file_type:
|
| 69 |
+
file_type = file_type.split(".")[-1]
|
| 70 |
task_id = item.get("task_id")
|
| 71 |
question_text = item.get("question")
|
| 72 |
if not task_id or question_text is None:
|
| 73 |
print(f"Skipping item with missing task_id or question: {item}")
|
| 74 |
continue
|
| 75 |
try:
|
| 76 |
+
submitted_answer = await agent(question_text, task_id, file_type=file_type)
|
| 77 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
| 78 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 79 |
except Exception as e:
|
basic_agent.py
CHANGED
|
@@ -9,6 +9,7 @@ from llama_index.tools.wikipedia import WikipediaToolSpec
|
|
| 9 |
from langfuse.llama_index import LlamaIndexInstrumentor
|
| 10 |
from llama_index.llms.ollama import Ollama
|
| 11 |
from llama_index.llms.google_genai import GoogleGenAI
|
|
|
|
| 12 |
from llama_index.core.agent.workflow import FunctionAgent, AgentWorkflow
|
| 13 |
from llama_index.core.agent.workflow import (
|
| 14 |
AgentOutput,
|
|
@@ -17,7 +18,8 @@ from llama_index.core.agent.workflow import (
|
|
| 17 |
)
|
| 18 |
|
| 19 |
from multimodality_tools import get_image_qa_tool, get_transcription_tool, \
|
| 20 |
-
get_excel_analysis_tool, get_excel_tool, get_csv_analysis_tool, get_csv_tool, _get_file,
|
|
|
|
| 21 |
from web_tools import get_search_web_tool
|
| 22 |
from wiki_tool import CustomWikipediaToolSpec
|
| 23 |
|
|
@@ -140,13 +142,17 @@ class BasicAgent:
|
|
| 140 |
# Main AgentWorkflow
|
| 141 |
self.agent = AgentWorkflow(
|
| 142 |
agents=[main_agent, wiki_agent, web_search_agent,
|
| 143 |
-
audio_agent,
|
| 144 |
root_agent=main_agent.name,
|
| 145 |
)
|
| 146 |
|
| 147 |
-
async def __call__(self, question: str, task_id: str = None) -> str:
|
|
|
|
| 148 |
file_str = ""
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
| 150 |
file_str = f'\nIf you need to load a file, do so by providing the id "{task_id}".'
|
| 151 |
|
| 152 |
final_answer = (
|
|
@@ -155,6 +161,14 @@ class BasicAgent:
|
|
| 155 |
)
|
| 156 |
|
| 157 |
msg = f"{question}{file_str}\n{final_answer}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
|
| 159 |
# Stream events
|
| 160 |
handler = self.agent.run(user_msg=msg)
|
|
@@ -206,4 +220,4 @@ def file_exists(task_id: str) -> bool:
|
|
| 206 |
except:
|
| 207 |
return False
|
| 208 |
del file
|
| 209 |
-
return True
|
|
|
|
| 9 |
from langfuse.llama_index import LlamaIndexInstrumentor
|
| 10 |
from llama_index.llms.ollama import Ollama
|
| 11 |
from llama_index.llms.google_genai import GoogleGenAI
|
| 12 |
+
from llama_index.core.llms import ChatMessage, TextBlock, ImageBlock, MessageRole
|
| 13 |
from llama_index.core.agent.workflow import FunctionAgent, AgentWorkflow
|
| 14 |
from llama_index.core.agent.workflow import (
|
| 15 |
AgentOutput,
|
|
|
|
| 18 |
)
|
| 19 |
|
| 20 |
from multimodality_tools import get_image_qa_tool, get_transcription_tool, \
|
| 21 |
+
get_excel_analysis_tool, get_excel_tool, get_csv_analysis_tool, get_csv_tool, _get_file, \
|
| 22 |
+
get_read_file_tool, _build_file_url
|
| 23 |
from web_tools import get_search_web_tool
|
| 24 |
from wiki_tool import CustomWikipediaToolSpec
|
| 25 |
|
|
|
|
| 142 |
# Main AgentWorkflow
|
| 143 |
self.agent = AgentWorkflow(
|
| 144 |
agents=[main_agent, wiki_agent, web_search_agent,
|
| 145 |
+
audio_agent, stats_agent], # Exclude image agent for now because we are using a multimodal model.
|
| 146 |
root_agent=main_agent.name,
|
| 147 |
)
|
| 148 |
|
| 149 |
+
async def __call__(self, question: str, task_id: str = None, file_type: str = None) -> str:
|
| 150 |
+
# Prepare Input
|
| 151 |
file_str = ""
|
| 152 |
+
img_block = None
|
| 153 |
+
if file_type in ["png", "jpg", "jpeg"]:
|
| 154 |
+
img_block = ImageBlock(url=_build_file_url(task_id))
|
| 155 |
+
elif file_exists(task_id):
|
| 156 |
file_str = f'\nIf you need to load a file, do so by providing the id "{task_id}".'
|
| 157 |
|
| 158 |
final_answer = (
|
|
|
|
| 161 |
)
|
| 162 |
|
| 163 |
msg = f"{question}{file_str}\n{final_answer}"
|
| 164 |
+
if img_block:
|
| 165 |
+
msg = ChatMessage(
|
| 166 |
+
role=MessageRole.USER,
|
| 167 |
+
blocks=[
|
| 168 |
+
TextBlock(text=msg),
|
| 169 |
+
img_block
|
| 170 |
+
],
|
| 171 |
+
)
|
| 172 |
|
| 173 |
# Stream events
|
| 174 |
handler = self.agent.run(user_msg=msg)
|
|
|
|
| 220 |
except:
|
| 221 |
return False
|
| 222 |
del file
|
| 223 |
+
return True
|