Dkapsis committed on
Commit
64c3879
·
1 Parent(s): 9ac9d5e

image analysis agent

Browse files
__pycache__/agents.cpython-310.pyc CHANGED
Binary files a/__pycache__/agents.cpython-310.pyc and b/__pycache__/agents.cpython-310.pyc differ
 
__pycache__/prompts.cpython-310.pyc CHANGED
Binary files a/__pycache__/prompts.cpython-310.pyc and b/__pycache__/prompts.cpython-310.pyc differ
 
__pycache__/tools.cpython-310.pyc CHANGED
Binary files a/__pycache__/tools.cpython-310.pyc and b/__pycache__/tools.cpython-310.pyc differ
 
agents.py CHANGED
@@ -8,7 +8,7 @@ MANAGER_MODEL = "deepseek-ai/DeepSeek-R1"
8
  AGENT_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
9
  FINAL_ANSWER_MODEL = "deepseek-ai/DeepSeek-R1" # OpenAIServerModel
10
  WEB_SEARCH_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
11
- IMAGE_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
12
  AUDIO_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
13
  VIDEO_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
14
  YOUTUBE_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
@@ -37,7 +37,19 @@ def create_simple_web_search_agent(message):
37
  tools=[tools.simple_web_search_tool, tools.visit_web_page_tool],
38
  )
39
 
 
 
 
 
 
 
 
 
 
40
  def create_manager_agent(message):
 
 
 
41
  return CodeAgent(
42
  name="manager_agent",
43
  model=InferenceClientModel(MANAGER_MODEL, provider="together", max_tokens=8096),
@@ -45,7 +57,10 @@ def create_manager_agent(message):
45
  tools=[],
46
  planning_interval=4,
47
  verbosity_level=2,
48
- managed_agents=[simple_web_search_agent],
 
 
 
49
  max_steps=10,
50
  additional_authorized_imports=[
51
  "requests",
@@ -80,6 +95,6 @@ def create_final_answer_agent(message):
80
  name="final_answer_agent",
81
  description="Given a question and an initial answer, return the final refined answer following strict formatting rules.",
82
  model=InferenceClientModel(FINAL_ANSWER_MODEL),
83
- max_steps=1,
84
  tools=[],
85
  )
 
8
  AGENT_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
9
  FINAL_ANSWER_MODEL = "deepseek-ai/DeepSeek-R1" # OpenAIServerModel
10
  WEB_SEARCH_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
11
+ IMAGE_ANALYSIS_MODEL = "HuggingFaceM4/idefics2-8b"
12
  AUDIO_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
13
  VIDEO_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
14
  YOUTUBE_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
 
37
  tools=[tools.simple_web_search_tool, tools.visit_web_page_tool],
38
  )
39
 
40
def create_image_analysis_agent(message):
    """Build the image-analysis CodeAgent.

    Args:
        message: The user's question; embedded in the agent description
            via prompts.get_image_analysis_prompt.

    Returns:
        A CodeAgent configured with the image-analysis tool and model.
    """
    return CodeAgent(
        name="image_analysis_agent",
        description=prompts.get_image_analysis_prompt(message),
        model=InferenceClientModel(IMAGE_ANALYSIS_MODEL),
        # Bug fix: the tool must be referenced through the `tools` module,
        # consistent with create_simple_web_search_agent (which uses
        # tools.simple_web_search_tool); a bare `image_analysis_tool`
        # is not defined in agents.py and would raise NameError.
        tools=[tools.image_analysis_tool],
        max_steps=2,
    )
48
+
49
  def create_manager_agent(message):
50
+ simple_web_search_agent = create_simple_web_search_agent(message)
51
+ image_analysis_agent = create_image_analysis_agent(message)
52
+
53
  return CodeAgent(
54
  name="manager_agent",
55
  model=InferenceClientModel(MANAGER_MODEL, provider="together", max_tokens=8096),
 
57
  tools=[],
58
  planning_interval=4,
59
  verbosity_level=2,
60
+ managed_agents=[
61
+ simple_web_search_agent,
62
+ image_analysis_agent,
63
+ ],
64
  max_steps=10,
65
  additional_authorized_imports=[
66
  "requests",
 
95
  name="final_answer_agent",
96
  description="Given a question and an initial answer, return the final refined answer following strict formatting rules.",
97
  model=InferenceClientModel(FINAL_ANSWER_MODEL),
98
+ max_steps=2,
99
  tools=[],
100
  )
prompts.py CHANGED
@@ -7,13 +7,20 @@ def get_web_search_prompt(message, file_path=None):
7
 
8
  return prompt
9
 
 
 
 
 
 
 
 
10
  def get_manager_prompt(message, file_path=None):
11
  prompt = f"""Your job is to answer the following question.
12
  Answer the following question. If needed, delegate to one of your coworkers:\n
13
 
14
- - Web Search Agent: Use when the question requires current information. Web Search Agent requires a question only.\n
15
- Format the prompt like:
16
- "You are an expert web search assistant. Your task is to search the web and provide accurate answers to the following question: [INSERT QUESTION]"
17
 
18
  ...
19
 
 
7
 
8
  return prompt
9
 
10
def get_image_analysis_prompt(message, file_path=None):
    """Return the prompt for the image-analysis agent.

    Args:
        message: The question to ask about the image.
        file_path: Optional path to the image file. Previously this
            parameter was accepted but silently ignored; when given it is
            now appended so the agent knows which file to analyze.
            Omitting it reproduces the original prompt exactly.

    Returns:
        str: The formatted prompt.
    """
    prompt = f"""
    As an expert image analysis assistant, you analyze the image to answer the question. Given a question and image file, analyze the image and answer the question: {message}
    """
    if file_path:
        prompt += f"\nImage file path: {file_path}"

    return prompt
16
+
17
  def get_manager_prompt(message, file_path=None):
18
  prompt = f"""Your job is to answer the following question.
19
  Answer the following question. If needed, delegate to one of your coworkers:\n
20
 
21
+ - Web Search Agent: requires a question only.\n
22
+
23
+ - Image Analysis Agent: requires a question and **.png, .jpeg, .webp, .heic, or .heif image file**.\n"
24
 
25
  ...
26
 
tools.py CHANGED
@@ -1,4 +1,6 @@
1
 
 
 
2
  from smolagents import DuckDuckGoSearchTool, VisitWebpageTool
3
  from smolagents.tools import tool
4
 
@@ -31,4 +33,37 @@ def web_search_tool(query: str) -> str:
31
  else:
32
  return "No relevant information found via DuckDuckGo."
33
  except Exception as e:
34
- raise RuntimeError(f"DuckDuckGo search failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
+ import os
3
+ import base64
4
  from smolagents import DuckDuckGoSearchTool, VisitWebpageTool
5
  from smolagents.tools import tool
6
 
 
33
  else:
34
  return "No relevant information found via DuckDuckGo."
35
  except Exception as e:
36
+ raise RuntimeError(f"DuckDuckGo search failed: {str(e)}")
37
+
38
@tool
def image_analysis_tool(question: str, file_path: str) -> str:
    """
    Given a question and an image file path, build the vision-model payload.

    Reads the image, base64-encodes it, and packages it with the question
    in the {"inputs": {"image": ..., "question": ...}} format expected by
    the vision model.

    Args:
        question (str): A question about the image.
        file_path (str): Path to the image file.

    Returns:
        str: JSON-encoded payload containing the base64 image and the
            question. The agent model processes this payload.

    Raises:
        RuntimeError: If the file cannot be read or the payload built.
    """
    import json  # local import: keeps this fix self-contained

    try:
        # Read and encode image to base64
        with open(file_path, "rb") as img_file:
            img_data = base64.b64encode(img_file.read()).decode("utf-8")

        # Format the content in a typical vision+text prompt format
        payload = {
            "inputs": {
                "image": img_data,
                "question": question,
            }
        }

        # Bug fix: the annotated return type is `str`, but the original
        # returned the dict itself. Serialize so the declared tool
        # contract holds (smolagents validates tool output types).
        return json.dumps(payload)
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"Image analysis failed: {str(e)}") from e