Dkapsis committed on
Commit
64c3879
·
1 Parent(s): 9ac9d5e

image analysis agent

Browse files
__pycache__/agents.cpython-310.pyc CHANGED
Binary files a/__pycache__/agents.cpython-310.pyc and b/__pycache__/agents.cpython-310.pyc differ
 
__pycache__/prompts.cpython-310.pyc CHANGED
Binary files a/__pycache__/prompts.cpython-310.pyc and b/__pycache__/prompts.cpython-310.pyc differ
 
__pycache__/tools.cpython-310.pyc CHANGED
Binary files a/__pycache__/tools.cpython-310.pyc and b/__pycache__/tools.cpython-310.pyc differ
 
agents.py CHANGED
@@ -8,7 +8,7 @@ MANAGER_MODEL = "deepseek-ai/DeepSeek-R1"
8
  AGENT_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
9
  FINAL_ANSWER_MODEL = "deepseek-ai/DeepSeek-R1" # OpenAIServerModel
10
  WEB_SEARCH_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
11
- IMAGE_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
12
  AUDIO_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
13
  VIDEO_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
14
  YOUTUBE_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
@@ -37,7 +37,19 @@ def create_simple_web_search_agent(message):
37
  tools=[tools.simple_web_search_tool, tools.visit_web_page_tool],
38
  )
39
 
 
 
 
 
 
 
 
 
 
40
  def create_manager_agent(message):
 
 
 
41
  return CodeAgent(
42
  name="manager_agent",
43
  model=InferenceClientModel(MANAGER_MODEL, provider="together", max_tokens=8096),
@@ -45,7 +57,10 @@ def create_manager_agent(message):
45
  tools=[],
46
  planning_interval=4,
47
  verbosity_level=2,
48
- managed_agents=[simple_web_search_agent],
 
 
 
49
  max_steps=10,
50
  additional_authorized_imports=[
51
  "requests",
@@ -80,6 +95,6 @@ def create_final_answer_agent(message):
80
  name="final_answer_agent",
81
  description="Given a question and an initial answer, return the final refined answer following strict formatting rules.",
82
  model=InferenceClientModel(FINAL_ANSWER_MODEL),
83
- max_steps=1,
84
  tools=[],
85
  )
 
8
  AGENT_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
9
  FINAL_ANSWER_MODEL = "deepseek-ai/DeepSeek-R1" # OpenAIServerModel
10
  WEB_SEARCH_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
11
+ IMAGE_ANALYSIS_MODEL = "HuggingFaceM4/idefics2-8b"
12
  AUDIO_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
13
  VIDEO_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
14
  YOUTUBE_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
 
37
  tools=[tools.simple_web_search_tool, tools.visit_web_page_tool],
38
  )
39
 
40
def create_image_analysis_agent(message):
    """Build the image-analysis CodeAgent.

    Args:
        message: The user's question; embedded in the agent description
            via prompts.get_image_analysis_prompt.

    Returns:
        A CodeAgent configured with the image-analysis tool and model.
    """
    return CodeAgent(
        name="image_analysis_agent",
        description=prompts.get_image_analysis_prompt(message),
        model=InferenceClientModel(IMAGE_ANALYSIS_MODEL),
        # Bug fix: the tool must be referenced through the `tools` module,
        # consistent with create_simple_web_search_agent (which uses
        # tools.simple_web_search_tool); a bare `image_analysis_tool`
        # is not defined in agents.py and would raise NameError.
        tools=[tools.image_analysis_tool],
        max_steps=2,
    )
48
+
49
  def create_manager_agent(message):
50
+ simple_web_search_agent = create_simple_web_search_agent(message)
51
+ image_analysis_agent = create_image_analysis_agent(message)
52
+
53
  return CodeAgent(
54
  name="manager_agent",
55
  model=InferenceClientModel(MANAGER_MODEL, provider="together", max_tokens=8096),
 
57
  tools=[],
58
  planning_interval=4,
59
  verbosity_level=2,
60
+ managed_agents=[
61
+ simple_web_search_agent,
62
+ image_analysis_agent,
63
+ ],
64
  max_steps=10,
65
  additional_authorized_imports=[
66
  "requests",
 
95
  name="final_answer_agent",
96
  description="Given a question and an initial answer, return the final refined answer following strict formatting rules.",
97
  model=InferenceClientModel(FINAL_ANSWER_MODEL),
98
+ max_steps=2,
99
  tools=[],
100
  )
prompts.py CHANGED
@@ -7,13 +7,20 @@ def get_web_search_prompt(message, file_path=None):
7
 
8
  return prompt
9
 
 
 
 
 
 
 
 
10
  def get_manager_prompt(message, file_path=None):
11
  prompt = f"""Your job is to answer the following question.
12
  Answer the following question. If needed, delegate to one of your coworkers:\n
13
 
14
- - Web Search Agent: Use when the question requires current information. Web Search Agent requires a question only.\n
15
- Format the prompt like:
16
- "You are an expert web search assistant. Your task is to search the web and provide accurate answers to the following question: [INSERT QUESTION]"
17
 
18
  ...
19
 
 
7
 
8
  return prompt
9
 
10
def get_image_analysis_prompt(message, file_path=None):
    """Return the prompt for the image-analysis agent.

    Args:
        message: The question to ask about the image.
        file_path: Optional path to the image file. Previously this
            parameter was accepted but silently ignored; when given it is
            now appended so the agent knows which file to analyze.
            Omitting it reproduces the original prompt exactly.

    Returns:
        str: The formatted prompt.
    """
    prompt = f"""
    As an expert image analysis assistant, you analyze the image to answer the question. Given a question and image file, analyze the image and answer the question: {message}
    """
    if file_path:
        prompt += f"\nImage file path: {file_path}"

    return prompt
16
+
17
  def get_manager_prompt(message, file_path=None):
18
  prompt = f"""Your job is to answer the following question.
19
  Answer the following question. If needed, delegate to one of your coworkers:\n
20
 
21
+ - Web Search Agent: requires a question only.\n
22
+
23
+ - Image Analysis Agent: requires a question and **.png, .jpeg, .webp, .heic, or .heif image file**.\n"
24
 
25
  ...
26
 
tools.py CHANGED
@@ -1,4 +1,6 @@
1
 
 
 
2
  from smolagents import DuckDuckGoSearchTool, VisitWebpageTool
3
  from smolagents.tools import tool
4
 
@@ -31,4 +33,37 @@ def web_search_tool(query: str) -> str:
31
  else:
32
  return "No relevant information found via DuckDuckGo."
33
  except Exception as e:
34
- raise RuntimeError(f"DuckDuckGo search failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
+ import os
3
+ import base64
4
  from smolagents import DuckDuckGoSearchTool, VisitWebpageTool
5
  from smolagents.tools import tool
6
 
 
33
  else:
34
  return "No relevant information found via DuckDuckGo."
35
  except Exception as e:
36
+ raise RuntimeError(f"DuckDuckGo search failed: {str(e)}")
37
+
38
@tool
def image_analysis_tool(question: str, file_path: str) -> str:
    """
    Given a question and an image file path, build the vision-model payload.

    Reads the image, base64-encodes it, and packages it with the question
    in the {"inputs": {"image": ..., "question": ...}} format expected by
    the vision model.

    Args:
        question (str): A question about the image.
        file_path (str): Path to the image file.

    Returns:
        str: JSON-encoded payload containing the base64 image and the
            question. The agent model processes this payload.

    Raises:
        RuntimeError: If the file cannot be read or the payload built.
    """
    import json  # local import: keeps this fix self-contained

    try:
        # Read and encode image to base64
        with open(file_path, "rb") as img_file:
            img_data = base64.b64encode(img_file.read()).decode("utf-8")

        # Format the content in a typical vision+text prompt format
        payload = {
            "inputs": {
                "image": img_data,
                "question": question,
            }
        }

        # Bug fix: the annotated return type is `str`, but the original
        # returned the dict itself. Serialize so the declared tool
        # contract holds (smolagents validates tool output types).
        return json.dumps(payload)
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"Image analysis failed: {str(e)}") from e