image analysis agent
- __pycache__/agents.cpython-310.pyc +0 -0
- __pycache__/prompts.cpython-310.pyc +0 -0
- __pycache__/tools.cpython-310.pyc +0 -0
- agents.py +18 -3
- prompts.py +10 -3
- tools.py +36 -1
__pycache__/agents.cpython-310.pyc
CHANGED
Binary files a/__pycache__/agents.cpython-310.pyc and b/__pycache__/agents.cpython-310.pyc differ

__pycache__/prompts.cpython-310.pyc
CHANGED
Binary files a/__pycache__/prompts.cpython-310.pyc and b/__pycache__/prompts.cpython-310.pyc differ

__pycache__/tools.cpython-310.pyc
CHANGED
Binary files a/__pycache__/tools.cpython-310.pyc and b/__pycache__/tools.cpython-310.pyc differ
agents.py
CHANGED
@@ -8,7 +8,7 @@ MANAGER_MODEL = "deepseek-ai/DeepSeek-R1"
 AGENT_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
 FINAL_ANSWER_MODEL = "deepseek-ai/DeepSeek-R1" # OpenAIServerModel
 WEB_SEARCH_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
-IMAGE_ANALYSIS_MODEL = "
+IMAGE_ANALYSIS_MODEL = "HuggingFaceM4/idefics2-8b"
 AUDIO_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
 VIDEO_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
 YOUTUBE_ANALYSIS_MODEL = "Qwen/Qwen2.5-Coder-32B-Instruct"
@@ -37,7 +37,19 @@ def create_simple_web_search_agent(message):
         tools=[tools.simple_web_search_tool, tools.visit_web_page_tool],
     )
 
+def create_image_analysis_agent(message):
+    return CodeAgent(
+        name="image_analysis_agent",
+        description=prompts.get_image_analysis_prompt(message),
+        model=InferenceClientModel(IMAGE_ANALYSIS_MODEL),
+        tools=[image_analysis_tool],
+        max_steps=2,
+    )
+
 def create_manager_agent(message):
+    simple_web_search_agent = create_simple_web_search_agent(message)
+    image_analysis_agent = create_image_analysis_agent(message)
+
     return CodeAgent(
         name="manager_agent",
         model=InferenceClientModel(MANAGER_MODEL, provider="together", max_tokens=8096),
@@ -45,7 +57,10 @@ def create_manager_agent(message):
         tools=[],
         planning_interval=4,
         verbosity_level=2,
-        managed_agents=[
+        managed_agents=[
+            simple_web_search_agent,
+            image_analysis_agent,
+        ],
         max_steps=10,
         additional_authorized_imports=[
             "requests",
@@ -80,6 +95,6 @@ def create_final_answer_agent(message):
         name="final_answer_agent",
         description="Given a question and an initial answer, return the final refined answer following strict formatting rules.",
         model=InferenceClientModel(FINAL_ANSWER_MODEL),
-        max_steps=
+        max_steps=2,
         tools=[],
     )
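For orientation, here is a minimal driver sketch showing how the factories touched by this commit could be wired together. It is not part of the diff: the question, the birds.png path, and the hand-off to the final-answer agent are hypothetical, and it assumes agents.py is importable as a module with smolagents' CodeAgent.run available.

# Hypothetical driver, not part of the commit: wires the agent factories from agents.py together.
import agents

question = "How many birds are visible in the attached picture?"  # hypothetical question
file_path = "birds.png"  # hypothetical image file

# The manager owns the web-search and image-analysis agents via managed_agents,
# so it can delegate when a question references an image file.
manager_agent = agents.create_manager_agent(question)
draft_answer = manager_agent.run(f"{question}\nImage file: {file_path}")

# The final-answer agent (capped at max_steps=2 after this commit) enforces the strict output format.
final_answer_agent = agents.create_final_answer_agent(question)
final_answer = final_answer_agent.run(f"Question: {question}\nInitial answer: {draft_answer}")
print(final_answer)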
prompts.py
CHANGED
@@ -7,13 +7,20 @@ def get_web_search_prompt(message, file_path=None):
 
     return prompt
 
+def get_image_analysis_prompt(message, file_path=None):
+    prompt = f"""
+    As an expert image analysis assistant, you analyze the image to answer the question. Given a question and image file, analyze the image and answer the question: {message}
+    """
+
+    return prompt
+
 def get_manager_prompt(message, file_path=None):
     prompt = f"""Your job is to answer the following question.
     Answer the following question. If needed, delegate to one of your coworkers:\n
 
-    - Web Search Agent:
-
-
+    - Web Search Agent: requires a question only.\n
+
+    - Image Analysis Agent: requires a question and **.png, .jpeg, .webp, .heic, or .heif image file**.\n"
 
     ...
 
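As a quick illustration (not part of the diff), the new helper simply interpolates the question into its instruction text, while the manager prompt now lists both coworkers; a hypothetical call might look like this:

import prompts

question = "What landmark appears in the attached photo?"  # hypothetical question

# Description text handed to image_analysis_agent in agents.py.
print(prompts.get_image_analysis_prompt(question))

# Manager prompt advertising the Web Search and Image Analysis coworkers it may delegate to.
print(prompts.get_manager_prompt(question))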
tools.py
CHANGED
@@ -1,4 +1,6 @@
 
+import os
+import base64
 from smolagents import DuckDuckGoSearchTool, VisitWebpageTool
 from smolagents.tools import tool
 
@@ -31,4 +33,37 @@ def web_search_tool(query: str) -> str:
         else:
             return "No relevant information found via DuckDuckGo."
     except Exception as e:
-        raise RuntimeError(f"DuckDuckGo search failed: {str(e)}")
+        raise RuntimeError(f"DuckDuckGo search failed: {str(e)}")
+
+@tool
+def image_analysis_tool(question: str, file_path: str) -> str:
+    """
+    Given a question and an image file path, analyze the image to answer the question.
+
+    Args:
+        question (str): A question about the image.
+        file_path (str): Path to the image file.
+
+    Returns:
+        str: Answer to the question.
+
+    Raises:
+        RuntimeError: If processing fails.
+    """
+    try:
+        # Read and encode image to base64
+        with open(file_path, "rb") as img_file:
+            img_data = base64.b64encode(img_file.read()).decode("utf-8")
+
+        # Format the content in a typical vision+text prompt format
+        prompt = {
+            "inputs": {
+                "image": img_data,
+                "question": question
+            }
+        }
+
+        # You can return this dictionary directly if your model expects JSON format
+        return prompt  # Actual agent model will process this
+    except Exception as e:
+        raise RuntimeError(f"Image analysis failed: {str(e)}")
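One caveat on the tool above: it is annotated as returning str but actually hands back the prompt dict, leaving the image interpretation to the agent's model. Below is an alternative sketch, not part of the commit, in which the tool queries a hosted vision-language model itself and returns plain text. It assumes huggingface_hub's InferenceClient and its OpenAI-style chat_completion interface with image inputs (and that the chosen model is served for that task), and it reuses the IMAGE_ANALYSIS_MODEL value from agents.py.

# Hypothetical variant, not part of the commit: returns a plain-text answer instead of a dict.
import base64
import mimetypes

from huggingface_hub import InferenceClient
from smolagents.tools import tool

IMAGE_ANALYSIS_MODEL = "HuggingFaceM4/idefics2-8b"  # same constant as agents.py


@tool
def image_analysis_tool(question: str, file_path: str) -> str:
    """
    Answer a question about an image by querying a hosted vision-language model.

    Args:
        question (str): A question about the image.
        file_path (str): Path to the image file.

    Returns:
        str: The model's answer as plain text.

    Raises:
        RuntimeError: If the image cannot be read or the model call fails.
    """
    try:
        # Encode the image as a base64 data URL so it can be embedded in the chat message.
        mime = mimetypes.guess_type(file_path)[0] or "image/png"
        with open(file_path, "rb") as img_file:
            data_url = f"data:{mime};base64," + base64.b64encode(img_file.read()).decode("utf-8")

        # Assumption: IMAGE_ANALYSIS_MODEL is reachable through the chat-completion API with image inputs.
        client = InferenceClient(model=IMAGE_ANALYSIS_MODEL)
        response = client.chat_completion(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": data_url}},
                        {"type": "text", "text": question},
                    ],
                }
            ],
            max_tokens=512,
        )
        # Return only the answer text so the `-> str` annotation holds.
        return response.choices[0].message.content
    except Exception as e:
        raise RuntimeError(f"Image analysis failed: {str(e)}")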