Dkapsis committed on
Commit
a9182c5
·
1 Parent(s): 64c3879

audio agent

Browse files
__pycache__/agents.cpython-310.pyc CHANGED
Binary files a/__pycache__/agents.cpython-310.pyc and b/__pycache__/agents.cpython-310.pyc differ
 
__pycache__/multi_agent.cpython-310.pyc CHANGED
Binary files a/__pycache__/multi_agent.cpython-310.pyc and b/__pycache__/multi_agent.cpython-310.pyc differ
 
__pycache__/prompts.cpython-310.pyc CHANGED
Binary files a/__pycache__/prompts.cpython-310.pyc and b/__pycache__/prompts.cpython-310.pyc differ
 
__pycache__/tools.cpython-310.pyc CHANGED
Binary files a/__pycache__/tools.cpython-310.pyc and b/__pycache__/tools.cpython-310.pyc differ
 
agents.py CHANGED
@@ -42,13 +42,23 @@ def create_image_analysis_agent(message):
42
  name="image_analysis_agent",
43
  description=prompts.get_image_analysis_prompt(message),
44
  model=InferenceClientModel(IMAGE_ANALYSIS_MODEL),
45
- tools=[image_analysis_tool],
 
 
 
 
 
 
 
 
 
46
  max_steps=2,
47
  )
48
 
49
  def create_manager_agent(message):
50
  simple_web_search_agent = create_simple_web_search_agent(message)
51
  image_analysis_agent = create_image_analysis_agent(message)
 
52
 
53
  return CodeAgent(
54
  name="manager_agent",
@@ -60,6 +70,7 @@ def create_manager_agent(message):
60
  managed_agents=[
61
  simple_web_search_agent,
62
  image_analysis_agent,
 
63
  ],
64
  max_steps=10,
65
  additional_authorized_imports=[
@@ -95,6 +106,6 @@ def create_final_answer_agent(message):
95
  name="final_answer_agent",
96
  description="Given a question and an initial answer, return the final refined answer following strict formatting rules.",
97
  model=InferenceClientModel(FINAL_ANSWER_MODEL),
98
- max_steps=2,
99
  tools=[],
100
  )
 
42
  name="image_analysis_agent",
43
  description=prompts.get_image_analysis_prompt(message),
44
  model=InferenceClientModel(IMAGE_ANALYSIS_MODEL),
45
+ tools=[tools.image_analysis_tool],
46
+ max_steps=2,
47
+ )
48
+
49
+ def create_audio_analysis_agent(message):
50
+ return CodeAgent(
51
+ name="audio_analysis_agent",
52
+ description=prompts.get_audio_analysis_prompt(message),
53
+ model=InferenceClientModel(AUDIO_ANALYSIS_MODEL),
54
+ tools=[tools.audio_analysis_tool],
55
  max_steps=2,
56
  )
57
 
58
  def create_manager_agent(message):
59
  simple_web_search_agent = create_simple_web_search_agent(message)
60
  image_analysis_agent = create_image_analysis_agent(message)
61
+ audio_analysis_agent = create_audio_analysis_agent(message)
62
 
63
  return CodeAgent(
64
  name="manager_agent",
 
70
  managed_agents=[
71
  simple_web_search_agent,
72
  image_analysis_agent,
73
+ audio_analysis_agent,
74
  ],
75
  max_steps=10,
76
  additional_authorized_imports=[
 
106
  name="final_answer_agent",
107
  description="Given a question and an initial answer, return the final refined answer following strict formatting rules.",
108
  model=InferenceClientModel(FINAL_ANSWER_MODEL),
109
+ max_steps=3,
110
  tools=[],
111
  )
app.py CHANGED
@@ -161,7 +161,9 @@ with gr.Blocks() as demo:
161
  """
162
  **Instructions:**
163
 
164
- 4. who is in the final of champions league this year?
 
 
165
 
166
  """
167
  )
 
161
  """
162
  **Instructions:**
163
 
164
+ 1. Who is in the final of champions league in 2025?
165
+ 2. What is the colour of the suit in this image: https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Fimages.hdqwalls.com%2Fwallpapers%2Fblack-superman-henry-cavill-xa.jpg&f=1&nofb=1&ipt=451cdc8bb05635ac59e50dc567cb68ae38ad45a626622ee7760b2c3ef828d5a7?
166
+ 3. Which of the fruits shown in the 2008 painting “Embroidery from Uzbekistan” were served as part of the October 1949 breakfast menu for the ocean liner that was later used as a floating prop for the film “The Last Voyage”? Give the items as a comma-separated list, ordering them in clockwise order based on their arrangement in the painting starting from the 12 o’clock position. Use the plural form of each fruit.
167
 
168
  """
169
  )
multi_agent.py CHANGED
@@ -11,7 +11,7 @@ import agents
11
 
12
  def orchestrate(message, file_path):
13
  final_prompt = prompts.get_manager_prompt(message, file_path)
14
- initial_answer = agents.create_simple_web_search_agent(message).run(message)
15
 
16
  final_answer = agents.create_final_answer_agent(message).run(prompts.get_final_answer_prompt(message, initial_answer))
17
 
 
11
 
12
  def orchestrate(message, file_path):
13
  final_prompt = prompts.get_manager_prompt(message, file_path)
14
+ initial_answer = agents.create_manager_agent(message).run(message)
15
 
16
  final_answer = agents.create_final_answer_agent(message).run(prompts.get_final_answer_prompt(message, initial_answer))
17
 
prompts.py CHANGED
@@ -14,6 +14,13 @@ def get_image_analysis_prompt(message, file_path=None):
14
 
15
  return prompt
16
 
 
 
 
 
 
 
 
17
  def get_manager_prompt(message, file_path=None):
18
  prompt = f"""Your job is to answer the following question.
19
  Answer the following question. If needed, delegate to one of your coworkers:\n
@@ -26,6 +33,8 @@ def get_manager_prompt(message, file_path=None):
26
 
27
  In case you cannot answer the question and there is not a good coworker, delegate to the Code Generation Agent.\n.
28
 
 
 
29
  Question: {message}
30
  """
31
 
@@ -55,6 +64,8 @@ def get_final_answer_prompt(message: str, initial_answer: str):
55
  **Example 5:** What is the opposite of bad, worse, worst? good, better, best
56
 
57
  **Final answer:**
 
 
58
  """
59
 
60
  return prompt
 
14
 
15
  return prompt
16
 
17
+ def get_audio_analysis_prompt(message, file_path=None):
18
+ prompt = f"""
19
+ As an expert audio analysis assistant, you analyze the audio to answer the question. Given a question and audio file, analyze the audio and answer the question: {message}
20
+ """
21
+
22
+ return prompt
23
+
24
  def get_manager_prompt(message, file_path=None):
25
  prompt = f"""Your job is to answer the following question.
26
  Answer the following question. If needed, delegate to one of your coworkers:\n
 
33
 
34
  In case you cannot answer the question and there is not a good coworker, delegate to the Code Generation Agent.\n.
35
 
36
+ The final answer must always be a string and no other formats are acceptable.
37
+
38
  Question: {message}
39
  """
40
 
 
64
  **Example 5:** What is the opposite of bad, worse, worst? good, better, best
65
 
66
  **Final answer:**
67
+
68
+ The final answer must always be a string and no other formats are acceptable.
69
  """
70
 
71
  return prompt
tools.py CHANGED
@@ -66,4 +66,36 @@ def image_analysis_tool(question: str, file_path: str) -> str:
66
  # You can return this dictionary directly if your model expects JSON format
67
  return prompt # Actual agent model will process this
68
  except Exception as e:
69
- raise RuntimeError(f"Image analysis failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  # You can return this dictionary directly if your model expects JSON format
67
  return prompt # Actual agent model will process this
68
  except Exception as e:
69
+ raise RuntimeError(f"Image analysis failed: {str(e)}")
70
+
71
+ @tool
72
+ def audio_analysis_tool(question: str, file_path: str) -> str:
73
+ """
74
+ Given a question and an audio file path, analyze the audio to answer the question.
75
+
76
+ Args:
77
+ question (str): A question about the audio.
78
+ file_path (str): Path to the audio file.
79
+
80
+ Returns:
81
+ str: Structured prompt with audio and question (for agent model to process).
82
+
83
+ Raises:
84
+ RuntimeError: If processing fails.
85
+ """
86
+ try:
87
+ # Read and encode audio to base64
88
+ with open(file_path, "rb") as audio_file:
89
+ audio_data = base64.b64encode(audio_file.read()).decode("utf-8")
90
+
91
+ # Format the content in a vision+text style prompt, adapted for audio
92
+ prompt = {
93
+ "inputs": {
94
+ "audio": audio_data,
95
+ "question": question
96
+ }
97
+ }
98
+
99
+ return prompt # The agent model will process this
100
+ except Exception as e:
101
+ raise RuntimeError(f"Audio analysis failed: {str(e)}")