agents_final_assignment

Sleeping

App Files Files Community

guillaumefrd committed on May 2

Commit

3568413

1 Parent(s): 283e426

add more advanced tools (query image, ASR, code interpreter)

Browse files

Files changed (4) hide show

app.py +0 -1
langgraph_dir/agent.py +9 -9
langgraph_dir/custom_tools.py +69 -12
llamaindex_dir/agent.py +4 -0

app.py CHANGED Viewed

@@ -104,7 +104,6 @@ async def run_and_submit_all(profile: gr.OAuthProfile | None):
                 submitted_answer = agent(question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
-            agent.ctx.clear() # clear context for next question
         except Exception as e:
              print(f"Error running agent on task {task_id}: {e}")
              results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})

                 submitted_answer = agent(question_text)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:
              print(f"Error running agent on task {task_id}: {e}")
              results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})

langgraph_dir/agent.py CHANGED Viewed

@@ -9,7 +9,8 @@ from langchain.agents import load_tools
 from langchain_community.tools.riza.command import ExecPython
 from .prompt import system_prompt
-from .custom_tools import multiply, add, subtract, divide, modulus, power
 class LangGraphAgent:
@@ -28,18 +29,17 @@ class LangGraphAgent:
             "wikipedia",
         ]
         community_tools = load_tools(community_tool_names)
-        community_tools += [ExecPython()] # Riza code interpreter (needs RIZA_API_KEY) (not supported by load_tools)
-        custom_tools = [multiply, add, subtract, divide, modulus, power]
         tools = community_tools + custom_tools
         tools_by_name = {tool.name: tool for tool in tools}
         llm_with_tools = llm.bind_tools(tools)
-        # tool_spec_list += WikipediaToolSpec().to_tool_list()
-        # tool_spec_list += DuckDuckGoSearchToolSpec().to_tool_list()
-        # tool_spec_list += CodeInterpreterToolSpec().to_tool_list()
-        # tool_spec_list += [query_image_tool, automatic_speech_recognition_tool]
         # =========== Agent definition ===========
         # Nodes

 from langchain_community.tools.riza.command import ExecPython
 from .prompt import system_prompt
+from .custom_tools import (multiply, add, subtract, divide, modulus, power,
+    query_image, automatic_speech_recognition)
 class LangGraphAgent:
             "wikipedia",
         ]
         community_tools = load_tools(community_tool_names)
+        community_tools += [ExecPython(runtime_revision_id='01JT97GJ20BC83Y75WMAS364ZT')] # Riza code interpreter (needs RIZA_API_KEY) (not supported by load_tools, custom runtime with basic packages (pandas, numpy, etc.))
+        custom_tools = [
+            multiply, add, subtract, divide, modulus, power,  # basic arithmetic
+            query_image, # Ask anything about an image using a VLM
+            automatic_speech_recognition, # Transcribe an audio file to text
+        ]
         tools = community_tools + custom_tools
         tools_by_name = {tool.name: tool for tool in tools}
         llm_with_tools = llm.bind_tools(tools)
         # =========== Agent definition ===========
         # Nodes

langgraph_dir/custom_tools.py CHANGED Viewed

@@ -1,9 +1,12 @@
 from langchain_core.tools import tool
 @tool
 def multiply(a: float, b: float) -> float:
-    """
-    Multiplies two numbers.
     Args:
         a (float): the first number
         b (float): the second number
@@ -13,8 +16,8 @@ def multiply(a: float, b: float) -> float:
 @tool
 def add(a: float, b: float) -> float:
-    """
-    Adds two numbers.
     Args:
         a (float): the first number
         b (float): the second number
@@ -24,8 +27,8 @@ def add(a: float, b: float) -> float:
 @tool
 def subtract(a: float, b: float) -> int:
-    """
-    Subtracts two numbers.
     Args:
         a (float): the first number
         b (float): the second number
@@ -35,8 +38,8 @@ def subtract(a: float, b: float) -> int:
 @tool
 def divide(a: float, b: float) -> float:
-    """
-    Divides two numbers.
     Args:
         a (float): the first float number
         b (float): the second float number
@@ -48,8 +51,8 @@ def divide(a: float, b: float) -> float:
 @tool
 def modulus(a: int, b: int) -> int:
-    """
-    Get the modulus of two numbers.
     Args:
         a (int): the first number
         b (int): the second number
@@ -59,10 +62,64 @@ def modulus(a: int, b: int) -> int:
 @tool
 def power(a: float, b: float) -> float:
-    """
-    Get the power of two numbers.
     Args:
         a (float): the first number
         b (float): the second number
     """
     return a**b

 from langchain_core.tools import tool
+from huggingface_hub import InferenceClient
+# --- Basic operations --- #
 @tool
 def multiply(a: float, b: float) -> float:
+    """Multiplies two numbers.
     Args:
         a (float): the first number
         b (float): the second number
 @tool
 def add(a: float, b: float) -> float:
+    """Adds two numbers.
     Args:
         a (float): the first number
         b (float): the second number
 @tool
 def subtract(a: float, b: float) -> int:
+    """Subtracts two numbers.
     Args:
         a (float): the first number
         b (float): the second number
 @tool
 def divide(a: float, b: float) -> float:
+    """Divides two numbers.
     Args:
         a (float): the first float number
         b (float): the second float number
 @tool
 def modulus(a: int, b: int) -> int:
+    """Get the modulus of two numbers.
     Args:
         a (int): the first number
         b (int): the second number
 @tool
 def power(a: float, b: float) -> float:
+    """Get the power of two numbers.
     Args:
         a (float): the first number
         b (float): the second number
     """
     return a**b
+# --- Functions --- #
+@tool
+def query_image(query: str, image_url: str) -> str:
+    """Ask anything about an image using a Vision Language Model
+    Args:
+        query (str): the query about the image, e.g. how many persons are on the image?
+        image_url (str): the URL to the image
+    """
+    client = InferenceClient(provider="nebius")
+    try:
+        completion = client.chat.completions.create(
+            # model="google/gemma-3-27b-it",
+            model="Qwen/Qwen2.5-VL-72B-Instruct",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": query
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_url
+                            }
+                        }
+                    ]
+                }
+            ],
+            max_tokens=512,
+        )
+        return completion.choices[0].message
+    except Exception as e:
+        return f"query_image failed: {e}"
+@tool
+def automatic_speech_recognition(file_url: str) -> str:
+    """Transcribe an audio file to text
+    Args:
+        file_url (str): the URL to the audio file
+    """
+    client = InferenceClient(provider="fal-ai")
+    try:
+        return client.automatic_speech_recognition(file_url, model="openai/whisper-large-v3")
+    except Exception as e:
+        return f"automatic_speech_recognition failed: {e}"

llamaindex_dir/agent.py CHANGED Viewed

@@ -71,4 +71,8 @@ class LLamaIndexAgent:
             print('Could not split response on "FINAL ANSWER:"')
         print("\n\n"+"-"*50)
         print(f"Agent returning with answer: {response}")
         return response

             print('Could not split response on "FINAL ANSWER:"')
         print("\n\n"+"-"*50)
         print(f"Agent returning with answer: {response}")
+        # clear context for next question before returning
+        self.ctx.clear()
         return response