|
import asyncio |
|
import base64 |
|
import pickle |
|
from io import BytesIO |
|
from pathlib import Path |
|
from typing import Any |
|
import os |
|
|
|
import cv2 |
|
import polars as pl |
|
from dotenv import find_dotenv, load_dotenv |
|
from langchain.chat_models import init_chat_model |
|
from langchain_core.messages import HumanMessage |
|
from langgraph.errors import GraphRecursionError |
|
from langgraph.prebuilt import create_react_agent |
|
from PIL import Image |
|
from pydantic import BaseModel, Field |
|
from smolagents import ( |
|
DuckDuckGoSearchTool, |
|
Tool, |
|
VisitWebpageTool, |
|
WikipediaSearchTool, |
|
) |
|
# Placeholder key, presumably so that the OpenAI client can be constructed
# when no credentials are configured (TODO confirm). `setdefault` is used so
# that a real key already exported in the environment is not overwritten.
os.environ.setdefault("OPENAI_API_KEY", "sk-dumykey")

# Values from a discovered .env file win over the process environment
# (override=True); a missing .env file is not treated as an error.
_ = load_dotenv(dotenv_path=find_dotenv(raise_error_if_not_found=False), override=True)

# SECURITY: unpickling can execute arbitrary code embedded in the file.
# Only load an `all_questions.pkl` that comes from a trusted source.
with open("all_questions.pkl", "rb") as f:
    all_questions = pickle.load(f)

# Chat model shared by the agent and by the tools that query the model directly.
lang_model = init_chat_model(
    model="gpt-4.1-mini", model_provider="openai", temperature=0.2
)
|
|
|
|
|
def search_wikipedia(query: str) -> str:
    """Tries to search for a wikipedia page relevant to the query and if it finds
    then it returns the content of this page."""
    # The docstring is kept verbatim: this function is registered as an agent
    # tool, and LangChain derives the tool description from the docstring.
    #
    # A fresh WikipediaSearchTool is built on each call and invoked with the
    # query; its result is handed back unchanged.
    searcher = WikipediaSearchTool(user_agent="WikiAssistant ([email protected])")
    return searcher(query)
|
|
|
|
|
def visit_web_page(url: str) -> str:
    """Use this tool to visit websites."""
    # The docstring is kept verbatim: this function is registered as an agent
    # tool, and LangChain derives the tool description from the docstring.
    #
    # The page tool is configured with max_output_length=60_000 and its
    # result for `url` is returned as-is.
    page_reader = VisitWebpageTool(max_output_length=60_000)
    return page_reader(url)
|
|
|
|
|
def read_excel_or_csv(filepath: str) -> str:
    """Reads an excel or csv file and returns the content as str."""
    # The docstring is kept verbatim: this function is registered as an agent
    # tool, and LangChain derives the tool description from the docstring.
    #
    # Choose the reader from the file extension. The comparison is
    # case-insensitive so that e.g. "DATA.XLSX" is still read as Excel.
    if Path(filepath).suffix.lower() in {".xlsx", ".xls"}:
        df = pl.read_excel(source=filepath)
    else:
        # Everything else is parsed as CSV. This must be an `else` branch:
        # running the CSV reader unconditionally would discard the Excel
        # result (or fail while parsing a workbook as CSV).
        df = pl.read_csv(source=filepath)
    # Render the frame as the text of a {column name: list of values} dict.
    return str(df.to_dict(as_series=False))
|
|
|
|
|
def python_code_interpreter(filepath: str) -> str:
    """Returns the output of a python code."""
    # The docstring is kept verbatim: this function is registered as an agent
    # tool, and LangChain derives the tool description from the docstring.
    #
    # NOTE: the code is not executed. Its text is sent to the chat model,
    # which is asked to state what the output would be.
    #
    # Read the file as a single string. `readlines()` would interpolate the
    # repr of a list (brackets, quotes, escaped "\n") into the prompt instead
    # of the source text itself.
    with open(filepath, "r", encoding="utf-8") as f:
        code = f.read()

    prompt_part = {
        "type": "text",
        "text": f"What's the result of this code: {code}. Return only the output without any explanation.",
    }
    # `generate` takes a batch of conversations; a single one-message
    # conversation is sent here.
    code_result = lang_model.generate(
        messages=[[HumanMessage(content=[prompt_part])]]
    )
    # First candidate of the first (only) conversation.
    return code_result.generations[0][0].text
|
|
|
def python_executor(code_str: str) -> str:
    """This executes python code. The code must be a string.
    For any calculations always use numpy."""
    # The docstring is kept verbatim: this function is registered as an agent
    # tool, and LangChain derives the tool description from the docstring.
    #
    # `LocalPythonExecutor` is not among the names imported at the top of the
    # file, so it is imported here; without this the call raises NameError.
    from smolagents import LocalPythonExecutor

    # Only polars and numpy (including submodules) are authorised as
    # additional imports for the executed code.
    lpe = LocalPythonExecutor(additional_authorized_imports=["polars.*", "numpy.*"])
    # Element 0 of the executor's result is presumably the evaluated output;
    # confirm against the installed smolagents version.
    code_res = lpe(code_action=code_str)[0]
    return code_res
|
|
|
# Speech-to-text tool created from the Space identified by `space_id`.
# It is built once at import time and reused by `call_stt_tool`.
stt_tool = Tool.from_space(
    name="stt_tool",
    description="Speech to Text Tool",
    space_id="UNSAFESUPERINTELLIGENCE/Minimum-OpenAI-Whisper",
)
|
|
|
|
|
def call_stt_tool(file_url: str) -> str:
    """Speech to text tool."""
    # The docstring is kept verbatim: this function is registered as an agent
    # tool, and LangChain derives the tool description from the docstring.
    #
    # Thin wrapper: forwards `file_url` to the module-level `stt_tool` and
    # returns whatever it produces.
    return stt_tool(file_url)
|
|
|
|
|
def encode_image_to_base64(image_path: str) -> str:
    """Return the image at ``image_path`` as base64 text of its JPEG encoding.

    The image is opened with Pillow, re-encoded as JPEG in memory and the
    resulting bytes are base64-encoded.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The base64 representation decoded to a UTF-8 ``str`` (not ``bytes``),
        ready to be embedded in a ``data:image/jpeg;base64,...`` URL.
    """
    # Modes Pillow's JPEG writer accepts directly. Anything else (e.g. RGBA
    # or palette PNGs) would make `save(..., format="JPEG")` raise, so those
    # images are converted to RGB first.
    jpeg_modes = {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}

    buffered = BytesIO()
    # The context manager closes the underlying file once encoding is done.
    with Image.open(image_path) as image:
        jpeg_ready = image if image.mode in jpeg_modes else image.convert("RGB")
        jpeg_ready.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
|
|
|
|
def image_tool(file_url: str) -> str:
    """Describes an image in detail."""
    # The docstring is kept verbatim: this function is registered as an agent
    # tool, and LangChain derives the tool description from the docstring.
    #
    # Despite its name, `file_url` is passed to `encode_image_to_base64` as
    # `image_path`, i.e. it is opened as a local image file.
    encoded_image = encode_image_to_base64(image_path=file_url)

    # One user message with two parts: the instruction and the inlined image.
    instruction_part = {
        "type": "text",
        "text": "Describe the image in detail and return only the description without any additional content.",
    }
    picture_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
    }
    request = HumanMessage(content=[instruction_part, picture_part])

    # `generate` takes a batch of conversations; send one and read the first
    # candidate of that single conversation.
    reply = lang_model.generate(messages=[[request]])
    return reply.generations[0][0].text
|
|
|
|
|
|
|
|
|
def youtube_video_tool(url: str, query: str | None = None) -> str:
    """Answers questions about youtube videos.
    URLs must be provided to this tool.
    The question to answer about the video can be passed as `query`."""
    # This function is registered as an agent tool, and LangChain derives the
    # tool description from the docstring above.
    #
    # Videos are not downloaded: each supported URL maps to a local file.
    yt_vid_mapping = {
        "https://www.youtube.com/watch?v=L1vXCYZAYYM": "penguin.mp4",
        "https://www.youtube.com/watch?v=1htKBjuUWec": "coffee.mp4",
    }
    # Question used when the caller supplies none; this is the text that was
    # previously hard-coded for every video.
    default_query = (
        "Examine the video.\n\nWhat does Teal'c say in response to the "
        "question \"Isn't that hot?\""
    )
    # Only every 25th frame is sent to the model.
    frame_step = 25

    if url not in yt_vid_mapping:
        # Same exception type as a plain dict lookup, with a usable message.
        raise KeyError(
            f"No local video for {url!r}; supported URLs: {sorted(yt_vid_mapping)}"
        )
    question = default_query if query is None else query

    video = cv2.VideoCapture(filename=yt_vid_mapping[url])
    base64_frames = []
    try:
        frame_index = 0
        while video.isOpened():
            success, frame = video.read()
            if not success:
                break
            # Encode only the frames that are actually sent (indices 0, 25,
            # 50, ...) instead of encoding all of them and slicing afterwards.
            if frame_index % frame_step == 0:
                _, buffer = cv2.imencode(".jpg", frame)
                base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
            frame_index += 1
    finally:
        # Release the capture even if decoding or encoding raised.
        video.release()

    # One user message: the question followed by the sampled frames as
    # inline JPEG data URLs.
    content = [{"type": "text", "text": question}]
    content.extend(
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{frame}"},
        }
        for frame in base64_frames
    )
    vid_resp = lang_model.generate(messages=[[HumanMessage(content=content)]])
    # First candidate of the first (only) conversation.
    return vid_resp.generations[0][0].text
|
|
|
|
|
def web_search_tool(query: str) -> str:
    """Use a search engine to search the web for general information."""
    # The docstring is kept verbatim: this function is registered as an agent
    # tool, and LangChain derives the tool description from the docstring.
    #
    # A DuckDuckGo search tool configured with max_results=5 is created per
    # call; its result for `query` is returned unchanged.
    search_engine = DuckDuckGoSearchTool(max_results=5)
    return search_engine(query)
|
|
|
|
|
class AnswerFormat(BaseModel):
    """Pydantic model for the answer format instructions.

    Attributes:
        thoughts (str): The reasoning or thoughts before the answer.
        answer (str | int | list[str | int]): The final answer, following strict formatting rules.
    """

    # Both fields are required (their default is Ellipsis). The description
    # strings are reproduced unchanged from the previous definition.
    thoughts: str = Field(
        default=...,
        description="Report your thoughts before the answer.",
    )
    answer: str | int | list[str | int] = Field(
        default=...,
        description=(
            "The answer should be a number (no commas or units), a few words "
            "(no articles or abbreviations), or a comma-separated list of "
            "numbers/strings, following all specified formatting rules."
        ),
    )
|
|
|
|
|
|
|
|
|
# System prompt given to the agent. It is a regular (non-raw) string, so the
# "\n" and "\'" escapes inside the first example are interpreted by Python.
SYSTEM_PROMPT = """
You are an expert agent - please keep going until the user’s query is completely resolved, before ending your turn and yielding back to the user. Only terminate your turn when you are sure that the problem is solved.


# Instructions
- Carefully read and understand the task. Sometimes the task might be a sentence reversed, so un-reverse it first and then complete the task.
- Sometimes the task will be accompanied with a file, and the file name will be provided to you. If no file is provided to you don't try looking for a file, for instance "discography".
- If you are not sure about file content or codebase structure pertaining to the user’s request, use your tools to read files and gather the relevant information: do NOT guess or make up an answer.
- You can use a combination of tools to complete the task, however, you don't have to use the tools all the time.
- Before using any tool always check what's the input/s that the tool expects and provide the input accordingly. Extract any necessary information from the query given to you for the tool call.
- You have access to the following tools: `search_wikipedia`, `visit_web_page`, `read_excel_or_csv`, `python_executor`, `python_code_interpreter`, `call_stt_tool`, `image_tool`, `youtube_video_tool`, `web_search_tool`.
- If a python file is given to you, then use the `python_code_interpreter` and the input to the tool should be the file name.
- For any youtube related task use the `youtube_video_tool` and the input to the tool should be URL as a string along with the query.
- For any dataframe related tasks, always use the `read_excel_or_csv` tool.
- If the `search_wikipedia` tool has provided a page, then no need to call `visit_web_page` for the same wikipedia page, instead use the content that's provided by the `search_wikipedia` tool.
- You MUST plan extensively before each tool call, and reflect extensively on the outcomes of the previous tool calls. DO NOT do this entire process by making tool calls only, as this can impair your ability to solve the problem and think insightfully.
- Always verify your answers.


# Output Format
- YOUR ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
- If you are asked for a string, don't use articles, neither abbreviations(e.g. for cities), and write the digits in plain text unless specified otherwise.
- If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
- Leverage the `AnswerFormat` pydantic class to output the answer.

# Example
## User
Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal\'c say in response to the question "Isn\'t that hot?"

## Assistant
<Internal thought> First let me extract the youtube url and then use the `youtube_video_tool` to answer this query.</Internal thought>
### Tool Calls
youtube_video_tool(https://www.youtube.com/watch?v=1htKBjuUWec)

// After tool call, the assistant would follow up with the response from the tool:

<Internal thought> I need to format the tool response as per the expected output. </Internal thought>

## Assistant response
### Message
"The correct answer."

# Example 2
## User
What's 2 +2 ?

## Assistant response
### Message
4

"""
|
|
|
|
|
# Agent built around the shared chat model. It is given the system prompt,
# must produce its final answer in the `AnswerFormat` structure, and may call
# the tool functions listed below (order kept as originally registered).
agent = create_react_agent(
    model=lang_model,
    prompt=SYSTEM_PROMPT,
    response_format=AnswerFormat,
    tools=[
        search_wikipedia,
        visit_web_page,
        read_excel_or_csv,
        python_executor,
        python_code_interpreter,
        call_stt_tool,
        image_tool,
        youtube_video_tool,
        web_search_tool,
    ],
)
|
|
|
|
|
|
|
|
|
|
|
|
|
async def run_agent():
    """Run the agent on every entry of ``all_questions``, one after another.

    Returns:
        A list with one item per question, in input order: the value returned
        by ``agent.ainvoke`` or, if that run raised ``GraphRecursionError``,
        the question's ``task_id``.
    """
    outcomes = []
    for question in all_questions:
        # A falsy file name is replaced by an explicit "no file" sentence.
        attachment = question["file_name"] or "There's no relevant file to use."
        task_message = (
            f"Complete the following task: {question['question']}. "
            f"Relevant file: {attachment}"
        )
        try:
            outcome = await agent.ainvoke(input={"messages": task_message})
        except GraphRecursionError:
            # Record which task hit the step limit instead of an answer.
            print("❌ Agent stopped due to max iterations.")
            outcome = question["task_id"]
        outcomes.append(outcome)
    return outcomes
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|