# tools/arxiv_tool.py
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
import json
import os
from typing import List, Dict
from smolagents import Tool


class ArxivSearchTool(Tool):
    name = "search_arxiv"
    description = "Search ArXiv for papers matching the query"
    inputs = {
        "query": {"type": "string", "description": "Search query string (ArXiv query syntax)", "nullable": True},
        "max_results": {"type": "integer", "description": "Maximum number of results to return", "nullable": True},
    }
    output_type = "array"

    def forward(self, query: str = "artificial intelligence",
                max_results: int = 50) -> List[Dict]:
"""Search ArXiv using their API.
Args:
query: Search query string
max_results: Maximum number of results to return
Returns:
List[Dict]: List of paper results with metadata
"""
        try:
            # Construct the API URL
            base_url = 'https://export.arxiv.org/api/query?'
            query_params = {
                'search_query': query,
                'start': 0,
                'max_results': max_results,
                # Sort newest-first so recent submissions surface for the
                # daily snapshot built by save_daily_papers below.
                'sortBy': 'submittedDate',
                'sortOrder': 'descending'
            }
            # Create the full URL
            url = base_url + urllib.parse.urlencode(query_params)
            # Make the request
            with urllib.request.urlopen(url, timeout=30) as response:
                data = response.read().decode('utf-8')
# Parse the Atom XML response
root = ET.fromstring(data)
# Define the Atom namespace
ns = {'atom': 'http://www.w3.org/2005/Atom',
'arxiv': 'http://arxiv.org/schemas/atom'}
results = []
for entry in root.findall('atom:entry', ns):
# Extract paper details
result = {
'title': entry.find('atom:title', ns).text.strip(),
'authors': [author.find('atom:name', ns).text
for author in entry.findall('atom:author', ns)],
'summary': entry.find('atom:summary', ns).text.strip() if entry.find('atom:summary', ns) is not None else '',
'published': entry.find('atom:published', ns).text.strip(),
'id': entry.find('atom:id', ns).text.strip(),
'pdf_url': next((link.get('href') for link in entry.findall('atom:link', ns)
if link.get('type') == 'application/pdf'), None),
'categories': [cat.get('term') for cat in entry.findall('atom:category', ns)]
}
results.append(result)
return results
except Exception as e:
return [{"error": f"Error searching ArXiv: {str(e)}"}]
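

# These tools can be handed to a smolagents agent, e.g.
#   agent = CodeAgent(tools=[ArxivSearchTool(), LatestPapersTool()], model=model)
# (CodeAgent and `model` here are illustrative; any smolagents agent that
# accepts a tools list works the same way.)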


class LatestPapersTool(Tool):
    name = "get_latest_papers"
    description = "Get papers from the last N days from saved results"
    inputs = {
        "days_back": {"type": "integer", "description": "Number of days back to load saved papers for", "nullable": True},
    }
    output_type = "array"

    def forward(self, days_back: int = 1) -> List[Dict]:
papers = []
base_dir = "daily_papers"
# Get dates to check
dates = [
(datetime.now() - timedelta(days=i)).strftime("%Y-%m-%d")
for i in range(days_back)
]
# Load papers for each date
for date in dates:
file_path = os.path.join(base_dir, f"ai_papers_{date}.json")
if os.path.exists(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
day_papers = json.load(f)
papers.extend(day_papers)
return papers
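

# Note: per the ArXiv API docs, search_query supports fielded terms and
# boolean operators, e.g. 'ti:transformer AND cat:cs.LG'. Field prefixes
# include ti: (title), au: (author), abs: (abstract), cat: (category), and
# all: (all fields); quoted phrases work within a field, as in
# all:"artificial intelligence". The query below relies on this syntax.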
def save_daily_papers(output_dir: str = "daily_papers") -> List[Dict]:
"""Helper function to save daily papers - not exposed as a tool"""
os.makedirs(output_dir, exist_ok=True)
today = datetime.now().strftime("%Y-%m-%d")
arxiv_tool = ArxivSearchTool()
papers = arxiv_tool(
        query='cat:cs.AI OR cat:cs.LG OR cat:cs.CL OR all:"artificial intelligence"',
max_results=100
)
# Filter for papers published today
today_papers = [
paper for paper in papers
if paper.get('published', '').startswith(today)
]
output_file = os.path.join(output_dir, f"ai_papers_{today}.json")
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(today_papers, f, indent=2)
return today_papers
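

if __name__ == "__main__":
    # Illustrative smoke test, assuming network access to export.arxiv.org:
    # run a small search and print the titles (or the error message) returned.
    tool = ArxivSearchTool()
    for paper in tool(query="cat:cs.AI", max_results=5):
        print(paper.get("title", paper.get("error")))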