import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
import json
import os
from typing import List, Dict

from smolagents import Tool


class ArxivSearchTool(Tool):
    name = "search_arxiv"
    description = "Search ArXiv for papers matching the query"
    # smolagents Tool metadata: each input is described by a type string and a
    # description, and output_type is one of the library's string type names.
    inputs = {
        "query": {"type": "string", "description": "Search query string", "nullable": True},
        "max_results": {"type": "integer", "description": "Maximum number of results to return", "nullable": True},
    }
    output_type = "array"

    def forward(self, query: str = "artificial intelligence",
                max_results: int = 50) -> List[Dict]:
"""Search ArXiv using their API. | |
Args: | |
query: Search query string | |
max_results: Maximum number of results to return | |
Returns: | |
List[Dict]: List of paper results with metadata | |
""" | |
        try:
            # Construct the API URL
            base_url = 'http://export.arxiv.org/api/query?'
            query_params = {
                'search_query': query,
                'start': 0,
                'max_results': max_results
            }
            # Create the full URL
            url = base_url + urllib.parse.urlencode(query_params)

            # Make the request
            response = urllib.request.urlopen(url)
            data = response.read().decode('utf-8')

            # Parse the Atom XML response
            root = ET.fromstring(data)

            # Define the Atom namespaces
            ns = {'atom': 'http://www.w3.org/2005/Atom',
                  'arxiv': 'http://arxiv.org/schemas/atom'}

            results = []
            for entry in root.findall('atom:entry', ns):
                # Extract paper details
                result = {
                    'title': entry.find('atom:title', ns).text.strip(),
                    'authors': [author.find('atom:name', ns).text
                                for author in entry.findall('atom:author', ns)],
                    'summary': (entry.find('atom:summary', ns).text.strip()
                                if entry.find('atom:summary', ns) is not None else ''),
                    'published': entry.find('atom:published', ns).text.strip(),
                    'id': entry.find('atom:id', ns).text.strip(),
                    'pdf_url': next((link.get('href') for link in entry.findall('atom:link', ns)
                                     if link.get('type') == 'application/pdf'), None),
                    'categories': [cat.get('term') for cat in entry.findall('atom:category', ns)]
                }
                results.append(result)
            return results
        except Exception as e:
            return [{"error": f"Error searching ArXiv: {str(e)}"}]
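

# A minimal sketch of wiring this tool into an agent, assuming a recent
# smolagents release where the hosted model class is InferenceClientModel
# (older releases call it HfApiModel):
#
#     from smolagents import CodeAgent, InferenceClientModel
#
#     agent = CodeAgent(tools=[ArxivSearchTool()], model=InferenceClientModel())
#     agent.run("Find recent arXiv papers about tool-using LLM agents")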


class LatestPapersTool(Tool):
    name = "get_latest_papers"
    description = "Get papers from the last N days from saved results"
    inputs = {
        "days_back": {"type": "integer", "description": "Number of most recent days of saved results to load", "nullable": True},
    }
    output_type = "array"

    def forward(self, days_back: int = 1) -> List[Dict]:
        papers = []
        base_dir = "daily_papers"

        # Get dates to check
        dates = [
            (datetime.now() - timedelta(days=i)).strftime("%Y-%m-%d")
            for i in range(days_back)
        ]

        # Load papers for each date
        for date in dates:
            file_path = os.path.join(base_dir, f"ai_papers_{date}.json")
            if os.path.exists(file_path):
                with open(file_path, 'r', encoding='utf-8') as f:
                    day_papers = json.load(f)
                    papers.extend(day_papers)
        return papers


def save_daily_papers(output_dir: str = "daily_papers") -> List[Dict]:
    """Helper function to save daily papers - not exposed as a tool."""
    os.makedirs(output_dir, exist_ok=True)
    today = datetime.now().strftime("%Y-%m-%d")

    arxiv_tool = ArxivSearchTool()
    papers = arxiv_tool(
        query='cat:cs.AI OR cat:cs.LG OR cat:cs.CL OR "artificial intelligence"',
        max_results=100
    )

    # Filter for papers published today
    today_papers = [
        paper for paper in papers
        if paper.get('published', '').startswith(today)
    ]

    output_file = os.path.join(output_dir, f"ai_papers_{today}.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(today_papers, f, indent=2)
    return today_papers
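

# A minimal usage sketch when the module is run as a script (assumption: the
# "daily_papers" directory layout used above; nothing here is part of the
# tools' agent-facing interface). It saves today's papers, then reloads the
# last two days of saved results.
if __name__ == "__main__":
    saved = save_daily_papers()
    print(f"Saved {len(saved)} papers published today")

    latest_tool = LatestPapersTool()
    recent = latest_tool(days_back=2)
    for paper in recent[:5]:
        print(paper.get("title"), "-", paper.get("pdf_url"))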