import urllib.request
import urllib.parse  # urlencode lives here; import it explicitly rather than relying on urllib.request pulling it in
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
import json
import os
from typing import List, Dict

from smolagents import Tool


class ArxivSearchTool(Tool):
    name = "search_arxiv"
    description = "Search ArXiv for papers matching the query"
    # smolagents tools declare their interface via `inputs` and `output_type`
    inputs = {
        "query": {"type": "string", "description": "Search query string", "nullable": True},
        "max_results": {"type": "integer", "description": "Maximum number of results to return", "nullable": True},
    }
    output_type = "array"

    def forward(self, query: str = "artificial intelligence",
                max_results: int = 50) -> List[Dict]:
        """Search ArXiv using its public API.

        Args:
            query: Search query string
            max_results: Maximum number of results to return

        Returns:
            List[Dict]: List of paper results with metadata
        """
        try:
            # Construct the API URL
            base_url = 'http://export.arxiv.org/api/query?'
            query_params = {
                'search_query': query,
                'start': 0,
                'max_results': max_results
            }
            # Create the full URL
            url = base_url + urllib.parse.urlencode(query_params)

            # Make the request
            response = urllib.request.urlopen(url)
            data = response.read().decode('utf-8')

            # Parse the Atom XML response
            root = ET.fromstring(data)

            # Define the Atom namespaces
            ns = {'atom': 'http://www.w3.org/2005/Atom',
                  'arxiv': 'http://arxiv.org/schemas/atom'}

            results = []
            for entry in root.findall('atom:entry', ns):
                # Extract paper details
                result = {
                    'title': entry.find('atom:title', ns).text.strip(),
                    'authors': [author.find('atom:name', ns).text
                                for author in entry.findall('atom:author', ns)],
                    'summary': entry.find('atom:summary', ns).text.strip()
                               if entry.find('atom:summary', ns) is not None else '',
                    'published': entry.find('atom:published', ns).text.strip(),
                    'id': entry.find('atom:id', ns).text.strip(),
                    'pdf_url': next((link.get('href')
                                     for link in entry.findall('atom:link', ns)
                                     if link.get('type') == 'application/pdf'), None),
                    'categories': [cat.get('term')
                                   for cat in entry.findall('atom:category', ns)]
                }
                results.append(result)
            return results
        except Exception as e:
            return [{"error": f"Error searching ArXiv: {str(e)}"}]


class LatestPapersTool(Tool):
    name = "get_latest_papers"
    description = "Get papers from the last N days from saved results"
    inputs = {
        "days_back": {"type": "integer", "description": "How many days of saved results to load", "nullable": True},
    }
    output_type = "array"

    def forward(self, days_back: int = 1) -> List[Dict]:
        papers = []
        base_dir = "daily_papers"

        # Dates to check, starting from today
        dates = [
            (datetime.now() - timedelta(days=i)).strftime("%Y-%m-%d")
            for i in range(days_back)
        ]

        # Load the saved papers for each date, skipping days without a file
        for date in dates:
            file_path = os.path.join(base_dir, f"ai_papers_{date}.json")
            if os.path.exists(file_path):
                with open(file_path, 'r', encoding='utf-8') as f:
                    day_papers = json.load(f)
                papers.extend(day_papers)
        return papers


def save_daily_papers(output_dir: str = "daily_papers") -> List[Dict]:
    """Helper function to save today's papers to disk - not exposed as a tool."""
    os.makedirs(output_dir, exist_ok=True)
    today = datetime.now().strftime("%Y-%m-%d")

    arxiv_tool = ArxivSearchTool()
    papers = arxiv_tool(
        query='cat:cs.AI OR cat:cs.LG OR cat:cs.CL OR "artificial intelligence"',
        max_results=100
    )

    # Keep only papers whose published timestamp falls on today's date
    today_papers = [
        paper for paper in papers
        if paper.get('published', '').startswith(today)
    ]

    output_file = os.path.join(output_dir, f"ai_papers_{today}.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(today_papers, f, indent=2)

    return today_papers
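

# Usage sketch (an assumption about how these tools might be wired together,
# not part of the original module): run save_daily_papers() on a schedule,
# e.g. a daily cron job, then query the cached results with LatestPapersTool.
# Both tool classes can also be passed to a smolagents agent via its
# `tools=[...]` argument.
if __name__ == "__main__":
    todays = save_daily_papers()
    print(f"Saved {len(todays)} papers published today")

    latest = LatestPapersTool()(days_back=2)
    print(f"Loaded {len(latest)} papers from the last 2 days")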