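"""Smolagents tools for searching the ArXiv API and for reading back papers
saved by the save_daily_papers helper."""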
import urllib.request
import urllib.parse
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
import json
import os
from typing import List, Dict
from smolagents import Tool

class ArxivSearchTool(Tool):
    name = "search_arxiv"
    description = "Search ArXiv for papers matching the query"
    inputs = {
        "query": {"type": "string", "description": "Search query string", "nullable": True},
        "max_results": {"type": "integer", "description": "Maximum number of results to return", "nullable": True},
    }
    output_type = "array"

    def forward(self, query: str = "artificial intelligence",
                max_results: int = 50) -> List[Dict]:
        """Search ArXiv using their API.
        
        Args:
            query: Search query string
            max_results: Maximum number of results to return
            
        Returns:
            List[Dict]: List of paper results with metadata
        """
        try:
            # Construct the API URL
            base_url = 'http://export.arxiv.org/api/query?'
            query_params = {
                'search_query': query,
                'start': 0,
                'max_results': max_results
            }
            
            # Create the full URL
            url = base_url + urllib.parse.urlencode(query_params)
            
            # Make the request
            response = urllib.request.urlopen(url)
            data = response.read().decode('utf-8')
            
            # Parse the Atom XML response
            root = ET.fromstring(data)
            
            # Define the Atom namespace
            ns = {'atom': 'http://www.w3.org/2005/Atom',
                  'arxiv': 'http://arxiv.org/schemas/atom'}
            
            results = []
            for entry in root.findall('atom:entry', ns):
                # Extract paper details
                result = {
                    'title': entry.find('atom:title', ns).text.strip(),
                    'authors': [author.find('atom:name', ns).text 
                              for author in entry.findall('atom:author', ns)],
                    'summary': entry.find('atom:summary', ns).text.strip() if entry.find('atom:summary', ns) is not None else '',
                    'published': entry.find('atom:published', ns).text.strip(),
                    'id': entry.find('atom:id', ns).text.strip(),
                    'pdf_url': next((link.get('href') for link in entry.findall('atom:link', ns) 
                                   if link.get('type') == 'application/pdf'), None),
                    'categories': [cat.get('term') for cat in entry.findall('atom:category', ns)]
                }
                results.append(result)
                
            return results
        except Exception as e:
            return [{"error": f"Error searching ArXiv: {str(e)}"}]

class LatestPapersTool(Tool):
    name = "get_latest_papers"
    description = "Get papers from the last N days from saved results"
    inputs = {
        "days_back": {"type": "integer", "description": "Number of past days (including today) to load", "nullable": True},
    }
    output_type = "array"

    def forward(self, days_back: int = 1) -> List[Dict]:
        """Load papers saved by save_daily_papers over the last `days_back` days."""
        papers = []
        base_dir = "daily_papers"
        
        # Get dates to check
        dates = [
            (datetime.now() - timedelta(days=i)).strftime("%Y-%m-%d")
            for i in range(days_back)
        ]
        
        # Load papers for each date
        for date in dates:
            file_path = os.path.join(base_dir, f"ai_papers_{date}.json")
            if os.path.exists(file_path):
                with open(file_path, 'r', encoding='utf-8') as f:
                    day_papers = json.load(f)
                    papers.extend(day_papers)
        
        return papers

def save_daily_papers(output_dir: str = "daily_papers") -> List[Dict]:
    """Helper function to save daily papers - not exposed as a tool"""
    os.makedirs(output_dir, exist_ok=True)
    today = datetime.now().strftime("%Y-%m-%d")
    
    arxiv_tool = ArxivSearchTool()
    papers = arxiv_tool(
        query='cat:cs.AI OR cat:cs.LG OR cat:cs.CL OR "artificial intelligence"',
        max_results=100
    )
    
    # Filter for papers published today
    today_papers = [
        paper for paper in papers 
        if paper.get('published', '').startswith(today)
    ]
    
    output_file = os.path.join(output_dir, f"ai_papers_{today}.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(today_papers, f, indent=2)
    
    return today_papers
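
# Minimal usage sketch (an assumption about how the module might be run, not
# part of the tools themselves): save today's papers, then read back everything
# stored for the last two days.
if __name__ == "__main__":
    saved = save_daily_papers()
    print(f"Saved {len(saved)} papers published today")

    latest_tool = LatestPapersTool()
    recent = latest_tool(days_back=2)
    print(f"Loaded {len(recent)} papers from the last 2 days")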