File size: 1,604 Bytes
edd8809
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from arxiv_fetcher import fetch_arxiv_metadata
from datasets import load_dataset, Dataset
from config import DATASET_NAME
import logging
from typing import List, Dict, Any

# Configure the root logger at import time: INFO and above, with each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

class ArxivMetadataService:
    """Merge arXiv paper metadata into the hub dataset named by ``DATASET_NAME``."""

    def extract_and_update(self, query: str, max_results: int = 10) -> str:
        """Fetch metadata for *query* and merge it into the dataset.

        Args:
            query: Forwarded unchanged to ``fetch_arxiv_metadata``.
            max_results: Forwarded unchanged to ``fetch_arxiv_metadata``
                (presumably an upper bound on the number of papers returned;
                confirm against that helper).

        Returns:
            The status message produced by :meth:`update_dataset`.
        """
        metadata_list = fetch_arxiv_metadata(query, max_results)
        return self.update_dataset(metadata_list)

    def update_dataset(self, metadata_list: List[Dict[str, Any]]) -> str:
        """Upsert *metadata_list* into the ``train`` split and push it to the hub.

        Papers whose ``'id'`` already exists in the dataset overwrite the
        matching row's fields; all other papers are appended as new rows.

        Args:
            metadata_list: One dict per paper. Each dict must contain an
                ``'id'`` key; the remaining keys become dataset columns.

        Returns:
            A human-readable status string. Failures are logged and reported
            through the returned string rather than raised.
        """
        try:
            if not metadata_list:
                # Nothing to merge: skip the download/upload round trip.
                return f"Successfully updated dataset with {len(metadata_list)} papers"

            dataset = load_dataset(DATASET_NAME, split="train")
            merged = self._merge_papers(dataset.to_dict(), metadata_list)

            updated_dataset = Dataset.from_dict(merged)
            updated_dataset.push_to_hub(DATASET_NAME, split="train")

            return f"Successfully updated dataset with {len(metadata_list)} papers"
        except Exception as e:
            # Boundary handler: callers expect a status string, never a raise.
            # logging.exception keeps the traceback that logging.error dropped.
            logging.exception("Failed to update dataset: %s", e)
            return f"Failed to update dataset: {e}"

    @staticmethod
    def _merge_papers(
        current_data: Dict[str, List[Any]],
        metadata_list: List[Dict[str, Any]],
    ) -> Dict[str, List[Any]]:
        """Return a column dict with every paper in *metadata_list* upserted.

        All returned columns have the same length: a cell that no paper
        supplied a value for is filled with ``None``. *current_data* itself is
        not modified.

        Raises:
            KeyError: If a paper has no ``'id'`` key.
        """
        # Copy the column lists so the caller's data is left untouched.
        columns = {key: list(values) for key, values in current_data.items()}
        row_count = max((len(values) for values in columns.values()), default=0)

        ids = columns.setdefault('id', [])
        # Keep the id column aligned even if the source data had none.
        ids.extend([None] * (row_count - len(ids)))

        # Map each id to its first row (same row list.index would pick) so
        # lookups are O(1) instead of a scan per paper. Assumes ids are
        # hashable (arXiv ids are presumably strings).
        row_by_id: Dict[Any, int] = {}
        for position, paper_id in enumerate(ids):
            row_by_id.setdefault(paper_id, position)

        for paper in metadata_list:
            paper_id = paper['id']
            row = row_by_id.get(paper_id)
            if row is None:
                # New paper: open a fresh row across every existing column.
                row = row_count
                row_count += 1
                row_by_id[paper_id] = row
                for values in columns.values():
                    values.append(None)

            for key, value in paper.items():
                column = columns.get(key)
                if column is None:
                    # First time this field is seen: back-fill earlier rows.
                    column = columns[key] = [None] * row_count
                column[row] = value

        return columns