import gradio as gr
import arxiv
import traceback
import logging
from typing import List, Dict, Any
from datasets import load_dataset, Dataset
from huggingface_hub import HfApi
from config import DATASET_NAME
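
# NOTE: config.py is not included in this file; it is assumed to expose DATASET_NAME
# as the Hugging Face dataset repo id, e.g. (hypothetical value):
#     DATASET_NAME = "your-username/arxiv-metadata"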

# Logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Arxiv Fetcher logic
def fetch_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
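    """Query the arXiv API and return metadata dicts for up to max_results matching papers."""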
    logging.info(f"Fetching arXiv metadata for query: {query}")
    if not query.strip():
        logging.warning("Empty or whitespace-only query provided")
        return []
    
    client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
    search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate)
    
    results = []
    try:
        for result in client.results(search):
            metadata = {
                "title": result.title,
                "authors": [author.name for author in result.authors],
                "published": result.published.isoformat(),
                "updated": result.updated.isoformat(),
                "pdf_url": result.pdf_url,
                "entry_id": result.entry_id,
                "summary": result.summary,
                "categories": result.categories,
                "primary_category": result.primary_category,
                "html_url": f"http://arxiv.org/abs/{result.entry_id.split('/')[-1]}"
            }
            results.append(metadata)
        logging.info(f"Fetched metadata for {len(results)} papers")
    except Exception as e:
        logging.error(f"Error fetching metadata: {str(e)}")
    
    return results
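
# Usage sketch (illustrative only, hypothetical query string):
#     papers = fetch_metadata("cat:cs.CL AND abs:summarization", max_results=5)
#     for p in papers:
#         print(p["title"], p["pdf_url"])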

# Arxiv Metadata Service logic
class ArxivMetadataService:
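    """Fetches arXiv metadata and keeps the configured Hugging Face dataset in sync with it."""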
    def __init__(self):
        self.hf_api = HfApi()

    def extract_metadata_and_update_dataset(self, query: str, max_results: int = 10) -> str:
        metadata_list = fetch_metadata(query, max_results)
        if not metadata_list:
            return "No metadata found for the given query."
        return self.update_dataset(metadata_list)

    def update_dataset(self, metadata_list: List[Dict[str, Any]]) -> str:
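        """Merge the given paper metadata into the Hub dataset, adding new rows and updating existing ones."""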
        try:
            # Load the existing dataset
            try:
                dataset = load_dataset(DATASET_NAME, split="train")
                current_data = dataset.to_dict()
            except Exception:
                # If loading fails, start with an empty dictionary
                current_data = {}

            # If the dataset is empty, initialize it with the structure from metadata_list
            if not current_data:
                current_data = {key: [] for key in metadata_list[0].keys()}

            updated = False
            # Stored entry_id values are full URLs (e.g. "http://arxiv.org/abs/2301.00001v1"),
            # so compare on the short arXiv ID to avoid re-adding papers that already exist.
            existing_ids = [eid.split('/')[-1] for eid in current_data.get('entry_id', [])]
            for paper in metadata_list:
                entry_id = paper['entry_id'].split('/')[-1]
                if entry_id not in existing_ids:
                    # Add new paper
                    for key, value in paper.items():
                        current_data.setdefault(key, []).append(value)
                    existing_ids.append(entry_id)
                    updated = True
                else:
                    # Update existing paper in place
                    index = existing_ids.index(entry_id)
                    for key, value in paper.items():
                        if current_data[key][index] != value:
                            current_data[key][index] = value
                            updated = True

            if updated:
                updated_dataset = Dataset.from_dict(current_data)
                updated_dataset.push_to_hub(DATASET_NAME, split="train")
                return f"Successfully updated dataset with {len(metadata_list)} papers"
            else:
                return "No new data to update."
        except Exception as e:
            logging.error(f"Failed to update dataset: {str(e)}")
            return f"Failed to update dataset: {str(e)}"

    def get_dataset_records(self):
        try:
            dataset = load_dataset(DATASET_NAME, split="train")
            records = dataset.to_pandas().to_dict(orient="records")
            return records
        except Exception as e:
            return f"Error loading dataset: {str(e)}"

# Initialize Arxiv Metadata Service
arxiv_service = ArxivMetadataService()
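
# Sketch of driving the service without the UI (assumes Hugging Face credentials are
# already configured, e.g. via `huggingface-cli login`; query string is hypothetical):
#     arxiv_service.extract_metadata_and_update_dataset("cat:cs.LG", max_results=5)
#     arxiv_service.get_dataset_records()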

# Define Gradio functions
def handle_metadata_extraction(query: str, max_results: int):
    try:
        result = arxiv_service.extract_metadata_and_update_dataset(query, max_results)
        logging.info(f"Extraction result: {result}")
        return result
    except Exception as e:
        error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        logging.error(error_msg)
        return error_msg

def handle_dataset_view():
    try:
        records = arxiv_service.get_dataset_records()
        return records
    except Exception as e:
        return f"Error loading dataset: {str(e)}"

# Define Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        f"""Extract metadata from ArXiv papers and update the dataset.
        \n\nCurrently leverages the following datasets:
        \n- [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}/viewer) dataset.
        """
    )
    
    with gr.Tab("Extract Metadata"):
        query_input = gr.Textbox(label="ArXiv Query")
        max_results = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Max Results")
        submit_button = gr.Button("Extract Metadata")
        output = gr.Textbox(label="Result")
        
        submit_button.click(
            fn=handle_metadata_extraction,
            inputs=[query_input, max_results],
            outputs=output
        )
    
    with gr.Tab("View Dataset"):
        refresh_button = gr.Button("Refresh Dataset Info")
        dataset_info = gr.JSON(label="Dataset Info")
        
        refresh_button.click(
            fn=handle_dataset_view,
            inputs=[],
            outputs=dataset_info
        )

if __name__ == "__main__":
    demo.queue()
    demo.launch()