donb-hf committed
Commit d7cecb4 · Parent(s): 19ab6fa

simplify app.py

Files changed (1)
  1. app.py +110 -12
app.py CHANGED
@@ -1,17 +1,113 @@
 import gradio as gr
-from arxiv_metadata_service import ArxivMetadataService
+import arxiv
 import traceback
 import logging
+from typing import List, Dict, Any
+from datasets import load_dataset, Dataset
+from huggingface_hub import HfApi
 from config import DATASET_NAME
-from datasets import load_dataset
 
+# Logging setup
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
+# Arxiv Fetcher logic
+def fetch_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
+    logging.info(f"Fetching arXiv metadata for query: {query}")
+    if not query.strip():
+        logging.warning("Empty or whitespace-only query provided")
+        return []
+
+    client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
+    search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate)
+
+    results = []
+    try:
+        for result in client.results(search):
+            metadata = {
+                "title": result.title,
+                "authors": [author.name for author in result.authors],
+                "published": result.published.isoformat(),
+                "updated": result.updated.isoformat(),
+                "pdf_url": result.pdf_url,
+                "entry_id": result.entry_id,
+                "summary": result.summary,
+                "categories": result.categories,
+                "primary_category": result.primary_category,
+                "html_url": f"http://arxiv.org/abs/{result.entry_id.split('/')[-1]}"
+            }
+            results.append(metadata)
+        logging.info(f"Fetched metadata for {len(results)} papers")
+    except Exception as e:
+        logging.error(f"Error fetching metadata: {str(e)}")
+
+    return results
+
+# Arxiv Metadata Service logic
+class ArxivMetadataService:
+    def __init__(self):
+        self.hf_api = HfApi()
+
+    def extract_metadata_and_update_dataset(self, query: str, max_results: int = 10) -> str:
+        metadata_list = fetch_metadata(query, max_results)
+        if not metadata_list:
+            return "No metadata found for the given query."
+        return self.update_dataset(metadata_list)
+
+    def update_dataset(self, metadata_list: List[Dict[str, Any]]) -> str:
+        try:
+            # Load the existing dataset
+            try:
+                dataset = load_dataset(DATASET_NAME, split="train")
+                current_data = dataset.to_dict()
+            except Exception:
+                # If loading fails, start with an empty dictionary
+                current_data = {}
+
+            # If the dataset is empty, initialize it with the structure from metadata_list
+            if not current_data:
+                current_data = {key: [] for key in metadata_list[0].keys()}
+
+            updated = False
+            for paper in metadata_list:
+                entry_id = paper['entry_id'].split('/')[-1]
+                if 'entry_id' not in current_data or entry_id not in current_data['entry_id']:
+                    # Add new paper
+                    for key, value in paper.items():
+                        current_data.setdefault(key, []).append(value)
+                    updated = True
+                else:
+                    # Update existing paper
+                    index = current_data['entry_id'].index(entry_id)
+                    for key, value in paper.items():
+                        if current_data[key][index] != value:
+                            current_data[key][index] = value
+                            updated = True
+
+            if updated:
+                updated_dataset = Dataset.from_dict(current_data)
+                updated_dataset.push_to_hub(DATASET_NAME, split="train")
+                return f"Successfully updated dataset with {len(metadata_list)} papers"
+            else:
+                return "No new data to update."
+        except Exception as e:
+            logging.error(f"Failed to update dataset: {str(e)}")
+            return f"Failed to update dataset: {str(e)}"
+
+    def get_dataset_records(self):
+        try:
+            dataset = load_dataset(DATASET_NAME, split="train")
+            records = dataset.to_pandas().to_dict(orient="records")
+            return records
+        except Exception as e:
+            return f"Error loading dataset: {str(e)}"
+
+# Initialize Arxiv Metadata Service
 arxiv_service = ArxivMetadataService()
 
-def extract_metadata(query: str, max_results: int):
+# Define Gradio functions
+def handle_metadata_extraction(query: str, max_results: int):
     try:
-        result = arxiv_service.extract_and_update(query, max_results)
+        result = arxiv_service.extract_metadata_and_update_dataset(query, max_results)
         logging.info(f"Extraction result: {result}")
         return result
     except Exception as e:
@@ -19,20 +115,21 @@ def extract_metadata(query: str, max_results: int):
         logging.error(error_msg)
         return error_msg
 
-def load_dataset_info():
+def handle_dataset_view():
     try:
-        dataset = load_dataset(DATASET_NAME, split="train")
-        return f"Dataset contains {len(dataset)} records."
+        records = arxiv_service.get_dataset_records()
+        return records
     except Exception as e:
         return f"Error loading dataset: {str(e)}"
 
+# Define Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown(
         f"""Extract metadata from ArXiv papers and update the dataset.
         \n\nCurrently leverages the following datasets:
         \n- [{DATASET_NAME}](https://huggingface.co/datasets/{DATASET_NAME}/viewer) dataset.
         """
-    )
+    )
 
     with gr.Tab("Extract Metadata"):
         query_input = gr.Textbox(label="ArXiv Query")
@@ -41,20 +138,21 @@ with gr.Blocks() as demo:
         output = gr.Textbox(label="Result")
 
         submit_button.click(
-            fn=extract_metadata,
+            fn=handle_metadata_extraction,
             inputs=[query_input, max_results],
             outputs=output
         )
 
     with gr.Tab("View Dataset"):
         refresh_button = gr.Button("Refresh Dataset Info")
-        dataset_info = gr.Textbox(label="Dataset Info")
+        dataset_info = gr.JSON(label="Dataset Info")
 
         refresh_button.click(
-            fn=load_dataset_info,
+            fn=handle_dataset_view,
            inputs=[],
            outputs=dataset_info
        )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.queue()
+    demo.launch()
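
For reference, a quick local smoke test of the consolidated module might look like the following. This is a sketch, not part of the commit: it assumes the `arxiv` and `datasets` packages are installed, that `config.DATASET_NAME` resolves, and that importing app.py only builds the Blocks UI without launching it (demo.launch() is guarded by __main__); the query string is purely illustrative.

# Hypothetical smoke test for the new single-file app.py
from app import fetch_metadata

papers = fetch_metadata("cat:cs.LG", max_results=3)  # illustrative query only
for paper in papers:
    print(paper["published"], paper["entry_id"], paper["title"])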