simplify code
- arxiv_fetcher.py +0 -38
- arxiv_metadata_service.py +0 -58
- config.py +1 -12
arxiv_fetcher.py
DELETED
@@ -1,38 +0,0 @@
-# arxiv_fetcher.py
-
-import arxiv
-from typing import List, Dict, Any
-import logging
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
-def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
-    logging.info(f"Fetching arXiv metadata for query: {query}")
-    if not query.strip():
-        logging.warning("Empty or whitespace-only query provided")
-        return []
-
-    client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
-    search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate)
-
-    results = []
-    try:
-        for result in client.results(search):
-            metadata = {
-                "title": result.title,
-                "authors": [author.name for author in result.authors],
-                "published": result.published.isoformat(),
-                "updated": result.updated.isoformat(),
-                "pdf_url": result.pdf_url,
-                "entry_id": result.entry_id,
-                "summary": result.summary,
-                "categories": result.categories,
-                "primary_category": result.primary_category,
-                "html_url": f"http://arxiv.org/abs/{result.entry_id.split('/')[-1]}"
-            }
-            results.append(metadata)
-        logging.info(f"Fetched metadata for {len(results)} papers")
-    except Exception as e:
-        logging.error(f"Error fetching metadata: {str(e)}")
-
-    return results
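For reference, the deleted fetcher was self-contained; a minimal usage sketch (the query string and result count are illustrative, and the module must still be on the import path):

    from arxiv_fetcher import fetch_arxiv_metadata

    # Illustrative query; any arXiv search string works the same way.
    papers = fetch_arxiv_metadata("cat:cs.CL", max_results=5)
    for paper in papers:
        print(paper["title"], paper["html_url"])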
arxiv_metadata_service.py
DELETED
@@ -1,58 +0,0 @@
-from arxiv_fetcher import fetch_arxiv_metadata
-from datasets import load_dataset, Dataset
-from huggingface_hub import HfApi
-from config import DATASET_NAME
-import logging
-from typing import List, Dict, Any
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
-class ArxivMetadataService:
-    def __init__(self):
-        self.hf_api = HfApi()
-
-    def extract_and_update(self, query: str, max_results: int = 10) -> str:
-        metadata_list = fetch_arxiv_metadata(query, max_results)
-        if not metadata_list:
-            return "No metadata found for the given query."
-        return self.update_dataset(metadata_list)
-
-    def update_dataset(self, metadata_list: List[Dict[str, Any]]) -> str:
-        try:
-            # Load the existing dataset
-            try:
-                dataset = load_dataset(DATASET_NAME, split="train")
-                current_data = dataset.to_dict()
-            except Exception:
-                # If loading fails, start with an empty dictionary
-                current_data = {}
-
-            # If the dataset is empty, initialize it with the structure from metadata_list
-            if not current_data:
-                current_data = {key: [] for key in metadata_list[0].keys()}
-
-            updated = False
-            for paper in metadata_list:
-                entry_id = paper['entry_id'].split('/')[-1]
-                if 'entry_id' not in current_data or entry_id not in current_data['entry_id']:
-                    # Add new paper
-                    for key, value in paper.items():
-                        current_data.setdefault(key, []).append(value)
-                    updated = True
-                else:
-                    # Update existing paper
-                    index = current_data['entry_id'].index(entry_id)
-                    for key, value in paper.items():
-                        if current_data[key][index] != value:
-                            current_data[key][index] = value
-                            updated = True
-
-            if updated:
-                updated_dataset = Dataset.from_dict(current_data)
-                updated_dataset.push_to_hub(DATASET_NAME, split="train")
-                return f"Successfully updated dataset with {len(metadata_list)} papers"
-            else:
-                return "No new data to update."
-        except Exception as e:
-            logging.error(f"Failed to update dataset: {str(e)}")
-            return f"Failed to update dataset: {str(e)}"
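For reference, a minimal sketch of how the deleted service was driven (query and result count are illustrative; push_to_hub assumes valid Hub credentials in the environment):

    from arxiv_metadata_service import ArxivMetadataService

    service = ArxivMetadataService()
    # Fetches metadata and pushes any new or changed rows to the Hub dataset.
    status = service.extract_and_update("cat:cs.AI", max_results=10)
    print(status)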
config.py
CHANGED
@@ -1,15 +1,4 @@
 # File: config.py
 import os
 
-
-QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
-QDRANT_API_URL = os.getenv("QDRANT_API_URL")
-GROQ_API_KEY = os.getenv("GROQ_API_KEY")
-COLLECTION_NAME = "arxiv_papers"
-DATASET_NAME = "dwb2023/arxiv-papers-dataset"
-
-LANGCHAIN_PROJECT="arxiv_papers"
-LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
-LANGCHAIN_TRACING_V2="true"
-LANGCHAIN_HUB_PROMPT="rlm/rag-prompt-llama3"
-LANGCHAIN_API_KEY=os.getenv("LANGCHAIN_API_KEY")
+DATASET_NAME = "dwb2023/arxiv-papers-dataset"
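After this change, config.py exposes only the dataset identifier; a minimal sketch of a consumer (assumes the dataset exists on the Hub and is readable):

    from datasets import load_dataset
    from config import DATASET_NAME

    # Load the train split of the arXiv papers dataset referenced by the config.
    ds = load_dataset(DATASET_NAME, split="train")
    print(ds)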