donb-hf committed
Commit a37fd25 · 1 Parent(s): d7cecb4

simplify code
Files changed (3)
  1. arxiv_fetcher.py +0 -38
  2. arxiv_metadata_service.py +0 -58
  3. config.py +1 -12
arxiv_fetcher.py DELETED
@@ -1,38 +0,0 @@
- # arxiv_fetcher.py
-
- import arxiv
- from typing import List, Dict, Any
- import logging
-
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
- def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
-     logging.info(f"Fetching arXiv metadata for query: {query}")
-     if not query.strip():
-         logging.warning("Empty or whitespace-only query provided")
-         return []
-
-     client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
-     search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate)
-
-     results = []
-     try:
-         for result in client.results(search):
-             metadata = {
-                 "title": result.title,
-                 "authors": [author.name for author in result.authors],
-                 "published": result.published.isoformat(),
-                 "updated": result.updated.isoformat(),
-                 "pdf_url": result.pdf_url,
-                 "entry_id": result.entry_id,
-                 "summary": result.summary,
-                 "categories": result.categories,
-                 "primary_category": result.primary_category,
-                 "html_url": f"http://arxiv.org/abs/{result.entry_id.split('/')[-1]}"
-             }
-             results.append(metadata)
-         logging.info(f"Fetched metadata for {len(results)} papers")
-     except Exception as e:
-         logging.error(f"Error fetching metadata: {str(e)}")
-
-     return results
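
For reference, the deleted module was a thin wrapper around the arxiv client, which is presumably why it could be folded away. The same fetch can be done inline in a few lines; the sketch below uses only the calls the deleted code itself made, and the query string is a hypothetical example:

    import arxiv

    # Same client settings the deleted module used: 3 s delay between pages, 3 retries.
    client = arxiv.Client(page_size=5, delay_seconds=3, num_retries=3)
    search = arxiv.Search(
        query="cat:cs.CL AND ti:transformer",  # hypothetical example query
        max_results=5,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )

    for result in client.results(search):
        # result.entry_id is a full URL; the last path segment is the short arXiv id.
        short_id = result.entry_id.split("/")[-1]
        print(short_id, "|", result.title)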
 
arxiv_metadata_service.py DELETED
@@ -1,58 +0,0 @@
- from arxiv_fetcher import fetch_arxiv_metadata
- from datasets import load_dataset, Dataset
- from huggingface_hub import HfApi
- from config import DATASET_NAME
- import logging
- from typing import List, Dict, Any
-
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
- class ArxivMetadataService:
-     def __init__(self):
-         self.hf_api = HfApi()
-
-     def extract_and_update(self, query: str, max_results: int = 10) -> str:
-         metadata_list = fetch_arxiv_metadata(query, max_results)
-         if not metadata_list:
-             return "No metadata found for the given query."
-         return self.update_dataset(metadata_list)
-
-     def update_dataset(self, metadata_list: List[Dict[str, Any]]) -> str:
-         try:
-             # Load the existing dataset
-             try:
-                 dataset = load_dataset(DATASET_NAME, split="train")
-                 current_data = dataset.to_dict()
-             except Exception:
-                 # If loading fails, start with an empty dictionary
-                 current_data = {}
-
-             # If the dataset is empty, initialize it with the structure from metadata_list
-             if not current_data:
-                 current_data = {key: [] for key in metadata_list[0].keys()}
-
-             updated = False
-             for paper in metadata_list:
-                 entry_id = paper['entry_id'].split('/')[-1]
-                 if 'entry_id' not in current_data or entry_id not in current_data['entry_id']:
-                     # Add new paper
-                     for key, value in paper.items():
-                         current_data.setdefault(key, []).append(value)
-                     updated = True
-                 else:
-                     # Update existing paper
-                     index = current_data['entry_id'].index(entry_id)
-                     for key, value in paper.items():
-                         if current_data[key][index] != value:
-                             current_data[key][index] = value
-                             updated = True
-
-             if updated:
-                 updated_dataset = Dataset.from_dict(current_data)
-                 updated_dataset.push_to_hub(DATASET_NAME, split="train")
-                 return f"Successfully updated dataset with {len(metadata_list)} papers"
-             else:
-                 return "No new data to update."
-         except Exception as e:
-             logging.error(f"Failed to update dataset: {str(e)}")
-             return f"Failed to update dataset: {str(e)}"
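
The heart of the deleted service was a columnar upsert: Dataset.to_dict() returns one list per column, so a new paper appends a value to every column while a known paper is patched in place by index. A standalone sketch of that merge step (plain dicts, no Hub access; sample records are hypothetical, and ids are compared in one consistent form rather than the short-id/full-URL mix in the original):

    from typing import Any, Dict, List

    def upsert_papers(current: Dict[str, List[Any]], papers: List[Dict[str, Any]]) -> bool:
        """Merge paper records into a columnar dict; return True if anything changed."""
        changed = False
        for paper in papers:
            ids = current.get("entry_id", [])
            if paper["entry_id"] not in ids:
                # New paper: append its value to every column.
                for key, value in paper.items():
                    current.setdefault(key, []).append(value)
                changed = True
            else:
                # Known paper: overwrite only the columns whose value differs.
                index = ids.index(paper["entry_id"])
                for key, value in paper.items():
                    if current[key][index] != value:
                        current[key][index] = value
                        changed = True
        return changed

    # Hypothetical sample data.
    columns = {"entry_id": ["2401.00001v1"], "title": ["Old title"]}
    fetched = [{"entry_id": "2401.00001v1", "title": "New title"},
               {"entry_id": "2401.00002v1", "title": "Another paper"}]
    print(upsert_papers(columns, fetched))   # True
    print(columns["title"])                  # ['New title', 'Another paper']

A truthy result then gated Dataset.from_dict(current).push_to_hub(DATASET_NAME, split="train"), exactly as update_dataset did above.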
 
config.py CHANGED
@@ -1,15 +1,4 @@
# File: config.py
import os

- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
- QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
- QDRANT_API_URL = os.getenv("QDRANT_API_URL")
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
- COLLECTION_NAME = "arxiv_papers"
- DATASET_NAME = "dwb2023/arxiv-papers-dataset"
-
- LANGCHAIN_PROJECT="arxiv_papers"
- LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
- LANGCHAIN_TRACING_V2="true"
- LANGCHAIN_HUB_PROMPT="rlm/rag-prompt-llama3"
- LANGCHAIN_API_KEY=os.getenv("LANGCHAIN_API_KEY")
+ DATASET_NAME = "dwb2023/arxiv-papers-dataset"
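
After this commit, config.py carries only the dataset id. A consumer of the simplified config reduces to the following (a minimal sketch, assuming the datasets library is installed and the dataset is readable):

    from datasets import load_dataset
    from config import DATASET_NAME  # "dwb2023/arxiv-papers-dataset"

    # Everything else that used to live in config.py (OpenAI, Qdrant, Groq,
    # LangChain settings) is gone; only the Hub dataset id remains.
    papers = load_dataset(DATASET_NAME, split="train")
    print(papers)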