donb-hf committed on
Commit
660b29c
·
1 Parent(s): 97e8d87

update ds mgmt

Browse files
Files changed (1) hide show
  1. dataset_management_service.py +23 -7
dataset_management_service.py CHANGED
@@ -7,20 +7,31 @@ class DatasetManagementService:
7
 
8
  def update_dataset(self, new_metadata: List[Dict[str, Any]]) -> str:
9
  try:
10
- dataset = load_dataset(self.dataset_name, split="train")
11
- current_data = dataset.to_dict()
 
 
 
 
 
12
 
 
13
  if not current_data:
14
  current_data = {key: [] for key in new_metadata[0].keys()}
15
 
16
  updated = False
17
  for paper in new_metadata:
18
- entry_id = paper['entry_id'].split('/')[-1]
19
- if 'entry_id' not in current_data or entry_id not in current_data['entry_id']:
 
 
 
 
20
  for key, value in paper.items():
21
  current_data.setdefault(key, []).append(value)
22
  updated = True
23
  else:
 
24
  index = current_data['entry_id'].index(entry_id)
25
  for key, value in paper.items():
26
  if current_data[key][index] != value:
@@ -37,10 +48,15 @@ class DatasetManagementService:
37
  return f"Failed to update dataset: {str(e)}"
38
 
39
  def get_dataset_records(self) -> List[Dict[str, Any]]:
40
- dataset = load_dataset(self.dataset_name, split="train")
41
- return dataset.to_pandas().to_dict(orient="records")
 
 
 
 
 
42
 
43
  # Usage:
44
  # dataset_service = DatasetManagementService("your_dataset_name")
45
  # result = dataset_service.update_dataset(new_metadata)
46
- # records = dataset_service.get_dataset_records()
 
7
 
8
  def update_dataset(self, new_metadata: List[Dict[str, Any]]) -> str:
9
  try:
10
+ # Try to load the existing dataset
11
+ try:
12
+ dataset = load_dataset(self.dataset_name, split="train")
13
+ current_data = dataset.to_dict()
14
+ except Exception:
15
+ # If loading fails, start with an empty dictionary
16
+ current_data = {}
17
 
18
+ # If the dataset is empty, initialize it with the structure from new_metadata
19
  if not current_data:
20
  current_data = {key: [] for key in new_metadata[0].keys()}
21
 
22
  updated = False
23
  for paper in new_metadata:
24
+ entry_id = paper['entry_id']
25
+ if 'entry_id' not in current_data:
26
+ current_data['entry_id'] = []
27
+
28
+ if entry_id not in current_data['entry_id']:
29
+ # Add new paper
30
  for key, value in paper.items():
31
  current_data.setdefault(key, []).append(value)
32
  updated = True
33
  else:
34
+ # Update existing paper
35
  index = current_data['entry_id'].index(entry_id)
36
  for key, value in paper.items():
37
  if current_data[key][index] != value:
 
48
  return f"Failed to update dataset: {str(e)}"
49
 
50
  def get_dataset_records(self) -> List[Dict[str, Any]]:
51
+ try:
52
+ dataset = load_dataset(self.dataset_name, split="train")
53
+ if len(dataset) == 0:
54
+ return []
55
+ return dataset.to_pandas().to_dict(orient="records")
56
+ except Exception as e:
57
+ return [{"error": f"Error loading dataset: {str(e)}"}]
58
 
59
  # Usage:
60
  # dataset_service = DatasetManagementService("your_dataset_name")
61
  # result = dataset_service.update_dataset(new_metadata)
62
+ # records = dataset_service.get_dataset_records()