Spaces:
Paused
Paused
import logging
from typing import List, Dict, Any

from datasets import load_dataset, Dataset
class DatasetManagementService:
    """Merge paper metadata records into the ``train`` split of a Hub dataset.

    The dataset is handled in column form (``{column_name: [values...]}``) and
    rows are matched on the ``entry_id`` column, using only the last
    ``/``-separated segment of each id.
    """

    def __init__(self, dataset_name: str):
        """Remember the name of the dataset every operation targets."""
        self.dataset_name = dataset_name

    @staticmethod
    def _normalize_entry_id(entry_id: str) -> str:
        """Return the last ``/``-separated segment of *entry_id*."""
        return entry_id.split('/')[-1]

    @classmethod
    def _merge_records(
        cls,
        current_data: Dict[str, List[Any]],
        new_metadata: List[Dict[str, Any]],
    ) -> bool:
        """Merge *new_metadata* into the column dict *current_data* in place.

        Records whose normalized ``entry_id`` is not present are appended as
        new rows; records that are present overwrite the differing fields of
        the existing row. Columns are kept at equal length by filling missing
        values with ``None``.

        Args:
            current_data: Column-oriented data, mutated in place.
            new_metadata: Records to merge; each must have a string
                ``entry_id``.

        Returns:
            True if anything in *current_data* changed, False otherwise.

        Raises:
            KeyError: If a record has no ``entry_id`` key.
            AttributeError: If a record's ``entry_id`` is not a string.
        """
        current_data.setdefault('entry_id', [])

        # Bring every column to the same length before touching rows so that
        # positions computed below are valid for all columns.
        row_count = max(len(column) for column in current_data.values())
        for column in current_data.values():
            column.extend([None] * (row_count - len(column)))

        # Build the id -> row position index once instead of scanning the
        # list for every incoming record. ``setdefault`` keeps the first
        # occurrence, matching what ``list.index`` would return. Stored ids
        # are normalized too, so rows saved with a full URL still match.
        index_by_id: Dict[str, int] = {}
        for position, stored_id in enumerate(current_data['entry_id']):
            if isinstance(stored_id, str):
                index_by_id.setdefault(cls._normalize_entry_id(stored_id), position)

        changed = False
        for paper in new_metadata:
            # Store the same normalized id that is used for the lookup;
            # otherwise the row would never be found again on a later run.
            record = dict(paper)
            record['entry_id'] = cls._normalize_entry_id(paper['entry_id'])
            entry_id = record['entry_id']

            # Fields unseen so far become new columns, back-filled with None.
            for key in record:
                if key not in current_data:
                    current_data[key] = [None] * row_count

            position = index_by_id.get(entry_id)
            if position is None:
                # New row: every column gets a value (None where the record
                # has no such field) so the columns never become ragged.
                for key, column in current_data.items():
                    column.append(record.get(key))
                index_by_id[entry_id] = row_count
                row_count += 1
                changed = True
            else:
                for key, value in record.items():
                    column = current_data[key]
                    if column[position] != value:
                        column[position] = value
                        changed = True
        return changed

    def update_dataset(self, new_metadata: List[Dict[str, Any]]) -> str:
        """Merge *new_metadata* into the dataset and push it if it changed.

        Args:
            new_metadata: Paper records; each must contain ``entry_id``.

        Returns:
            A human-readable status message. Failures are reported through
            the returned message rather than raised.
        """
        if not new_metadata:
            # Nothing to merge, so skip loading the dataset entirely.
            return "No new data to update."
        try:
            dataset = load_dataset(self.dataset_name, split="train")
            current_data = dataset.to_dict()
            if not self._merge_records(current_data, new_metadata):
                return "No new data to update."
            updated_dataset = Dataset.from_dict(current_data)
            updated_dataset.push_to_hub(self.dataset_name, split="train")
            return f"Successfully updated dataset with {len(new_metadata)} papers"
        except Exception as e:
            # Callers receive a status string, so keep that contract, but
            # log the traceback instead of discarding it.
            logging.getLogger(__name__).exception(
                "Failed to update dataset %s", self.dataset_name
            )
            return f"Failed to update dataset: {str(e)}"

    def get_dataset_records(self) -> List[Dict[str, Any]]:
        """Load the ``train`` split and return it as a list of row dicts."""
        dataset = load_dataset(self.dataset_name, split="train")
        return dataset.to_pandas().to_dict(orient="records")
# Usage:
# dataset_service = DatasetManagementService("your_dataset_name")
# result = dataset_service.update_dataset(new_metadata)
# records = dataset_service.get_dataset_records()