Kaguya-19's picture
init
974817f
raw
history blame contribute delete
628 Bytes
# Download the arXiv metadata snapshot from Kaggle before running this script:
#   curl -L -o ~/Downloads/arxiv.zip \
#     https://www.kaggle.com/api/v1/datasets/download/Cornell-University/arxiv
import jsonlines
# Convert the arXiv metadata snapshot (read as JSON Lines) into a JSONL file
# whose records carry only two fields:
#   'bibkey' - the arXiv id prefixed with "arxivid"
#   'text'   - title, abstract and authors joined into one string
input_path = './data/arxiv-metadata-oai-snapshot.json'
output_path = './data/arxiv.jsonl'

# Stream record-by-record: each converted item is written as soon as it is
# read instead of being accumulated in a list first, so memory use does not
# grow with the size of the input file.
# NOTE: the output file is opened before reading starts, so a failure partway
# through the input leaves a partially written output file behind.
with jsonlines.open(input_path, 'r') as reader, \
        jsonlines.open(output_path, 'w') as writer:
    for item in reader:
        writer.write({
            'bibkey': f"arxivid{item['id']}",
            'text': f"Title: {item['title']}\nAbstract: {item['abstract']}\nAuthors: {item['authors']}",
        })