"""Convert the arXiv metadata snapshot into a compact JSON Lines file.

Each input record is reduced to two fields:

* ``bibkey`` -- the record's ``id`` prefixed with ``arxivid``.
* ``text``   -- the title, abstract and authors joined into one string.

The dataset can be downloaded with:

    curl -L -o ~/Downloads/arxiv.zip \
        https://www.kaggle.com/api/v1/datasets/download/Cornell-University/arxiv

NOTE(review): the input file below is presumably extracted from that archive
into ``./data`` -- confirm the expected layout.
"""
# curl -L -o ~/Downloads/arxiv.zip\
# https://www.kaggle.com/api/v1/datasets/download/Cornell-University/arxiv

import jsonlines

input_path = './data/arxiv-metadata-oai-snapshot.json'
output_path = './data/arxiv.jsonl'

# Stream records straight from the reader to the writer instead of collecting
# them in a list first: memory use stays constant regardless of input size.
# The reader is opened first, so a missing input file still fails before the
# output file is created or truncated. If a record fails to parse midway, the
# output will contain the records converted up to that point.
with jsonlines.open(input_path, 'r') as reader, \
        jsonlines.open(output_path, 'w') as writer:
    for item in reader:
        writer.write({
            'bibkey': f"arxivid{item['id']}",
            'text': (
                f"Title: {item['title']}\n"
                f"Abstract: {item['abstract']}\n"
                f"Authors: {item['authors']}"
            ),
        })