File size: 771 Bytes
1bf52db
1482fe1
fdcca24
c5a3a4d
504cec1
5fade0e
d379f83
93aba5e
3424db2
0130907
24a726e
9d3af5f
5e833ac
17499bc
06d3c94
9531255
9116abe
c990a96
e7b5de5
629049e
7be31bf
bf1265d
59fd5bc
c3adb5e
00177a6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import yaml
from datasets import load_dataset
import pandas as pd
import os
import pprint


def make_dataset(dataset="cnn_dailymail", split="train", version="3.0.0"):
    """Load one split of a summarisation dataset and save it as a CSV file.

    The ``article`` and ``highlights`` columns of the loaded split are copied
    into a pandas DataFrame, which is written to ``data/raw/<split>.csv``.
    The ``data/raw`` directory is created first if it does not exist yet.

    Args:
        dataset: Dataset identifier passed to ``load_dataset``.
        split: Name of the split to load; also used as the output file stem.
        version: Value passed as the second positional argument of
            ``load_dataset`` (defaults to ``'3.0.0'``).
    """
    # Bind the loaded split to its own name instead of overwriting the
    # ``dataset`` parameter, so the identifier stays available for debugging.
    loaded_split = load_dataset(dataset, version, split=split)

    # exist_ok=True makes this safe to call once per split and avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs('data/raw', exist_ok=True)

    df = pd.DataFrame()
    df['article'] = loaded_split['article']
    df['highlights'] = loaded_split['highlights']
    df.to_csv('data/raw/{}.csv'.format(split))


if __name__ == '__main__':
    # Read the run configuration; its 'data' entry is handed to
    # make_dataset as the dataset argument.
    with open("params.yml") as config_file:
        params = yaml.safe_load(config_file)
    pprint.pprint(params)

    # Export every split, one CSV per split, in the same order as before.
    for split_name in ('train', 'test', 'validation'):
        make_dataset(dataset=params['data'], split=split_name)