import pandas as pd
import numpy as np

def preprocess_features(feature_file, n_bins=None, min_n_instruments=3, 
        test_ratio=0.05, outlier_range=1.5, conditional=True,
        use_labeled_only=True):
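    """Preprocess a feature CSV into train and test splits.

    Features are renamed to emotion dimensions (valence, arousal), filtered
    (minimum number of instruments, non-zero valence, IQR-based outlier
    removal), scaled to [-1, 1] and, if n_bins is given, quantile-binned
    into token labels such as "<V-2>" or "<A3>".
    Returns [train_split, test_split], each a list of record dicts.
    """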

    # load features and map columns to emotion dimensions
    # (note density per instrument is used as the arousal feature)
    data = pd.read_csv(feature_file)
    mapper = {"valence": "valence", "note_density_per_instrument": "arousal"}
    data = data.rename(columns=mapper)
    columns = data.columns.to_list()

    # filter out rows with fewer than min_n_instruments instruments
    data = data[data["n_instruments"] >= min_n_instruments]
    # filter out rows with zero valence
    data = data[data["valence"] != 0]

    # filter out outliers using the interquartile-range (IQR) rule
    feature_labels = list(mapper.values())
    outlier_indices = []
    for label in feature_labels:
        series = data[label]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        upper_limit = q3 + outlier_range * iqr
        lower_limit = q1 - outlier_range * iqr

        outlier_indices += series[series < lower_limit].index.to_list()
        outlier_indices += series[series > upper_limit].index.to_list()
    data.drop(outlier_indices, inplace=True)

    # shift and scale features between -1 and 1
    for label in feature_labels:
        series = data[label]
        min_ = series.min()
        max_ = series.max()
        
        data[label] = (data[label] - min_) / (max_ - min_) * 2 - 1

    if n_bins is not None:
        # digitize into bins using quantiles
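        # n_bins + 1 evenly spaced quantile levels are used as bin edges,
        # giving approximately equal-population bins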
        quantile_indices = np.linspace(0, 1, n_bins+1)
        for label in feature_labels:

            # create token labels
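            # (bin ids are symmetric around zero, e.g. n_bins=6 -> -3..-1, 1..3;
            #  n_bins=5 -> -2..2)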
            if n_bins % 2 == 0:
                bin_ids = list(range(-n_bins//2, 0)) + list(range(1, n_bins//2+1))
            else:
                bin_ids = list(range(-(n_bins-1)//2, (n_bins-1)//2 + 1))
            token_labels = ["<{}{}>".format(label[0].upper(), bin_id)
                            for bin_id in bin_ids]
            # additional None label for NaN (missing) values
            token_labels.append(None)
            
            series = data[label]
            quantiles = [series.quantile(q) for q in quantile_indices]
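            # nudge the top edge slightly upward so the maximum value falls
            # into the last bin rather than past it (np.digitize bins are
            # half-open on the right by default)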
            quantiles[-1] += 1e-6
            series = series.to_numpy()
            series_digitized = np.digitize(series, quantiles)
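            # NaNs (and any value beyond the last edge) are digitized past the
            # final bin, so they map to the trailing None label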
            series_tokenized = [token_labels[i-1] for i in series_digitized]

            data[label] = series_tokenized
    else:
        # convert NaN into None
        data = data.where(pd.notnull(data), None)

    # Create train and test splits
    matched = data[data["is_matched"]]
    unmatched = data[~data["is_matched"]]

    # reserve a portion of matched data for testing
    matched = matched.sort_values("file")
    matched = matched.reset_index(drop=True)
    n_test_samples = round(len(matched) * test_ratio)

    # the last n_test_samples rows form the test split, the rest the train split
    test_split = matched.iloc[len(matched) - n_test_samples:]

    train_split = matched.iloc[:len(matched) - n_test_samples]

    if not use_labeled_only:
        # optionally include unlabeled (unmatched) data in the training split
        train_split = pd.concat([train_split, unmatched])
        train_split = train_split.sort_values("file").reset_index(drop=True)

    splits = [train_split, test_split]

    # keep only the file name and, for conditional models, the emotion features
    columns_to_drop = [col for col in columns if col not in ["file", "valence", "arousal"]]
    if not conditional:
        columns_to_drop += ["valence", "arousal"]

    # keep only rows where all features are valid (not None), i.e. matched data
    for label in feature_labels:
        # test split has to be identical across vanilla and conditional models
        splits[1] = splits[1][~splits[1][label].isnull()]

        # filter the train split only when training exclusively on labeled data
        if use_labeled_only:
            splits[0] = splits[0][~splits[0][label].isnull()]

    for i in range(len(splits)):
        # drop unused columns and convert each split to a list of record dicts
        splits[i] = splits[i].drop(columns=columns_to_drop, errors="ignore")
        splits[i] = splits[i].to_dict("records")

    return splits
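

if __name__ == "__main__":
    # Illustrative usage sketch: "features.csv" and n_bins=7 are assumed
    # example values, not fixed by the preprocessing code above.
    train_records, test_records = preprocess_features("features.csv", n_bins=7)
    print("train samples:", len(train_records))
    print("test samples:", len(test_records))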