from __future__ import absolute_import, print_function, division, unicode_literals

# test_helper sets up sys.path so the torchmoji package is importable
# when the tests are run from the tests directory.
import test_helper

import json

from torchmoji.sentence_tokenizer import SentenceTokenizer
# Ten one-character dummy sentences, each paired with a label dict.
sentences = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

dicts = [
    {'label': 0},
    {'label': 1},
    {'label': 2},
    {'label': 3},
    {'label': 4},
    {'label': 5},
    {'label': 6},
    {'label': 7},
    {'label': 8},
    {'label': 9},
]

# Explicit index lists used by test_dataset_split_explicit.
train_ind = [0, 5, 3, 6, 8]
val_ind = [9, 2, 1]
test_ind = [4, 7]
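
# The vocabulary shipped with the repo; the relative path assumes the tests
# are executed from the tests directory.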
with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)

def test_dataset_split_parameter():
    """ Dataset is split in the desired ratios
    """
    split_parameter = [0.7, 0.1, 0.2]
    st = SentenceTokenizer(vocab, 30)

    result, result_dicts, _ = st.split_train_val_test(sentences, dicts,
                                                      split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    # With 10 sentences and a [0.7, 0.1, 0.2] split, the expected sizes are 7/1/2.
    assert len(train) == len(sentences) * split_parameter[0]
    assert len(val) == len(sentences) * split_parameter[1]
    assert len(test) == len(sentences) * split_parameter[2]

    assert len(train_dicts) == len(dicts) * split_parameter[0]
    assert len(val_dicts) == len(dicts) * split_parameter[1]
    assert len(test_dicts) == len(dicts) * split_parameter[2]

def test_dataset_split_explicit():
    """ Dataset is split according to given indices
    """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(sentences, dicts,
                                                          split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    # Every sentence (and its dict) must land in the subset its index was assigned to.
    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)

def test_id_to_sentence():
    """Tokenizing and converting back preserves the input.
    """
    vb = {'CUSTOM_MASK': 0,
          'aasdf': 1000,
          'basdf': 2000}

    sentence = 'aasdf basdf basdf basdf'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == sentence

def test_id_to_sentence_with_unknown():
    """Tokenizing and converting back preserves the input, except for unknowns.
    """
    vb = {'CUSTOM_MASK': 0,
          'CUSTOM_UNKNOWN': 1,
          'aasdf': 1000,
          'basdf': 2000}

    sentence = 'aasdf basdf ccc'
    expected = 'aasdf basdf CUSTOM_UNKNOWN'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == expected
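

# A minimal sketch for running this file directly; assumes pytest is installed
# (the upstream test suite may use its own runner instead).
if __name__ == '__main__':
    import pytest
    pytest.main([__file__])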