JustKiddo committed
Commit 021ca1e · verified · 1 Parent(s): 8136c32

Upload mt_eng_vietnamese.py

Files changed (1)
  1. mt_eng_vietnamese.py +126 -0
mt_eng_vietnamese.py ADDED
@@ -0,0 +1,126 @@
+ # coding=utf-8
+ # Copyright 2020 HuggingFace Datasets Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ import collections
+
+ import datasets
+
+
+ _DESCRIPTION = """\
+ Preprocessed dataset from the IWSLT'15 English-Vietnamese machine translation task.
+ """
+
+ _CITATION = """\
+ @inproceedings{Luong-Manning:iwslt15,
+     Address = {Da Nang, Vietnam},
+     Author = {Luong, Minh-Thang and Manning, Christopher D.},
+     Booktitle = {International Workshop on Spoken Language Translation},
+     Title = {Stanford Neural Machine Translation Systems for Spoken Language Domain},
+     Year = {2015}}
+ """
+
+ _DATA_URL = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/{}.{}"
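+ # For example, _DATA_URL.format("train", "en") resolves to:
+ #     https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/train.en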
+
+ # Tuple that describes a single pair of files with matching translations.
+ # language_to_file is the map from language (a 2-letter string, e.g. "en")
+ # to the file path in the extracted directory.
+ TranslateData = collections.namedtuple("TranslateData", ["url", "language_to_file"])
+
+
+ class MT_Eng_ViConfig(datasets.BuilderConfig):
+     """BuilderConfig for MT_Eng_Vietnamese."""
+
+     def __init__(self, language_pair=(None, None), **kwargs):
+         """BuilderConfig for MT_Eng_Vi.
+
+         Args:
+             language_pair: pair of languages that will be used for translation. Should
+                 contain 2-letter coded strings. First will be used as source and second
+                 as target in supervised mode. For example: ("vi", "en").
+             **kwargs: keyword arguments forwarded to super.
+         """
+
+         description = "Translation dataset from %s to %s" % (language_pair[0], language_pair[1])
+         super(MT_Eng_ViConfig, self).__init__(
+             description=description,
+             version=datasets.Version("1.0.0"),
+             **kwargs,
+         )
+         self.language_pair = language_pair
+
+
+ class MTEngVietnamese(datasets.GeneratorBasedBuilder):
+     """English-Vietnamese machine translation dataset from IWSLT2015."""
+
+     BUILDER_CONFIGS = [
+         MT_Eng_ViConfig(
+             name="iwslt2015-vi-en",
+             language_pair=("vi", "en"),
+         ),
+         MT_Eng_ViConfig(
+             name="iwslt2015-en-vi",
+             language_pair=("en", "vi"),
+         ),
+     ]
+     BUILDER_CONFIG_CLASS = MT_Eng_ViConfig
+
+     def _info(self):
+         source, target = self.config.language_pair
+         return datasets.DatasetInfo(
+             description=_DESCRIPTION,
+             features=datasets.Features(
+                 {"translation": datasets.features.Translation(languages=self.config.language_pair)}
+             ),
+             supervised_keys=(source, target),
+             homepage="https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/",
+             citation=_CITATION,
+         )
+
+     def _split_generators(self, dl_manager):
+         source, target = self.config.language_pair
+
+         files = {}
+         for split in ("train", "dev", "test"):
+             # IWSLT'15 uses tst2012 as the validation set and tst2013 as the test set.
+             if split == "dev":
+                 dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format("tst2012", source))
+                 dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format("tst2012", target))
+             elif split == "test":
+                 dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format("tst2013", source))
+                 dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format("tst2013", target))
+             else:
+                 dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format(split, source))
+                 dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format(split, target))
+
+             files[split] = {"source_file": dl_dir_src, "target_file": dl_dir_tar}
+
+         return [
+             datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs=files["train"]),
+             datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs=files["dev"]),
+             datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs=files["test"]),
+         ]
+
+     def _generate_examples(self, source_file, target_file):
+         """This function returns the examples in the raw (text) form."""
+         with open(source_file, encoding="utf-8") as f:
+             source_sentences = f.read().split("\n")
+         with open(target_file, encoding="utf-8") as f:
+             target_sentences = f.read().split("\n")
+
+         source, target = self.config.language_pair
+         for idx, (l1, l2) in enumerate(zip(source_sentences, target_sentences)):
+             result = {"translation": {source: l1, target: l2}}
+             # Make sure that both translations are non-empty.
+             if l1 and l2:
+                 yield idx, result
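
Once uploaded, a loading script like this is usually exercised through datasets.load_dataset. A minimal sketch, assuming the script is available locally as mt_eng_vietnamese.py (recent versions of the datasets library also require trust_remote_code=True for script-based datasets):

    import datasets

    # Pick the English -> Vietnamese configuration defined in BUILDER_CONFIGS.
    dataset = datasets.load_dataset("mt_eng_vietnamese.py", "iwslt2015-en-vi")

    # Each example is a {"translation": {"en": ..., "vi": ...}} dict.
    print(dataset["train"][0]["translation"])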