JustKiddo committed · verified
Commit 3cec6df · 1 Parent(s): 8565096

Delete mt_eng_vietnamese.py

Files changed (1)
  1. mt_eng_vietnamese.py +0 -126
mt_eng_vietnamese.py DELETED
@@ -1,126 +0,0 @@
- # coding=utf-8
- # Copyright 2020 HuggingFace Datasets Authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
-
- import collections
-
- import datasets
-
-
- _DESCRIPTION = """\
- Preprocessed dataset from the IWSLT'15 English-Vietnamese machine translation task.
- """
-
- _CITATION = """\
- @inproceedings{Luong-Manning:iwslt15,
-     Address = {Da Nang, Vietnam},
-     Author = {Luong, Minh-Thang and Manning, Christopher D.},
-     Booktitle = {International Workshop on Spoken Language Translation},
-     Title = {Stanford Neural Machine Translation Systems for Spoken Language Domain},
-     Year = {2015}}
- """
-
- _DATA_URL = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/{}.{}"
-
- # Tuple that describes a single pair of files with matching translations.
- # language_to_file is the map from language (2-letter string, e.g. 'en')
- # to the file path in the extracted directory.
- TranslateData = collections.namedtuple("TranslateData", ["url", "language_to_file"])
-
-
- class MT_Eng_ViConfig(datasets.BuilderConfig):
-     """BuilderConfig for MT_Eng_Vietnamese."""
-
-     def __init__(self, language_pair=(None, None), **kwargs):
-         """BuilderConfig for MT_Eng_Vi.
-
-         Args:
-             language_pair: pair of languages that will be used for translation. Should
-                 contain 2-letter coded strings. First will be used as source and second
-                 as target in supervised mode. For example: ("vi", "en").
-             **kwargs: keyword arguments forwarded to super.
-         """
-         description = "Translation dataset from %s to %s" % (language_pair[0], language_pair[1])
-         super(MT_Eng_ViConfig, self).__init__(
-             description=description,
-             version=datasets.Version("1.0.0"),
-             **kwargs,
-         )
-         self.language_pair = language_pair
-
-
- class MTEngVietnamese(datasets.GeneratorBasedBuilder):
-     """English-Vietnamese machine translation dataset from IWSLT 2015."""
-
-     BUILDER_CONFIGS = [
-         MT_Eng_ViConfig(
-             name="iwslt2015-vi-en",
-             language_pair=("vi", "en"),
-         ),
-         MT_Eng_ViConfig(
-             name="iwslt2015-en-vi",
-             language_pair=("en", "vi"),
-         ),
-     ]
-     BUILDER_CONFIG_CLASS = MT_Eng_ViConfig
-
-     def _info(self):
-         source, target = self.config.language_pair
-         return datasets.DatasetInfo(
-             description=_DESCRIPTION,
-             features=datasets.Features(
-                 {"translation": datasets.features.Translation(languages=self.config.language_pair)}
-             ),
-             supervised_keys=(source, target),
-             homepage="https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/",
-             citation=_CITATION,
-         )
-
-     def _split_generators(self, dl_manager):
-         source, target = self.config.language_pair
-
-         files = {}
-         for split in ("train", "dev", "test"):
-             if split == "dev":
-                 dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format("tst2012", source))
-                 dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format("tst2012", target))
-             if split == "test":
-                 dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format("tst2013", source))
-                 dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format("tst2013", target))
-             if split == "train":
-                 dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format(split, source))
-                 dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format(split, target))
-
-             files[split] = {"source_file": dl_dir_src, "target_file": dl_dir_tar}
-
-         return [
-             datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs=files["train"]),
-             datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs=files["dev"]),
-             datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs=files["test"]),
-         ]
-
-     def _generate_examples(self, source_file, target_file):
-         """This function returns the examples in the raw (text) form."""
-         with open(source_file, encoding="utf-8") as f:
-             source_sentences = f.read().split("\n")
-         with open(target_file, encoding="utf-8") as f:
-             target_sentences = f.read().split("\n")
-
-         source, target = self.config.language_pair
-         for idx, (l1, l2) in enumerate(zip(source_sentences, target_sentences)):
-             result = {"translation": {source: l1, target: l2}}
-             # Make sure that both translations are non-empty.
-             if l1 and l2:
-                 yield idx, result
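For reference, this is how the deleted script would have been consumed before removal. A minimal usage sketch, assuming an older `datasets` release that still supports script-based loading (the local path is hypothetical; the config name comes from BUILDER_CONFIGS above):

from datasets import load_dataset

# "iwslt2015-en-vi" is one of the two configs defined in the script above;
# the path assumes the script file sits in the current working directory.
dataset = load_dataset("./mt_eng_vietnamese.py", "iwslt2015-en-vi")

# Each example is a {"translation": {"en": ..., "vi": ...}} dict,
# with train/validation/test splits built from train/tst2012/tst2013.
print(dataset["train"][0]["translation"])

Recent `datasets` releases dropped support for loading datasets from Python scripts, which is a common reason for deletions like this one.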