JustKiddo committed
Commit 021ca1e · verified · 1 Parent(s): 8136c32

Upload mt_eng_vietnamese.py

Files changed (1)
  1. mt_eng_vietnamese.py +126 -0
mt_eng_vietnamese.py ADDED
@@ -0,0 +1,126 @@
+ # coding=utf-8
+ # Copyright 2020 HuggingFace Datasets Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ import collections
+
+ import datasets
+
+
+ _DESCRIPTION = """\
+ Preprocessed dataset from the IWSLT'15 English-Vietnamese machine translation task.
+ """
+
+ _CITATION = """\
+ @inproceedings{Luong-Manning:iwslt15,
+     Address = {Da Nang, Vietnam},
+     Author = {Luong, Minh-Thang and Manning, Christopher D.},
+     Booktitle = {International Workshop on Spoken Language Translation},
+     Title = {Stanford Neural Machine Translation Systems for Spoken Language Domain},
+     Year = {2015}}
+ """
+
+ _DATA_URL = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/{}.{}"
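+ # For example, _DATA_URL.format("train", "en") resolves to:
+ #     https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/train.en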
+
+ # Tuple that describes a single pair of files with matching translations.
+ # language_to_file is the map from language (a 2-letter string, e.g. "en")
+ # to the file path in the extracted directory.
+ TranslateData = collections.namedtuple("TranslateData", ["url", "language_to_file"])
+
+
+ class MT_Eng_ViConfig(datasets.BuilderConfig):
+     """BuilderConfig for MT_Eng_Vietnamese."""
+
+     def __init__(self, language_pair=(None, None), **kwargs):
+         """BuilderConfig for MT_Eng_Vi.
+
+         Args:
+             language_pair: pair of languages that will be used for translation. Should
+                 contain 2-letter coded strings. First will be used as source and second
+                 as target in supervised mode. For example: ("vi", "en").
+             **kwargs: keyword arguments forwarded to super.
+         """
+
+         description = "Translation dataset from %s to %s" % (language_pair[0], language_pair[1])
+         super(MT_Eng_ViConfig, self).__init__(
+             description=description,
+             version=datasets.Version("1.0.0"),
+             **kwargs,
+         )
+         self.language_pair = language_pair
+
+
+ class MTEngVietnamese(datasets.GeneratorBasedBuilder):
+     """English-Vietnamese machine translation dataset from IWSLT2015."""
+
+     BUILDER_CONFIGS = [
+         MT_Eng_ViConfig(
+             name="iwslt2015-vi-en",
+             language_pair=("vi", "en"),
+         ),
+         MT_Eng_ViConfig(
+             name="iwslt2015-en-vi",
+             language_pair=("en", "vi"),
+         ),
+     ]
+     BUILDER_CONFIG_CLASS = MT_Eng_ViConfig
+
+     def _info(self):
+         source, target = self.config.language_pair
+         return datasets.DatasetInfo(
+             description=_DESCRIPTION,
+             features=datasets.Features(
+                 {"translation": datasets.features.Translation(languages=self.config.language_pair)}
+             ),
+             supervised_keys=(source, target),
+             homepage="https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/",
+             citation=_CITATION,
+         )
+
+     def _split_generators(self, dl_manager):
+         source, target = self.config.language_pair
+
+         files = {}
+         for split in ("train", "dev", "test"):
+             # IWSLT'15 uses tst2012 as the validation set and tst2013 as the test set.
+             if split == "dev":
+                 dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format("tst2012", source))
+                 dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format("tst2012", target))
+             elif split == "test":
+                 dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format("tst2013", source))
+                 dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format("tst2013", target))
+             else:
+                 dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format(split, source))
+                 dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format(split, target))
+
+             files[split] = {"source_file": dl_dir_src, "target_file": dl_dir_tar}
+
+         return [
+             datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs=files["train"]),
+             datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs=files["dev"]),
+             datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs=files["test"]),
+         ]
+
+     def _generate_examples(self, source_file, target_file):
+         """This function returns the examples in the raw (text) form."""
+         with open(source_file, encoding="utf-8") as f:
+             source_sentences = f.read().split("\n")
+         with open(target_file, encoding="utf-8") as f:
+             target_sentences = f.read().split("\n")
+
+         source, target = self.config.language_pair
+         for idx, (l1, l2) in enumerate(zip(source_sentences, target_sentences)):
+             result = {"translation": {source: l1, target: l2}}
+             # Make sure that both translations are non-empty.
+             if l1 and l2:
+                 yield idx, result
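
Once uploaded, a loading script like this is usually exercised through datasets.load_dataset. A minimal sketch, assuming the script is available locally as mt_eng_vietnamese.py (recent versions of the datasets library also require trust_remote_code=True for script-based datasets):

    import datasets

    # Pick the English -> Vietnamese configuration defined in BUILDER_CONFIGS.
    dataset = datasets.load_dataset("mt_eng_vietnamese.py", "iwslt2015-en-vi")

    # Each example is a {"translation": {"en": ..., "vi": ...}} dict.
    print(dataset["train"][0]["translation"])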