Spaces:

JustKiddo
/

KiddosSpace

Sleeping

App Files Files Community

JustKiddo committed on Aug 22, 2024

Commit

3cec6df

verified ·

1 Parent(s): 8565096

Delete mt_eng_vietnamese.py

Browse files

Files changed (1) hide show

mt_eng_vietnamese.py +0 -126

mt_eng_vietnamese.py DELETED Viewed

@@ -1,126 +0,0 @@
-# coding=utf-8
-# Copyright 2020 HuggingFace Datasets Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-import datasets
-_DESCRIPTION = """\
-Preprocessed Dataset from IWSLT'15 English-Vietnamese machine translation: English-Vietnamese.
-"""
-_CITATION = """\
-@inproceedings{Luong-Manning:iwslt15,
-        Address = {Da Nang, Vietnam}
-        Author = {Luong, Minh-Thang  and Manning, Christopher D.},
-        Booktitle = {International Workshop on Spoken Language Translation},
-        Title = {Stanford Neural Machine Translation Systems for Spoken Language Domain},
-        Year = {2015}}
-"""
-_DATA_URL = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/{}.{}"
-# Tuple that describes a single pair of files with matching translations.
-# language_to_file is the map from language (2 letter string: example 'en')
-# to the file path in the extracted directory.
-TranslateData = collections.namedtuple("TranslateData", ["url", "language_to_file"])
-class MT_Eng_ViConfig(datasets.BuilderConfig):
-    """BuilderConfig for MT_Eng_Vietnamese."""
-    def __init__(self, language_pair=(None, None), **kwargs):
-        """BuilderConfig for MT_Eng_Vi.
-        Args:
-            for the `datasets.features.text.TextEncoder` used for the features feature.
-          language_pair: pair of languages that will be used for translation. Should
-            contain 2-letter coded strings. First will be used at source and second
-            as target in supervised mode. For example: ("vi", "en").
-          **kwargs: keyword arguments forwarded to super.
-        """
-        description = ("Translation dataset from %s to %s") % (language_pair[0], language_pair[1])
-        super(MT_Eng_ViConfig, self).__init__(
-            description=description,
-            version=datasets.Version("1.0.0"),
-            **kwargs,
-        )
-        self.language_pair = language_pair
-class MTEngVietnamese(datasets.GeneratorBasedBuilder):
-    """English Vietnamese machine translation dataset from IWSLT2015."""
-    BUILDER_CONFIGS = [
-        MT_Eng_ViConfig(
-            name="iwslt2015-vi-en",
-            language_pair=("vi", "en"),
-        ),
-        MT_Eng_ViConfig(
-            name="iwslt2015-en-vi",
-            language_pair=("en", "vi"),
-        ),
-    ]
-    BUILDER_CONFIG_CLASS = MT_Eng_ViConfig
-    def _info(self):
-        source, target = self.config.language_pair
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=datasets.Features(
-                {"translation": datasets.features.Translation(languages=self.config.language_pair)}
-            ),
-            supervised_keys=(source, target),
-            homepage="https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/",
-            citation=_CITATION,
-        )
-    def _split_generators(self, dl_manager):
-        source, target = self.config.language_pair
-        files = {}
-        for split in ("train", "dev", "test"):
-            if split == "dev":
-                dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format("tst2012", source))
-                dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format("tst2012", target))
-            if split == "dev":
-                dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format("tst2013", source))
-                dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format("tst2013", target))
-            if split == "train":
-                dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format(split, source))
-                dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format(split, target))
-            files[split] = {"source_file": dl_dir_src, "target_file": dl_dir_tar}
-        return [
-            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs=files["train"]),
-            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs=files["dev"]),
-            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs=files["test"]),
-        ]
-    def _generate_examples(self, source_file, target_file):
-        """This function returns the examples in the raw (text) form."""
-        with open(source_file, encoding="utf-8") as f:
-            source_sentences = f.read().split("\n")
-        with open(target_file, encoding="utf-8") as f:
-            target_sentences = f.read().split("\n")
-        source, target = self.config.language_pair
-        for idx, (l1, l2) in enumerate(zip(source_sentences, target_sentences)):
-            result = {"translation": {source: l1, target: l2}}
-            # Make sure that both translations are non-empty.
-            yield idx, result