Spaces:
Sleeping
Sleeping
| # coding=utf-8 | |
| # Copyright 2020 HuggingFace Datasets Authors. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import collections | |
| import datasets | |
# Human-readable summary shown in the dataset card / DatasetInfo.
_DESCRIPTION = """\
Preprocessed Dataset from IWSLT'15 English-Vietnamese machine translation: English-Vietnamese.
"""

# BibTeX entry for the paper the preprocessed data comes from.
# Every field is comma-terminated so the entry parses with standard BibTeX tools.
_CITATION = """\
@inproceedings{Luong-Manning:iwslt15,
    Address = {Da Nang, Vietnam},
    Author = {Luong, Minh-Thang and Manning, Christopher D.},
    Booktitle = {International Workshop on Spoken Language Translation},
    Title = {Stanford Neural Machine Translation Systems for Spoken Language Domain},
    Year = {2015}}
"""

# URL template for one raw text file: the first placeholder is the file stem
# (e.g. "train", "tst2012"), the second is the 2-letter language code.
_DATA_URL = "https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/{}.{}"

# Tuple that describes a single pair of files with matching translations.
# language_to_file is the map from language (2 letter string: example 'en')
# to the file path in the extracted directory.
TranslateData = collections.namedtuple("TranslateData", ["url", "language_to_file"])
class MT_Eng_ViConfig(datasets.BuilderConfig):
    """Builder configuration describing one translation direction of the dataset."""

    def __init__(self, language_pair=(None, None), **kwargs):
        """Create a configuration for a single (source, target) language pair.

        Args:
            language_pair: pair of languages that will be used for translation.
                Should contain 2-letter coded strings. The first is used as
                source and the second as target in supervised mode.
                For example: ("vi", "en").
            **kwargs: keyword arguments forwarded to super.
        """
        source_lang, target_lang = language_pair[0], language_pair[1]
        super().__init__(
            description="Translation dataset from %s to %s" % (source_lang, target_lang),
            version=datasets.Version("1.0.0"),
            **kwargs,
        )
        # Kept on the config so the builder can read the direction back.
        self.language_pair = language_pair
class MTEngVietnamese(datasets.GeneratorBasedBuilder):
    """English Vietnamese machine translation dataset from IWSLT2015."""

    # One configuration per translation direction.
    BUILDER_CONFIGS = [
        MT_Eng_ViConfig(
            name="iwslt2015-vi-en",
            language_pair=("vi", "en"),
        ),
        MT_Eng_ViConfig(
            name="iwslt2015-en-vi",
            language_pair=("en", "vi"),
        ),
    ]
    BUILDER_CONFIG_CLASS = MT_Eng_ViConfig

    def _info(self):
        """Return the dataset metadata for the configured language pair."""
        source, target = self.config.language_pair
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {"translation": datasets.features.Translation(languages=self.config.language_pair)}
            ),
            supervised_keys=(source, target),
            homepage="https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the source/target files of every split and describe the splits.

        Args:
            dl_manager: download manager used to fetch the remote text files.

        Returns:
            A list with the train, validation and test `SplitGenerator`s, each
            carrying the `source_file` / `target_file` paths for
            `_generate_examples`.
        """
        source, target = self.config.language_pair

        # Remote file stem of each split: tst2012 is the validation set and
        # tst2013 the test set. Each split is resolved explicitly so that no
        # split can silently reuse the paths downloaded for another one.
        split_to_stem = {"train": "train", "dev": "tst2012", "test": "tst2013"}

        files = {}
        for split, stem in split_to_stem.items():
            files[split] = {
                "source_file": dl_manager.download_and_extract(_DATA_URL.format(stem, source)),
                "target_file": dl_manager.download_and_extract(_DATA_URL.format(stem, target)),
            }

        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs=files["train"]),
            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs=files["dev"]),
            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs=files["test"]),
        ]

    def _generate_examples(self, source_file, target_file):
        """Yield the examples in the raw (text) form.

        Args:
            source_file: path of the UTF-8 text file with one source sentence per line.
            target_file: path of the UTF-8 text file with the aligned target sentences.

        Yields:
            `(idx, example)` tuples where `idx` is the line position of the pair
            and `example` is `{"translation": {source: ..., target: ...}}`.
        """
        source, target = self.config.language_pair
        with open(source_file, encoding="utf-8") as src_f, open(target_file, encoding="utf-8") as tgt_f:
            # Iterating the files splits on "\n" only (unlike str.splitlines)
            # and does not produce a phantom empty pair for the final newline.
            for idx, (src_line, tgt_line) in enumerate(zip(src_f, tgt_f)):
                src_text = src_line.rstrip("\n")
                tgt_text = tgt_line.rstrip("\n")
                # Make sure that both translations are non-empty.
                if not src_text or not tgt_text:
                    continue
                yield idx, {"translation": {source: src_text, target: tgt_text}}