Upload librispeech_asr.py
Browse files- librispeech_asr.py +132 -0
    	
        librispeech_asr.py
    ADDED
    
    | @@ -0,0 +1,132 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
             | 
| 2 | 
            +
            """Librispeech automatic speech recognition dataset."""
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            import os
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            import datasets
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            _CITATION = """\
         | 
| 9 | 
            +
            @inproceedings{panayotov2015librispeech,
         | 
| 10 | 
            +
              title={Librispeech: an ASR corpus based on public domain audio books},
         | 
| 11 | 
            +
              author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
         | 
| 12 | 
            +
              booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},
         | 
| 13 | 
            +
              pages={5206--5210},
         | 
| 14 | 
            +
              year={2015},
         | 
| 15 | 
            +
              organization={IEEE}
         | 
| 16 | 
            +
            }
         | 
| 17 | 
            +
            """
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            _DESCRIPTION = """\
         | 
| 20 | 
            +
            LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,
         | 
| 21 | 
            +
            prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read
         | 
| 22 | 
            +
            audiobooks from the LibriVox project, and has been carefully segmented and aligned.87
         | 
| 23 | 
            +
            """
         | 
| 24 | 
            +
             | 
| 25 | 
            +
            _URL = "http://www.openslr.org/12"
         | 
| 26 | 
            +
            _DL_URL = "http://www.openslr.org/resources/12/"
         | 
| 27 | 
            +
             | 
| 28 | 
            +
            _DL_URLS = {"test": _DL_URL + "test-clean.tar.gz",
         | 
| 29 | 
            +
                        "train.100": _DL_URL + "train-clean-100.tar.gz",
         | 
| 30 | 
            +
                    }
         | 
| 31 | 
            +
             | 
class LibrispeechASRConfig(datasets.BuilderConfig):
    """BuilderConfig for LibriSpeechASR.

    Pins the dataset version to 2.1.0 for every config of this builder.
    """

    def __init__(self, **kwargs):
        """Initialize the config.

        Args:
          **kwargs: keyword arguments forwarded to ``datasets.BuilderConfig``
            (e.g. ``name``, ``description``, ``data_dir``). The original
            docstring also advertised ``citation`` and ``url``, but those are
            not ``BuilderConfig`` parameters and would raise ``TypeError`` if
            passed, so they are no longer documented as accepted.
        """
        # Modern zero-argument super(); version is fixed here so callers
        # cannot accidentally override it per-config.
        super().__init__(version=datasets.Version("2.1.0", ""), **kwargs)
| 45 | 
            +
             | 
class LibrispeechASR(datasets.GeneratorBasedBuilder):
    """LibriSpeech automatic speech recognition dataset builder."""

    # Flush examples to disk in batches of 256 to bound memory while
    # buffering raw FLAC bytes.
    DEFAULT_WRITER_BATCH_SIZE = 256
    # BUG FIX: the `datasets` library discovers configs through the plural
    # BUILDER_CONFIGS list; the original singular `BUILDER_CONFIG` attribute
    # was silently ignored. DEFAULT_CONFIG_NAME also named a non-existent
    # config ("all"); it must match a defined config name.
    DEFAULT_CONFIG_NAME = "clean"
    BUILDER_CONFIGS = [LibrispeechASRConfig(name="clean", description="'Clean' speech.")]

    def _info(self):
        """Return dataset metadata: features, supervised keys, homepage, citation."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    # LibriSpeech audio is 16 kHz throughout.
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "text": datasets.Value("string"),
                    "speaker_id": datasets.Value("int64"),
                    "chapter_id": datasets.Value("int64"),
                    "id": datasets.Value("string"),
                }
            ),
            supervised_keys=("file", "text"),
            homepage=_URL,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the split archives and declare the train/test splits.

        In streaming mode the archives are not extracted; examples are read
        straight out of the tarballs via ``iter_archive``.
        """
        archive_path = dl_manager.download(_DL_URLS)
        # In non-streaming mode, extract locally so "file" can point at a
        # real on-disk path; in streaming mode leave an empty dict so the
        # .get(...) lookups below yield None.
        local_extracted_archive = dl_manager.extract(archive_path) if not dl_manager.is_streaming else {}

        return [
            datasets.SplitGenerator(
                name="train.100",
                gen_kwargs={
                    "local_extracted_archive": local_extracted_archive.get("train.100"),
                    "files": dl_manager.iter_archive(archive_path["train.100"]),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "local_extracted_archive": local_extracted_archive.get("test"),
                    "files": dl_manager.iter_archive(archive_path["test"]),
                },
            ),
        ]

    def _generate_examples(self, files, local_extracted_archive):
        """Yield (key, example) pairs from a LibriSpeech tar archive.

        Relies on archive member order: within each chapter directory the
        ``*.flac`` files precede the ``*.trans.txt`` transcript, so once the
        buffered audio count matches the buffered transcript count, a whole
        chapter is complete and can be emitted.

        Args:
          files: iterator of (path_inside_archive, file_object) pairs from
            ``dl_manager.iter_archive``.
          local_extracted_archive: extraction root in non-streaming mode,
            or None when streaming.
        """
        key = 0
        audio_data = {}   # utterance id -> raw FLAC bytes
        transcripts = []  # buffered example dicts for the current chapter
        for path, f in files:
            if path.endswith(".flac"):
                id_ = path.split("/")[-1][: -len(".flac")]
                audio_data[id_] = f.read()
            elif path.endswith(".trans.txt"):
                for line in f:
                    if line:
                        line = line.decode("utf-8").strip()
                        # Each transcript line is "<utterance-id> <text>".
                        id_, transcript = line.split(" ", 1)
                        audio_file = f"{id_}.flac"
                        # Utterance ids look like "speaker-chapter-utterance".
                        speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]]
                        # NOTE(review): joining only the bare file name onto the
                        # extraction root assumes the flac sits at the archive
                        # top level; LibriSpeech tars nest files under
                        # LibriSpeech/<split>/<speaker>/<chapter>/ — confirm
                        # this path resolves in non-streaming mode.
                        audio_file = (
                            os.path.join(local_extracted_archive, audio_file)
                            if local_extracted_archive
                            else audio_file
                        )
                        transcripts.append(
                            {
                                "id": id_,
                                "speaker_id": speaker_id,
                                "chapter_id": chapter_id,
                                "file": audio_file,
                                "text": transcript,
                            }
                        )
            # A chapter is complete when every buffered transcript has its
            # audio bytes; emit and reset the buffers.
            if audio_data and len(audio_data) == len(transcripts):
                for transcript in transcripts:
                    audio = {"path": transcript["file"], "bytes": audio_data[transcript["id"]]}
                    yield key, {"audio": audio, **transcript}
                    key += 1
                audio_data = {}
                transcripts = []