Sin2pi committed on
Commit
2f0606b
·
verified ·
1 Parent(s): 6296a84

Update librispeech_asr.py

Browse files
Files changed (1) hide show
  1. librispeech_asr.py +148 -132
librispeech_asr.py CHANGED
@@ -1,132 +1,148 @@
1
-
2
- """Librispeech automatic speech recognition dataset."""
3
-
4
- import os
5
-
6
- import datasets
7
-
8
- _CITATION = """\
9
- @inproceedings{panayotov2015librispeech,
10
- title={Librispeech: an ASR corpus based on public domain audio books},
11
- author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
12
- booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},
13
- pages={5206--5210},
14
- year={2015},
15
- organization={IEEE}
16
- }
17
- """
18
-
19
- _DESCRIPTION = """\
20
- LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,
21
- prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read
22
- audiobooks from the LibriVox project, and has been carefully segmented and aligned.
23
- """
24
-
25
- _URL = "http://www.openslr.org/12"
26
- _DL_URL = "http://www.openslr.org/resources/12/"
27
-
28
- _DL_URLS = {"test": _DL_URL + "test-clean.tar.gz",
29
- "train.100": _DL_URL + "train-clean-100.tar.gz",
30
- }
31
-
32
- class LibrispeechASRConfig(datasets.BuilderConfig):
33
- """BuilderConfig for LibriSpeechASR."""
34
-
35
- def __init__(self, **kwargs):
36
- """
37
- Args:
38
- data_dir: `string`, the path to the folder containing the files in the
39
- downloaded .tar
40
- citation: `string`, citation for the data set
41
- url: `string`, url for information about the data set
42
- **kwargs: keyword arguments forwarded to super.
43
- """
44
- super(LibrispeechASRConfig, self).__init__(version=datasets.Version("2.1.0", ""), **kwargs)
45
-
46
- class LibrispeechASR(datasets.GeneratorBasedBuilder):
47
- """Librispeech dataset."""
48
-
49
- DEFAULT_WRITER_BATCH_SIZE = 256
50
- DEFAULT_CONFIG_NAME = "all"
51
- BUILDER_CONFIG = LibrispeechASRConfig(name="clean", description="'Clean' speech.")
52
-
53
- def _info(self):
54
- return datasets.DatasetInfo(
55
- description=_DESCRIPTION,
56
- features=datasets.Features(
57
- {
58
- "file": datasets.Value("string"),
59
- "audio": datasets.Audio(sampling_rate=16_000),
60
- "text": datasets.Value("string"),
61
- "speaker_id": datasets.Value("int64"),
62
- "chapter_id": datasets.Value("int64"),
63
- "id": datasets.Value("string"),
64
- }
65
- ),
66
- supervised_keys=("file", "text"),
67
- homepage=_URL,
68
- citation=_CITATION,
69
- )
70
-
71
- def _split_generators(self, dl_manager):
72
- archive_path = dl_manager.download(_DL_URLS)
73
- # (Optional) In non-streaming mode, we can extract the archive locally to have actual local audio files:
74
- local_extracted_archive = dl_manager.extract(archive_path) if not dl_manager.is_streaming else {}
75
-
76
- train_split = [
77
- datasets.SplitGenerator(
78
- name="train.100",
79
- gen_kwargs={
80
- "local_extracted_archive": local_extracted_archive.get("train.100"),
81
- "files": dl_manager.iter_archive(archive_path["train.100"]),
82
- },
83
- ),
84
- ]
85
- test_split = [
86
- datasets.SplitGenerator(
87
- name=datasets.Split.TEST,
88
- gen_kwargs={
89
- "local_extracted_archive": local_extracted_archive.get("test"),
90
- "files": dl_manager.iter_archive(archive_path["test"]),
91
- },
92
- )
93
- ]
94
- return train_split + test_split
95
-
96
- def _generate_examples(self, files, local_extracted_archive):
97
- """Generate examples from a LibriSpeech archive_path."""
98
- key = 0
99
- audio_data = {}
100
- transcripts = []
101
- for path, f in files:
102
- if path.endswith(".flac"):
103
- id_ = path.split("/")[-1][: -len(".flac")]
104
- audio_data[id_] = f.read()
105
- elif path.endswith(".trans.txt"):
106
- for line in f:
107
- if line:
108
- line = line.decode("utf-8").strip()
109
- id_, transcript = line.split(" ", 1)
110
- audio_file = f"{id_}.flac"
111
- speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]]
112
- audio_file = (
113
- os.path.join(local_extracted_archive, audio_file)
114
- if local_extracted_archive
115
- else audio_file
116
- )
117
- transcripts.append(
118
- {
119
- "id": id_,
120
- "speaker_id": speaker_id,
121
- "chapter_id": chapter_id,
122
- "file": audio_file,
123
- "text": transcript,
124
- }
125
- )
126
- if audio_data and len(audio_data) == len(transcripts):
127
- for transcript in transcripts:
128
- audio = {"path": transcript["file"], "bytes": audio_data[transcript["id"]]}
129
- yield key, {"audio": audio, **transcript}
130
- key += 1
131
- audio_data = {}
132
- transcripts = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import datasets
3
+ import glob
4
+
5
+ # NOTE: standalone loader script; works around unreliable Hugging Face Hub downloads
6
+
7
+ # import aiohttp
8
+ # dataset = load_dataset(
9
+ # "librispeech_asr",
10
+ # "clean",
11
+ # split=["train_clean_100", "test_clean"],
12
+ # cache_dir=CACHE_DIR,
13
+ # storage_options={'client_kwargs': {'timeout': aiohttp.ClientTimeout(total=3600, connect=60)}}
14
+ # ).rename_column("text", "transcription").cast_column("audio", Audio(sampling_rate=sample_rate))
15
+
16
+ _URL = "http://www.openslr.org/12"
17
+ _DL_URL = "http://www.openslr.org/resources/12/"
18
+
19
+
20
+ _DL_URLS = {
21
+ "clean": {
22
+ "train_clean_100": _DL_URL + "train-clean-100.tar.gz",
23
+ "test_clean": _DL_URL + "test-clean.tar.gz",
24
+ "dev_clean": _DL_URL + "dev-clean.tar.gz",
25
+ },
26
+ }
27
+
28
+
29
+ class LibrispeechASRConfig(datasets.BuilderConfig):
30
+ """BuilderConfig for LibriSpeechASR."""
31
+
32
+ def __init__(self, **kwargs):
33
+ super(LibrispeechASRConfig, self).__init__(version=datasets.Version("2.1.0", ""), **kwargs)
34
+
35
+
36
+ class LibrispeechASR(datasets.GeneratorBasedBuilder):
37
+ """Librispeech dataset."""
38
+
39
+ DEFAULT_WRITER_BATCH_SIZE = 256
40
+ DEFAULT_CONFIG_NAME = "clean"
41
+ BUILDER_CONFIGS = [
42
+ LibrispeechASRConfig(name="clean", description="'Clean' speech."),
43
+ ]
44
+
45
+ def _info(self):
46
+ return datasets.DatasetInfo(
47
+ features=datasets.Features(
48
+ {
49
+ "file": datasets.Value("string"),
50
+ "audio": datasets.Audio(sampling_rate=16_000),
51
+ "text": datasets.Value("string"),
52
+ "speaker_id": datasets.Value("int64"),
53
+ "chapter_id": datasets.Value("int64"),
54
+ "id": datasets.Value("string"),
55
+ }
56
+ ),
57
+ supervised_keys=("file", "text"),
58
+ homepage=_URL,
59
+ )
60
+
61
+ def _split_generators(self, dl_manager):
62
+ if self.config.name == "clean":
63
+ urls_to_download = {
64
+ "train_clean_100": _DL_URLS["clean"]["train_clean_100"],
65
+ "test_clean": _DL_URLS["clean"]["test_clean"],
66
+ }
67
+ else:
68
+ raise ValueError(f"Configuration '{self.config.name}' not supported in this script version.")
69
+
70
+ archive_path = dl_manager.download(urls_to_download)
71
+ local_extracted_archive = dl_manager.extract(archive_path)
72
+
73
+ splits = []
74
+
75
+ splits.append(
76
+ datasets.SplitGenerator(
77
+ name="train_clean_100",
78
+ gen_kwargs={
79
+ "path_to_extracted_archive": local_extracted_archive["train_clean_100"],
80
+ },
81
+ )
82
+ )
83
+
84
+ splits.append(
85
+ datasets.SplitGenerator(
86
+ name="test_clean",
87
+ gen_kwargs={
88
+ "path_to_extracted_archive": local_extracted_archive["test_clean"],
89
+ },
90
+ )
91
+ )
92
+
93
+ return splits
94
+
95
+ def _generate_examples(self, path_to_extracted_archive):
96
+
97
+ key = 0
98
+
99
+ transcription_files = glob.glob(os.path.join(path_to_extracted_archive, "**", "*.trans.txt"), recursive=True)
100
+
101
+ transcriptions_by_id = {}
102
+ for trans_file_path in transcription_files:
103
+ if "README.TXT" in os.path.basename(trans_file_path).upper():
104
+ continue
105
+
106
+ with open(trans_file_path, "r", encoding="utf-8") as f:
107
+ for line in f:
108
+ line = line.strip()
109
+ if line:
110
+ parts = line.split(" ", 1)
111
+ if len(parts) == 2:
112
+ utt_id, transcript = parts
113
+ transcriptions_by_id[utt_id] = transcript
114
+ else:
115
+ print(f"Warning: Skipping malformed line in {trans_file_path}: {line}")
116
+
117
+ audio_files = glob.glob(os.path.join(path_to_extracted_archive, "**", "*.flac"), recursive=True)
118
+
119
+ for audio_file_path in audio_files:
120
+ file_basename = os.path.basename(audio_file_path)
121
+ utt_id_from_file = file_basename[: -len(".flac")]
122
+
123
+ if utt_id_from_file in transcriptions_by_id:
124
+ transcript = transcriptions_by_id[utt_id_from_file]
125
+
126
+ parts = utt_id_from_file.split("-")
127
+ if len(parts) >= 2:
128
+ try:
129
+ speaker_id = int(parts[0])
130
+ chapter_id = int(parts[1])
131
+ except ValueError:
132
+ print(f"Warning: Skipping utterance {utt_id_from_file} due to non-integer speaker/chapter ID.")
133
+ continue
134
+ else:
135
+ print(f"Warning: Skipping utterance {utt_id_from_file} due to unexpected ID format.")
136
+ continue
137
+
138
+ yield key, {
139
+ "file": audio_file_path,
140
+ "audio": audio_file_path,
141
+ "text": transcript,
142
+ "speaker_id": speaker_id,
143
+ "chapter_id": chapter_id,
144
+ "id": utt_id_from_file,
145
+ }
146
+ key += 1
147
+ else:
148
+ print(f"Warning: No transcription found for audio file {audio_file_path}. Skipping.")