Update librispeech_asr.py
Browse files- librispeech_asr.py +148 -132
librispeech_asr.py
CHANGED
@@ -1,132 +1,148 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
""
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
"""
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import datasets
|
3 |
+
import glob
|
4 |
+
|
5 |
+
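# NOTE: the stock `load_dataset("librispeech_asr", ...)` call is kept below, commented out, for reference only.
# This script fetches the OpenSLR archives directly instead (presumably because the hosted loader was unreliable; confirm).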
|
6 |
+
|
7 |
+
# import aiohttp
|
8 |
+
# dataset = load_dataset(
|
9 |
+
# "librispeech_asr",
|
10 |
+
# "clean",
|
11 |
+
# split=["train_clean_100", "test_clean"],
|
12 |
+
# cache_dir=CACHE_DIR,
|
13 |
+
# storage_options={'client_kwargs': {'timeout': aiohttp.ClientTimeout(total=3600, connect=60)}}
|
14 |
+
# ).rename_column("text", "transcription").cast_column("audio", Audio(sampling_rate=sample_rate))
|
15 |
+
|
16 |
+
# Landing page of the LibriSpeech corpus on OpenSLR (used as the dataset homepage).
_URL = "http://www.openslr.org/12"
# Base location of the downloadable LibriSpeech archives.
_DL_URL = "http://www.openslr.org/resources/12/"

# Archive URL for every known split, grouped by configuration name.
_DL_URLS = {
    "clean": {
        split_name: f"{_DL_URL}{archive_name}"
        for split_name, archive_name in (
            ("train_clean_100", "train-clean-100.tar.gz"),
            ("test_clean", "test-clean.tar.gz"),
            ("dev_clean", "dev-clean.tar.gz"),
        )
    },
}
|
27 |
+
|
28 |
+
|
29 |
+
class LibrispeechASRConfig(datasets.BuilderConfig):
    """Builder configuration for LibriSpeech ASR, pinned to version 2.1.0."""

    def __init__(self, **kwargs):
        """Create the config.

        Args:
            **kwargs: forwarded unchanged to ``datasets.BuilderConfig``
                (for example ``name`` and ``description``).
        """
        script_version = datasets.Version("2.1.0", "")
        super().__init__(version=script_version, **kwargs)
|
34 |
+
|
35 |
+
|
36 |
+
class LibrispeechASR(datasets.GeneratorBasedBuilder):
    """Dataset builder for the "clean" LibriSpeech subset.

    Downloads and extracts the ``train_clean_100`` and ``test_clean`` archives
    listed in ``_DL_URLS`` and yields one example per ``.flac`` file that has a
    matching line in a ``*.trans.txt`` transcription file.
    """

    DEFAULT_WRITER_BATCH_SIZE = 256
    DEFAULT_CONFIG_NAME = "clean"
    BUILDER_CONFIGS = [
        LibrispeechASRConfig(name="clean", description="'Clean' speech."),
    ]

    def _info(self):
        """Return the dataset metadata: feature schema, supervised keys, homepage."""
        return datasets.DatasetInfo(
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.Audio(sampling_rate=16_000),
                    "text": datasets.Value("string"),
                    "speaker_id": datasets.Value("int64"),
                    "chapter_id": datasets.Value("int64"),
                    "id": datasets.Value("string"),
                }
            ),
            supervised_keys=("file", "text"),
            homepage=_URL,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archives and describe the resulting splits.

        Args:
            dl_manager: download manager supplied by ``datasets``; its
                ``download`` and ``extract`` methods are called with a dict
                keyed by split name.

        Returns:
            A list of ``datasets.SplitGenerator``, one for ``train_clean_100``
            and one for ``test_clean``, in that order.

        Raises:
            ValueError: if the active configuration is not ``"clean"``.
        """
        if self.config.name != "clean":
            raise ValueError(f"Configuration '{self.config.name}' not supported in this script version.")

        split_names = ("train_clean_100", "test_clean")
        urls_to_download = {name: _DL_URLS["clean"][name] for name in split_names}

        archive_path = dl_manager.download(urls_to_download)
        local_extracted_archive = dl_manager.extract(archive_path)

        return [
            datasets.SplitGenerator(
                name=name,
                gen_kwargs={
                    "path_to_extracted_archive": local_extracted_archive[name],
                },
            )
            for name in split_names
        ]

    def _generate_examples(self, path_to_extracted_archive):
        """Yield ``(key, example)`` pairs for one extracted archive.

        All ``*.trans.txt`` files under the archive are read first to build an
        utterance-id -> transcript map; then every ``*.flac`` file is matched
        against that map by its file name (without extension).

        Args:
            path_to_extracted_archive: directory containing the extracted
                archive; it is searched recursively.

        Yields:
            ``(key, example)`` where ``key`` is a running integer starting at 0
            and ``example`` has the fields declared in ``_info``. Audio files
            without a transcript, or whose id does not start with two
            integer components separated by ``-``, are skipped with a warning.
        """
        logger = datasets.logging.get_logger(__name__)

        # glob returns paths in arbitrary, filesystem-dependent order; sorting
        # keeps example order and keys reproducible across runs and machines.
        transcription_pattern = os.path.join(path_to_extracted_archive, "**", "*.trans.txt")
        transcriptions_by_id = {}
        for trans_file_path in sorted(glob.glob(transcription_pattern, recursive=True)):
            with open(trans_file_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    # Each line is "<utterance id> <transcript>"; split on the first space only.
                    utt_id, separator, transcript = line.partition(" ")
                    if not separator:
                        logger.warning("Skipping malformed line in %s: %s", trans_file_path, line)
                        continue
                    transcriptions_by_id[utt_id] = transcript

        audio_pattern = os.path.join(path_to_extracted_archive, "**", "*.flac")
        key = 0
        for audio_file_path in sorted(glob.glob(audio_pattern, recursive=True)):
            utt_id_from_file = os.path.splitext(os.path.basename(audio_file_path))[0]

            transcript = transcriptions_by_id.get(utt_id_from_file)
            if transcript is None:
                logger.warning("No transcription found for audio file %s. Skipping.", audio_file_path)
                continue

            # The first two "-"-separated components of the id are read as speaker and chapter.
            id_parts = utt_id_from_file.split("-")
            if len(id_parts) < 2:
                logger.warning("Skipping utterance %s due to unexpected ID format.", utt_id_from_file)
                continue
            try:
                speaker_id = int(id_parts[0])
                chapter_id = int(id_parts[1])
            except ValueError:
                logger.warning(
                    "Skipping utterance %s due to non-integer speaker/chapter ID.", utt_id_from_file
                )
                continue

            yield key, {
                "file": audio_file_path,
                "audio": audio_file_path,
                "text": transcript,
                "speaker_id": speaker_id,
                "chapter_id": chapter_id,
                "id": utt_id_from_file,
            }
            key += 1
|