datasetsANDmodels committed
Commit 5fa4594 · verified · 1 Parent(s): 1c0c89a

Upload 4 files

Files changed (4):
  1. pretrained_vi.onnx +3 -0
  2. pretrained_vi.onnx.json +492 -0
  3. requirements.txt +4 -0
  4. tts.py +226 -0
pretrained_vi.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c21eafdc108a392331d8237da0e5d52ecd44cd6003de963b8187cbfdc7e82d0
+ size 63122309
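
This is a Git LFS pointer, not the model itself; the 63 MB ONNX file is fetched separately. A minimal sketch (not part of the commit) for verifying a downloaded copy, relying on the fact that the LFS oid is the SHA-256 of the file contents and size is the byte count:

import hashlib
import os

EXPECTED_OID = "8c21eafdc108a392331d8237da0e5d52ecd44cd6003de963b8187cbfdc7e82d0"
EXPECTED_SIZE = 63122309

sha256 = hashlib.sha256()
with open("pretrained_vi.onnx", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha256.update(chunk)

assert os.path.getsize("pretrained_vi.onnx") == EXPECTED_SIZE
assert sha256.hexdigest() == EXPECTED_OID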
pretrained_vi.onnx.json ADDED
@@ -0,0 +1,492 @@
+ {
+   "audio": {
+     "sample_rate": 22050
+   },
+   "espeak": {
+     "voice": "vi"
+   },
+   "inference": {
+     "noise_scale": 0.667,
+     "length_scale": 1,
+     "noise_w": 0.8
+   },
+   "phoneme_type": "espeak",
+   "phoneme_map": {},
+   "phoneme_id_map": {
+     " ": [
+       3
+     ],
+     "!": [
+       4
+     ],
+     "\"": [
+       150
+     ],
+     "#": [
+       149
+     ],
+     "$": [
+       2
+     ],
+     "'": [
+       5
+     ],
+     "(": [
+       6
+     ],
+     ")": [
+       7
+     ],
+     ",": [
+       8
+     ],
+     "-": [
+       9
+     ],
+     ".": [
+       10
+     ],
+     "0": [
+       130
+     ],
+     "1": [
+       131
+     ],
+     "2": [
+       132
+     ],
+     "3": [
+       133
+     ],
+     "4": [
+       134
+     ],
+     "5": [
+       135
+     ],
+     "6": [
+       136
+     ],
+     "7": [
+       137
+     ],
+     "8": [
+       138
+     ],
+     "9": [
+       139
+     ],
+     ":": [
+       11
+     ],
+     ";": [
+       12
+     ],
+     "?": [
+       13
+     ],
+     "X": [
+       156
+     ],
+     "^": [
+       1
+     ],
+     "_": [
+       0
+     ],
+     "a": [
+       14
+     ],
+     "b": [
+       15
+     ],
+     "c": [
+       16
+     ],
+     "d": [
+       17
+     ],
+     "e": [
+       18
+     ],
+     "f": [
+       19
+     ],
+     "g": [
+       154
+     ],
+     "h": [
+       20
+     ],
+     "i": [
+       21
+     ],
+     "j": [
+       22
+     ],
+     "k": [
+       23
+     ],
+     "l": [
+       24
+     ],
+     "m": [
+       25
+     ],
+     "n": [
+       26
+     ],
+     "o": [
+       27
+     ],
+     "p": [
+       28
+     ],
+     "q": [
+       29
+     ],
+     "r": [
+       30
+     ],
+     "s": [
+       31
+     ],
+     "t": [
+       32
+     ],
+     "u": [
+       33
+     ],
+     "v": [
+       34
+     ],
+     "w": [
+       35
+     ],
+     "x": [
+       36
+     ],
+     "y": [
+       37
+     ],
+     "z": [
+       38
+     ],
+     "æ": [
+       39
+     ],
+     "ç": [
+       40
+     ],
+     "ð": [
+       41
+     ],
+     "ø": [
+       42
+     ],
+     "ħ": [
+       43
+     ],
+     "ŋ": [
+       44
+     ],
+     "œ": [
+       45
+     ],
+     "ǀ": [
+       46
+     ],
+     "ǁ": [
+       47
+     ],
+     "ǂ": [
+       48
+     ],
+     "ǃ": [
+       49
+     ],
+     "ɐ": [
+       50
+     ],
+     "ɑ": [
+       51
+     ],
+     "ɒ": [
+       52
+     ],
+     "ɓ": [
+       53
+     ],
+     "ɔ": [
+       54
+     ],
+     "ɕ": [
+       55
+     ],
+     "ɖ": [
+       56
+     ],
+     "ɗ": [
+       57
+     ],
+     "ɘ": [
+       58
+     ],
+     "ə": [
+       59
+     ],
+     "ɚ": [
+       60
+     ],
+     "ɛ": [
+       61
+     ],
+     "ɜ": [
+       62
+     ],
+     "ɞ": [
+       63
+     ],
+     "ɟ": [
+       64
+     ],
+     "ɠ": [
+       65
+     ],
+     "ɡ": [
+       66
+     ],
+     "ɢ": [
+       67
+     ],
+     "ɣ": [
+       68
+     ],
+     "ɤ": [
+       69
+     ],
+     "ɥ": [
+       70
+     ],
+     "ɦ": [
+       71
+     ],
+     "ɧ": [
+       72
+     ],
+     "ɨ": [
+       73
+     ],
+     "ɪ": [
+       74
+     ],
+     "ɫ": [
+       75
+     ],
+     "ɬ": [
+       76
+     ],
+     "ɭ": [
+       77
+     ],
+     "ɮ": [
+       78
+     ],
+     "ɯ": [
+       79
+     ],
+     "ɰ": [
+       80
+     ],
+     "ɱ": [
+       81
+     ],
+     "ɲ": [
+       82
+     ],
+     "ɳ": [
+       83
+     ],
+     "ɴ": [
+       84
+     ],
+     "ɵ": [
+       85
+     ],
+     "ɶ": [
+       86
+     ],
+     "ɸ": [
+       87
+     ],
+     "ɹ": [
+       88
+     ],
+     "ɺ": [
+       89
+     ],
+     "ɻ": [
+       90
+     ],
+     "ɽ": [
+       91
+     ],
+     "ɾ": [
+       92
+     ],
+     "ʀ": [
+       93
+     ],
+     "ʁ": [
+       94
+     ],
+     "ʂ": [
+       95
+     ],
+     "ʃ": [
+       96
+     ],
+     "ʄ": [
+       97
+     ],
+     "ʈ": [
+       98
+     ],
+     "ʉ": [
+       99
+     ],
+     "ʊ": [
+       100
+     ],
+     "ʋ": [
+       101
+     ],
+     "ʌ": [
+       102
+     ],
+     "ʍ": [
+       103
+     ],
+     "ʎ": [
+       104
+     ],
+     "ʏ": [
+       105
+     ],
+     "ʐ": [
+       106
+     ],
+     "ʑ": [
+       107
+     ],
+     "ʒ": [
+       108
+     ],
+     "ʔ": [
+       109
+     ],
+     "ʕ": [
+       110
+     ],
+     "ʘ": [
+       111
+     ],
+     "ʙ": [
+       112
+     ],
+     "ʛ": [
+       113
+     ],
+     "ʜ": [
+       114
+     ],
+     "ʝ": [
+       115
+     ],
+     "ʟ": [
+       116
+     ],
+     "ʡ": [
+       117
+     ],
+     "ʢ": [
+       118
+     ],
+     "ʦ": [
+       155
+     ],
+     "ʰ": [
+       145
+     ],
+     "ʲ": [
+       119
+     ],
+     "ˈ": [
+       120
+     ],
+     "ˌ": [
+       121
+     ],
+     "ː": [
+       122
+     ],
+     "ˑ": [
+       123
+     ],
+     "˞": [
+       124
+     ],
+     "ˤ": [
+       146
+     ],
+     "̃": [
+       141
+     ],
+     "̧": [
+       140
+     ],
+     "̩": [
+       144
+     ],
+     "̪": [
+       142
+     ],
+     "̯": [
+       143
+     ],
+     "̺": [
+       152
+     ],
+     "̻": [
+       153
+     ],
+     "β": [
+       125
+     ],
+     "ε": [
+       147
+     ],
+     "θ": [
+       126
+     ],
+     "χ": [
+       127
+     ],
+     "ᵻ": [
+       128
+     ],
+     "↑": [
+       151
+     ],
+     "↓": [
+       148
+     ],
+     "ⱱ": [
+       129
+     ]
+   },
+   "num_symbols": 256,
+   "num_speakers": 1,
+   "speaker_id_map": {},
+   "piper_version": "1.0.0"
+ }
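
This sidecar config is what tts.py loads at runtime: phoneme_id_map provides the symbol ids (including the pad "_" = 0, BOS "^" = 1, and EOS "$" = 2 markers), and the inference block provides the defaults for the model's three-element scales input. A short sketch of reading it directly, assuming the file sits in the working directory:

import json

with open("pretrained_vi.onnx.json", encoding="utf-8") as f:
    cfg = json.load(f)

# Special symbols used to frame every phoneme id sequence
id_map = cfg["phoneme_id_map"]
print(id_map["^"], id_map["_"], id_map["$"])  # [1] [0] [2]

# Defaults for the model's "scales" input: [noise_scale, length_scale, noise_w]
inf = cfg["inference"]
print([inf["noise_scale"], inf["length_scale"], inf["noise_w"]])  # [0.667, 1, 0.8]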
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ tornado==6.3.3
+ onnxruntime==1.15.1
+ piper-phonemize==1.1.0
+ jsonschema==4.19.1
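
Note that tts.py (the next file) also imports numpy and soundfile, which these four pins do not cover. A small sketch, under the assumption of a standard CPython environment, to confirm installed packages match the pins:

from importlib.metadata import version

pins = {
    "tornado": "6.3.3",
    "onnxruntime": "1.15.1",
    "piper-phonemize": "1.1.0",
    "jsonschema": "4.19.1",
}
for pkg, want in pins.items():
    got = version(pkg)
    assert got == want, f"{pkg}: pinned {want}, installed {got}"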
tts.py ADDED
@@ -0,0 +1,226 @@
+ # !pip install onnxruntime==1.15.1
+ # !pip install piper-phonemize==1.1.0
+ # !pip install numpy soundfile  # also imported below; not pinned in requirements.txt
+ import re
+ import json
+ import math
+ import time
+ from enum import Enum
+ from typing import List
+
+ import numpy as np
+ import onnxruntime
+ import soundfile as sf
+
+ from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run
+
+ model_name = "pretrained_vi.onnx"
+ # length_scale values: larger stretches phoneme durations (slower speech)
+ SPEED_VALUES = {
+     "very_slow": 1.5,
+     "slow": 1.2,
+     "normal": 1,
+     "fast": 0.6,
+     "very_fast": 0.4,
+ }
+ SAMPLE_RATE = 22050
+ NOISE_SCALE_W = 0.8
+ NOISE_SCALE = 0.667
+ PAD = "_"  # padding (0)
+ BOS = "^"  # beginning of sentence
+ EOS = "$"  # end of sentence
+
+
+ def text_to_speech(text: str, speed: str, output_path):
+     speed = speed.strip()
+     length_scale = float(SPEED_VALUES[speed])
+     sess_options = onnxruntime.SessionOptions()
+     model = onnxruntime.InferenceSession(model_name, sess_options=sess_options)
+     config = load_config(model_name)
+     text = text.strip()
+     phonemes_list = phonemize(config, text)
+     phoneme_ids = []
+     for phonemes in phonemes_list:
+         phoneme_ids.append(phonemes_to_ids(config, phonemes))
+
+     speaker_id = None
+     # Concatenate per-sentence id sequences, separated by a few pad ids
+     phoneme_ids_flatten = []
+     for i in phoneme_ids:
+         phoneme_ids_flatten += i + [0, 0, 0]
+     text = np.expand_dims(np.array(phoneme_ids_flatten, dtype=np.int64), 0)
+     text_lengths = np.array([text.shape[1]], dtype=np.int64)
+     scales = np.array(
+         [NOISE_SCALE, length_scale, NOISE_SCALE_W],
+         dtype=np.float32,
+     )
+
+     inputs = {
+         "input": text,
+         "input_lengths": text_lengths,
+         "scales": scales,
+     }
+     # Only feed a speaker id for multi-speaker models
+     if speaker_id is not None:
+         inputs["sid"] = np.array([speaker_id], dtype=np.int64)
+
+     start_time = time.perf_counter()
+     audio = model.run(None, inputs)[0].squeeze((0, 1))
+     audio = audio_float_to_int16(audio.squeeze())
+     end_time = time.perf_counter()
+
+     audio_duration_sec = audio.shape[-1] / SAMPLE_RATE
+     infer_sec = end_time - start_time
+     sf.write(str(output_path), audio, SAMPLE_RATE)
+     return output_path
+
+
+ def audio_float_to_int16(
+     audio: np.ndarray, max_wav_value: float = 32767.0
+ ) -> np.ndarray:
+     """Normalize audio and convert to int16 range."""
+     audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
+     audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
+     audio_norm = audio_norm.astype("int16")
+     return audio_norm
+
+
+ class PhonemeType(str, Enum):
+     ESPEAK = "espeak"
+     TEXT = "text"
+
+
+ def phonemize(config, text: str) -> List[List[str]]:
+     """Text to phonemes grouped by sentence."""
+     if config["phoneme_type"] == PhonemeType.ESPEAK:
+         if config["espeak"]["voice"] == "ar":
+             # Arabic diacritization
+             # https://github.com/mush42/libtashkeel/
+             text = tashkeel_run(text)
+         return phonemize_espeak(text, config["espeak"]["voice"])
+     if config["phoneme_type"] == PhonemeType.TEXT:
+         return phonemize_codepoints(text)
+     raise ValueError(f'Unexpected phoneme type: {config["phoneme_type"]}')
+
+
+ def phonemes_to_ids(config, phonemes: List[str]) -> List[int]:
+     """Phonemes to ids, padded between symbols and wrapped in BOS/EOS."""
+     id_map = config["phoneme_id_map"]
+     ids: List[int] = list(id_map[BOS])
+     for phoneme in phonemes:
+         if phoneme not in id_map:
+             print(f"Missing phoneme from id map: {phoneme}")
+             continue
+         ids.extend(id_map[phoneme])
+         ids.extend(id_map[PAD])
+     ids.extend(id_map[EOS])
+     return ids
+
+
+ def load_config(model):
+     # The config keys include IPA symbols, so force UTF-8
+     with open(f"{model}.json", "r", encoding="utf-8") as file:
+         config = json.load(file)
+     return config
+
+
+ def denoise(
+     audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float
+ ) -> np.ndarray:
+     """Subtract a scaled bias magnitude spectrogram from the audio's STFT."""
+     audio_spec, audio_angles = transform(audio)
+
+     a = bias_spec.shape[-1]
+     b = audio_spec.shape[-1]
+     repeats = max(1, math.ceil(b / a))
+     bias_spec_repeat = np.repeat(bias_spec, repeats, axis=-1)[..., :b]
+
+     audio_spec_denoised = audio_spec - (bias_spec_repeat * denoiser_strength)
+     audio_spec_denoised = np.clip(audio_spec_denoised, a_min=0.0, a_max=None)
+     audio_denoised = inverse(audio_spec_denoised, audio_angles)
+
+     return audio_denoised
+
+
+ def stft(x, fft_size, hopsamp):
+     """Compute and return the STFT of the supplied time domain signal x.
+     Args:
+         x (1-dim Numpy array): A time domain signal.
+         fft_size (int): FFT size. Should be a power of 2, otherwise DFT will be used.
+         hopsamp (int): The hop size, in samples.
+     Returns:
+         The STFT. The rows are the time slices and columns are the frequency bins.
+     """
+     window = np.hanning(fft_size)
+     fft_size = int(fft_size)
+     hopsamp = int(hopsamp)
+     return np.array(
+         [
+             np.fft.rfft(window * x[i : i + fft_size])
+             for i in range(0, len(x) - fft_size, hopsamp)
+         ]
+     )
+
+
+ def istft(X, fft_size, hopsamp):
+     """Invert a STFT into a time domain signal.
+     Args:
+         X (2-dim Numpy array): Input spectrogram. The rows are the time slices and columns are the frequency bins.
+         fft_size (int): FFT size, in samples.
+         hopsamp (int): The hop size, in samples.
+     Returns:
+         The inverse STFT.
+     """
+     fft_size = int(fft_size)
+     hopsamp = int(hopsamp)
+     window = np.hanning(fft_size)
+     time_slices = X.shape[0]
+     len_samples = int(time_slices * hopsamp + fft_size)
+     x = np.zeros(len_samples)
+     # Overlap-add each windowed inverse FFT frame
+     for n, i in enumerate(range(0, len(x) - fft_size, hopsamp)):
+         x[i : i + fft_size] += window * np.real(np.fft.irfft(X[n]))
+     return x
+
+
+ def inverse(magnitude, phase):
+     """Rebuild complex spectrograms from magnitude/phase and invert them."""
+     recombine_magnitude_phase = np.concatenate(
+         [magnitude * np.cos(phase), magnitude * np.sin(phase)], axis=1
+     )
+
+     x_org = recombine_magnitude_phase
+     n_b, n_f, n_t = x_org.shape
+     x = np.empty([n_b, n_f // 2, n_t], dtype=np.complex64)
+     x.real = x_org[:, : n_f // 2]
+     x.imag = x_org[:, n_f // 2 :]
+     inverse_transform = []
+     for y in x:
+         y_ = istft(y.T, fft_size=1024, hopsamp=256)
+         inverse_transform.append(y_[None, :])
+
+     inverse_transform = np.concatenate(inverse_transform, 0)
+
+     return inverse_transform
+
+
+ def transform(input_data):
+     """STFT each signal in a batch; return magnitude and phase arrays."""
+     x = input_data
+     real_part = []
+     imag_part = []
+     for y in x:
+         y_ = stft(y, fft_size=1024, hopsamp=256).T
+         real_part.append(y_.real[None, :, :])
+         imag_part.append(y_.imag[None, :, :])
+     real_part = np.concatenate(real_part, 0)
+     imag_part = np.concatenate(imag_part, 0)
+
+     magnitude = np.sqrt(real_part**2 + imag_part**2)
+     phase = np.arctan2(imag_part, real_part)
+
+     return magnitude, phase
+
+
+ def split_audio(text, speed):
+     """Split text into sentences and synthesize one WAV file per sentence."""
+     sentences = [sentence.strip() for sentence in re.split(r'(?<=[.!?…;:—])\s+', text) if sentence]
+
+     for i, sentence in enumerate(sentences, 1):
+         output_file = f"output_speech_{i}.wav"
+         text_to_speech(sentence, speed, output_file)
+         print(f"Generated: {output_file}")
+
+
+ # "All people are born with equal rights. The Creator grants them rights no one can violate."
+ text = "Tất cả mọi người đều sinh ra có quyền bình đẳng. Tạo hóa cho họ những quyền không ai có thể xâm phạm được."
+ speed = "normal"
+ split_audio(text, speed)
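
tts.py defines denoise(), transform(), inverse(), stft(), and istft() but never calls them. A hedged sketch of one way to wire them up for simple spectral subtraction on a generated clip; the file name, the 10-frame noise estimate, and the 0.05 strength are illustrative assumptions, not values from the commit:

import numpy as np
import soundfile as sf

audio, sr = sf.read("output_speech_1.wav", dtype="float32")
spec, _ = transform(audio[None, :])   # magnitude spectrogram, (1, freq_bins, frames)
bias_spec = spec[:, :, :10]           # assume the first ~10 frames are non-speech noise
cleaned = denoise(audio[None, :], bias_spec, denoiser_strength=0.05)
sf.write("output_speech_1_denoised.wav", cleaned.squeeze().astype(np.float32), sr)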