datasetsANDmodels committed
Commit 5fa4594 · verified · 1 Parent(s): 1c0c89a

Upload 4 files

Files changed (4):
  1. pretrained_vi.onnx +3 -0
  2. pretrained_vi.onnx.json +492 -0
  3. requirements.txt +4 -0
  4. tts.py +226 -0
pretrained_vi.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c21eafdc108a392331d8237da0e5d52ecd44cd6003de963b8187cbfdc7e82d0
+ size 63122309
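
This is a Git LFS pointer, not the model itself; the 63 MB ONNX file is fetched separately. A minimal sketch (not part of the commit) for verifying a downloaded copy, relying on the fact that the LFS oid is the SHA-256 of the file contents and size is the byte count:

import hashlib
import os

EXPECTED_OID = "8c21eafdc108a392331d8237da0e5d52ecd44cd6003de963b8187cbfdc7e82d0"
EXPECTED_SIZE = 63122309

sha256 = hashlib.sha256()
with open("pretrained_vi.onnx", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha256.update(chunk)

assert os.path.getsize("pretrained_vi.onnx") == EXPECTED_SIZE
assert sha256.hexdigest() == EXPECTED_OID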
pretrained_vi.onnx.json ADDED
@@ -0,0 +1,492 @@
+ {
+   "audio": {
+     "sample_rate": 22050
+   },
+   "espeak": {
+     "voice": "vi"
+   },
+   "inference": {
+     "noise_scale": 0.667,
+     "length_scale": 1,
+     "noise_w": 0.8
+   },
+   "phoneme_type": "espeak",
+   "phoneme_map": {},
+   "phoneme_id_map": {
+     " ": [
+       3
+     ],
+     "!": [
+       4
+     ],
+     "\"": [
+       150
+     ],
+     "#": [
+       149
+     ],
+     "$": [
+       2
+     ],
+     "'": [
+       5
+     ],
+     "(": [
+       6
+     ],
+     ")": [
+       7
+     ],
+     ",": [
+       8
+     ],
+     "-": [
+       9
+     ],
+     ".": [
+       10
+     ],
+     "0": [
+       130
+     ],
+     "1": [
+       131
+     ],
+     "2": [
+       132
+     ],
+     "3": [
+       133
+     ],
+     "4": [
+       134
+     ],
+     "5": [
+       135
+     ],
+     "6": [
+       136
+     ],
+     "7": [
+       137
+     ],
+     "8": [
+       138
+     ],
+     "9": [
+       139
+     ],
+     ":": [
+       11
+     ],
+     ";": [
+       12
+     ],
+     "?": [
+       13
+     ],
+     "X": [
+       156
+     ],
+     "^": [
+       1
+     ],
+     "_": [
+       0
+     ],
+     "a": [
+       14
+     ],
+     "b": [
+       15
+     ],
+     "c": [
+       16
+     ],
+     "d": [
+       17
+     ],
+     "e": [
+       18
+     ],
+     "f": [
+       19
+     ],
+     "g": [
+       154
+     ],
+     "h": [
+       20
+     ],
+     "i": [
+       21
+     ],
+     "j": [
+       22
+     ],
+     "k": [
+       23
+     ],
+     "l": [
+       24
+     ],
+     "m": [
+       25
+     ],
+     "n": [
+       26
+     ],
+     "o": [
+       27
+     ],
+     "p": [
+       28
+     ],
+     "q": [
+       29
+     ],
+     "r": [
+       30
+     ],
+     "s": [
+       31
+     ],
+     "t": [
+       32
+     ],
+     "u": [
+       33
+     ],
+     "v": [
+       34
+     ],
+     "w": [
+       35
+     ],
+     "x": [
+       36
+     ],
+     "y": [
+       37
+     ],
+     "z": [
+       38
+     ],
+     "æ": [
+       39
+     ],
+     "ç": [
+       40
+     ],
+     "ð": [
+       41
+     ],
+     "ø": [
+       42
+     ],
+     "ħ": [
+       43
+     ],
+     "ŋ": [
+       44
+     ],
+     "œ": [
+       45
+     ],
+     "ǀ": [
+       46
+     ],
+     "ǁ": [
+       47
+     ],
+     "ǂ": [
+       48
+     ],
+     "ǃ": [
+       49
+     ],
+     "ɐ": [
+       50
+     ],
+     "ɑ": [
+       51
+     ],
+     "ɒ": [
+       52
+     ],
+     "ɓ": [
+       53
+     ],
+     "ɔ": [
+       54
+     ],
+     "ɕ": [
+       55
+     ],
+     "ɖ": [
+       56
+     ],
+     "ɗ": [
+       57
+     ],
+     "ɘ": [
+       58
+     ],
+     "ə": [
+       59
+     ],
+     "ɚ": [
+       60
+     ],
+     "ɛ": [
+       61
+     ],
+     "ɜ": [
+       62
+     ],
+     "ɞ": [
+       63
+     ],
+     "ɟ": [
+       64
+     ],
+     "ɠ": [
+       65
+     ],
+     "ɡ": [
+       66
+     ],
+     "ɢ": [
+       67
+     ],
+     "ɣ": [
+       68
+     ],
+     "ɤ": [
+       69
+     ],
+     "ɥ": [
+       70
+     ],
+     "ɦ": [
+       71
+     ],
+     "ɧ": [
+       72
+     ],
+     "ɨ": [
+       73
+     ],
+     "ɪ": [
+       74
+     ],
+     "ɫ": [
+       75
+     ],
+     "ɬ": [
+       76
+     ],
+     "ɭ": [
+       77
+     ],
+     "ɮ": [
+       78
+     ],
+     "ɯ": [
+       79
+     ],
+     "ɰ": [
+       80
+     ],
+     "ɱ": [
+       81
+     ],
+     "ɲ": [
+       82
+     ],
+     "ɳ": [
+       83
+     ],
+     "ɴ": [
+       84
+     ],
+     "ɵ": [
+       85
+     ],
+     "ɶ": [
+       86
+     ],
+     "ɸ": [
+       87
+     ],
+     "ɹ": [
+       88
+     ],
+     "ɺ": [
+       89
+     ],
+     "ɻ": [
+       90
+     ],
+     "ɽ": [
+       91
+     ],
+     "ɾ": [
+       92
+     ],
+     "ʀ": [
+       93
+     ],
+     "ʁ": [
+       94
+     ],
+     "ʂ": [
+       95
+     ],
+     "ʃ": [
+       96
+     ],
+     "ʄ": [
+       97
+     ],
+     "ʈ": [
+       98
+     ],
+     "ʉ": [
+       99
+     ],
+     "ʊ": [
+       100
+     ],
+     "ʋ": [
+       101
+     ],
+     "ʌ": [
+       102
+     ],
+     "ʍ": [
+       103
+     ],
+     "ʎ": [
+       104
+     ],
+     "ʏ": [
+       105
+     ],
+     "ʐ": [
+       106
+     ],
+     "ʑ": [
+       107
+     ],
+     "ʒ": [
+       108
+     ],
+     "ʔ": [
+       109
+     ],
+     "ʕ": [
+       110
+     ],
+     "ʘ": [
+       111
+     ],
+     "ʙ": [
+       112
+     ],
+     "ʛ": [
+       113
+     ],
+     "ʜ": [
+       114
+     ],
+     "ʝ": [
+       115
+     ],
+     "ʟ": [
+       116
+     ],
+     "ʡ": [
+       117
+     ],
+     "ʢ": [
+       118
+     ],
+     "ʦ": [
+       155
+     ],
+     "ʰ": [
+       145
+     ],
+     "ʲ": [
+       119
+     ],
+     "ˈ": [
+       120
+     ],
+     "ˌ": [
+       121
+     ],
+     "ː": [
+       122
+     ],
+     "ˑ": [
+       123
+     ],
+     "˞": [
+       124
+     ],
+     "ˤ": [
+       146
+     ],
+     "̃": [
+       141
+     ],
+     "̧": [
+       140
+     ],
+     "̩": [
+       144
+     ],
+     "̪": [
+       142
+     ],
+     "̯": [
+       143
+     ],
+     "̺": [
+       152
+     ],
+     "̻": [
+       153
+     ],
+     "β": [
+       125
+     ],
+     "ε": [
+       147
+     ],
+     "θ": [
+       126
+     ],
+     "χ": [
+       127
+     ],
+     "ᵻ": [
+       128
+     ],
+     "↑": [
+       151
+     ],
+     "↓": [
+       148
+     ],
+     "ⱱ": [
+       129
+     ]
+   },
+   "num_symbols": 256,
+   "num_speakers": 1,
+   "speaker_id_map": {},
+   "piper_version": "1.0.0"
+ }
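
This sidecar config is what tts.py loads at runtime: phoneme_id_map provides the symbol ids (including the pad "_" = 0, BOS "^" = 1, and EOS "$" = 2 markers), and the inference block provides the defaults for the model's three-element scales input. A short sketch of reading it directly, assuming the file sits in the working directory:

import json

with open("pretrained_vi.onnx.json", encoding="utf-8") as f:
    cfg = json.load(f)

# Special symbols used to frame every phoneme id sequence
id_map = cfg["phoneme_id_map"]
print(id_map["^"], id_map["_"], id_map["$"])  # [1] [0] [2]

# Defaults for the model's "scales" input: [noise_scale, length_scale, noise_w]
inf = cfg["inference"]
print([inf["noise_scale"], inf["length_scale"], inf["noise_w"]])  # [0.667, 1, 0.8]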
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ tornado==6.3.3
+ onnxruntime==1.15.1
+ piper-phonemize==1.1.0
+ jsonschema==4.19.1
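
Note that tts.py (the next file) also imports numpy and soundfile, which these four pins do not cover. A small sketch, under the assumption of a standard CPython environment, to confirm installed packages match the pins:

from importlib.metadata import version

pins = {
    "tornado": "6.3.3",
    "onnxruntime": "1.15.1",
    "piper-phonemize": "1.1.0",
    "jsonschema": "4.19.1",
}
for pkg, want in pins.items():
    got = version(pkg)
    assert got == want, f"{pkg}: pinned {want}, installed {got}"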
tts.py ADDED
@@ -0,0 +1,226 @@
+ # !pip install onnxruntime==1.15.1
+ # !pip install piper-phonemize==1.1.0
+ # !pip install numpy soundfile  # also imported below; not pinned in requirements.txt
+ import re
+ import json
+ import math
+ import time
+ from enum import Enum
+ from typing import List
+
+ import numpy as np
+ import onnxruntime
+ import soundfile as sf
+
+ from piper_phonemize import phonemize_codepoints, phonemize_espeak, tashkeel_run
+
+ model_name = "pretrained_vi.onnx"
+ # length_scale values: larger stretches phoneme durations (slower speech)
+ SPEED_VALUES = {
+     "very_slow": 1.5,
+     "slow": 1.2,
+     "normal": 1,
+     "fast": 0.6,
+     "very_fast": 0.4,
+ }
+ SAMPLE_RATE = 22050
+ NOISE_SCALE_W = 0.8
+ NOISE_SCALE = 0.667
+ PAD = "_"  # padding (0)
+ BOS = "^"  # beginning of sentence
+ EOS = "$"  # end of sentence
+
+
+ def text_to_speech(text: str, speed: str, output_path):
+     speed = speed.strip()
+     length_scale = float(SPEED_VALUES[speed])
+     sess_options = onnxruntime.SessionOptions()
+     model = onnxruntime.InferenceSession(model_name, sess_options=sess_options)
+     config = load_config(model_name)
+     text = text.strip()
+     phonemes_list = phonemize(config, text)
+     phoneme_ids = []
+     for phonemes in phonemes_list:
+         phoneme_ids.append(phonemes_to_ids(config, phonemes))
+
+     speaker_id = None
+     # Concatenate per-sentence id sequences, separated by a few pad ids
+     phoneme_ids_flatten = []
+     for i in phoneme_ids:
+         phoneme_ids_flatten += i + [0, 0, 0]
+     text = np.expand_dims(np.array(phoneme_ids_flatten, dtype=np.int64), 0)
+     text_lengths = np.array([text.shape[1]], dtype=np.int64)
+     scales = np.array(
+         [NOISE_SCALE, length_scale, NOISE_SCALE_W],
+         dtype=np.float32,
+     )
+
+     inputs = {
+         "input": text,
+         "input_lengths": text_lengths,
+         "scales": scales,
+     }
+     # Only feed a speaker id for multi-speaker models
+     if speaker_id is not None:
+         inputs["sid"] = np.array([speaker_id], dtype=np.int64)
+
+     start_time = time.perf_counter()
+     audio = model.run(None, inputs)[0].squeeze((0, 1))
+     audio = audio_float_to_int16(audio.squeeze())
+     end_time = time.perf_counter()
+
+     audio_duration_sec = audio.shape[-1] / SAMPLE_RATE
+     infer_sec = end_time - start_time
+     sf.write(str(output_path), audio, SAMPLE_RATE)
+     return output_path
+
+
+ def audio_float_to_int16(
+     audio: np.ndarray, max_wav_value: float = 32767.0
+ ) -> np.ndarray:
+     """Normalize audio and convert to int16 range."""
+     audio_norm = audio * (max_wav_value / max(0.01, np.max(np.abs(audio))))
+     audio_norm = np.clip(audio_norm, -max_wav_value, max_wav_value)
+     audio_norm = audio_norm.astype("int16")
+     return audio_norm
+
+
+ class PhonemeType(str, Enum):
+     ESPEAK = "espeak"
+     TEXT = "text"
+
+
+ def phonemize(config, text: str) -> List[List[str]]:
+     """Text to phonemes grouped by sentence."""
+     if config["phoneme_type"] == PhonemeType.ESPEAK:
+         if config["espeak"]["voice"] == "ar":
+             # Arabic diacritization
+             # https://github.com/mush42/libtashkeel/
+             text = tashkeel_run(text)
+         return phonemize_espeak(text, config["espeak"]["voice"])
+     if config["phoneme_type"] == PhonemeType.TEXT:
+         return phonemize_codepoints(text)
+     raise ValueError(f'Unexpected phoneme type: {config["phoneme_type"]}')
+
+
+ def phonemes_to_ids(config, phonemes: List[str]) -> List[int]:
+     """Phonemes to ids, padded between symbols and wrapped in BOS/EOS."""
+     id_map = config["phoneme_id_map"]
+     ids: List[int] = list(id_map[BOS])
+     for phoneme in phonemes:
+         if phoneme not in id_map:
+             print(f"Missing phoneme from id map: {phoneme}")
+             continue
+         ids.extend(id_map[phoneme])
+         ids.extend(id_map[PAD])
+     ids.extend(id_map[EOS])
+     return ids
+
+
+ def load_config(model):
+     # The config keys include IPA symbols, so force UTF-8
+     with open(f"{model}.json", "r", encoding="utf-8") as file:
+         config = json.load(file)
+     return config
+
+
+ def denoise(
+     audio: np.ndarray, bias_spec: np.ndarray, denoiser_strength: float
+ ) -> np.ndarray:
+     """Subtract a scaled bias magnitude spectrogram from the audio's STFT."""
+     audio_spec, audio_angles = transform(audio)
+
+     a = bias_spec.shape[-1]
+     b = audio_spec.shape[-1]
+     repeats = max(1, math.ceil(b / a))
+     bias_spec_repeat = np.repeat(bias_spec, repeats, axis=-1)[..., :b]
+
+     audio_spec_denoised = audio_spec - (bias_spec_repeat * denoiser_strength)
+     audio_spec_denoised = np.clip(audio_spec_denoised, a_min=0.0, a_max=None)
+     audio_denoised = inverse(audio_spec_denoised, audio_angles)
+
+     return audio_denoised
+
+
+ def stft(x, fft_size, hopsamp):
+     """Compute and return the STFT of the supplied time domain signal x.
+     Args:
+         x (1-dim Numpy array): A time domain signal.
+         fft_size (int): FFT size. Should be a power of 2, otherwise DFT will be used.
+         hopsamp (int): The hop size, in samples.
+     Returns:
+         The STFT. The rows are the time slices and columns are the frequency bins.
+     """
+     window = np.hanning(fft_size)
+     fft_size = int(fft_size)
+     hopsamp = int(hopsamp)
+     return np.array(
+         [
+             np.fft.rfft(window * x[i : i + fft_size])
+             for i in range(0, len(x) - fft_size, hopsamp)
+         ]
+     )
+
+
+ def istft(X, fft_size, hopsamp):
+     """Invert a STFT into a time domain signal.
+     Args:
+         X (2-dim Numpy array): Input spectrogram. The rows are the time slices and columns are the frequency bins.
+         fft_size (int): FFT size, in samples.
+         hopsamp (int): The hop size, in samples.
+     Returns:
+         The inverse STFT.
+     """
+     fft_size = int(fft_size)
+     hopsamp = int(hopsamp)
+     window = np.hanning(fft_size)
+     time_slices = X.shape[0]
+     len_samples = int(time_slices * hopsamp + fft_size)
+     x = np.zeros(len_samples)
+     # Overlap-add each windowed inverse FFT frame
+     for n, i in enumerate(range(0, len(x) - fft_size, hopsamp)):
+         x[i : i + fft_size] += window * np.real(np.fft.irfft(X[n]))
+     return x
+
+
+ def inverse(magnitude, phase):
+     """Rebuild complex spectrograms from magnitude/phase and invert them."""
+     recombine_magnitude_phase = np.concatenate(
+         [magnitude * np.cos(phase), magnitude * np.sin(phase)], axis=1
+     )
+
+     x_org = recombine_magnitude_phase
+     n_b, n_f, n_t = x_org.shape
+     x = np.empty([n_b, n_f // 2, n_t], dtype=np.complex64)
+     x.real = x_org[:, : n_f // 2]
+     x.imag = x_org[:, n_f // 2 :]
+     inverse_transform = []
+     for y in x:
+         y_ = istft(y.T, fft_size=1024, hopsamp=256)
+         inverse_transform.append(y_[None, :])
+
+     inverse_transform = np.concatenate(inverse_transform, 0)
+
+     return inverse_transform
+
+
+ def transform(input_data):
+     """STFT each signal in a batch; return magnitude and phase arrays."""
+     x = input_data
+     real_part = []
+     imag_part = []
+     for y in x:
+         y_ = stft(y, fft_size=1024, hopsamp=256).T
+         real_part.append(y_.real[None, :, :])
+         imag_part.append(y_.imag[None, :, :])
+     real_part = np.concatenate(real_part, 0)
+     imag_part = np.concatenate(imag_part, 0)
+
+     magnitude = np.sqrt(real_part**2 + imag_part**2)
+     phase = np.arctan2(imag_part, real_part)
+
+     return magnitude, phase
+
+
+ def split_audio(text, speed):
+     """Split text into sentences and synthesize one WAV file per sentence."""
+     sentences = [sentence.strip() for sentence in re.split(r'(?<=[.!?…;:—])\s+', text) if sentence]
+
+     for i, sentence in enumerate(sentences, 1):
+         output_file = f"output_speech_{i}.wav"
+         text_to_speech(sentence, speed, output_file)
+         print(f"Generated: {output_file}")
+
+
+ # "All people are born with equal rights. The Creator grants them rights no one can violate."
+ text = "Tất cả mọi người đều sinh ra có quyền bình đẳng. Tạo hóa cho họ những quyền không ai có thể xâm phạm được."
+ speed = "normal"
+ split_audio(text, speed)
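
tts.py defines denoise(), transform(), inverse(), stft(), and istft() but never calls them. A hedged sketch of one way to wire them up for simple spectral subtraction on a generated clip; the file name, the 10-frame noise estimate, and the 0.05 strength are illustrative assumptions, not values from the commit:

import numpy as np
import soundfile as sf

audio, sr = sf.read("output_speech_1.wav", dtype="float32")
spec, _ = transform(audio[None, :])   # magnitude spectrogram, (1, freq_bins, frames)
bias_spec = spec[:, :, :10]           # assume the first ~10 frames are non-speech noise
cleaned = denoise(audio[None, :], bias_spec, denoiser_strength=0.05)
sf.write("output_speech_1_denoised.wav", cleaned.squeeze().astype(np.float32), sr)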