diff --git a/chatterbox/src/chatterbox/__init__.py b/chatterbox/src/chatterbox/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ad4d015e2d30bb7e1f053f5d692f8bdf67014ed
--- /dev/null
+++ b/chatterbox/src/chatterbox/__init__.py
@@ -0,0 +1 @@
+from .tts import ChatterboxTTS
\ No newline at end of file
diff --git a/orator/src/orator/models/s3gen/__init__.py b/chatterbox/src/chatterbox/models/s3gen/__init__.py
similarity index 100%
rename from orator/src/orator/models/s3gen/__init__.py
rename to chatterbox/src/chatterbox/models/s3gen/__init__.py
diff --git a/orator/src/orator/models/s3gen/const.py b/chatterbox/src/chatterbox/models/s3gen/const.py
similarity index 100%
rename from orator/src/orator/models/s3gen/const.py
rename to chatterbox/src/chatterbox/models/s3gen/const.py
diff --git a/orator/src/orator/models/s3gen/decoder.py b/chatterbox/src/chatterbox/models/s3gen/decoder.py
similarity index 100%
rename from orator/src/orator/models/s3gen/decoder.py
rename to chatterbox/src/chatterbox/models/s3gen/decoder.py
diff --git a/orator/src/orator/models/s3gen/f0_predictor.py b/chatterbox/src/chatterbox/models/s3gen/f0_predictor.py
similarity index 100%
rename from orator/src/orator/models/s3gen/f0_predictor.py
rename to chatterbox/src/chatterbox/models/s3gen/f0_predictor.py
diff --git a/orator/src/orator/models/s3gen/flow.py b/chatterbox/src/chatterbox/models/s3gen/flow.py
similarity index 100%
rename from orator/src/orator/models/s3gen/flow.py
rename to chatterbox/src/chatterbox/models/s3gen/flow.py
diff --git a/orator/src/orator/models/s3gen/flow_matching.py b/chatterbox/src/chatterbox/models/s3gen/flow_matching.py
similarity index 100%
rename from orator/src/orator/models/s3gen/flow_matching.py
rename to chatterbox/src/chatterbox/models/s3gen/flow_matching.py
diff --git a/orator/src/orator/models/s3gen/hifigan.py b/chatterbox/src/chatterbox/models/s3gen/hifigan.py
similarity index 100%
rename from orator/src/orator/models/s3gen/hifigan.py
rename to chatterbox/src/chatterbox/models/s3gen/hifigan.py
diff --git a/orator/src/orator/models/s3gen/matcha/decoder.py b/chatterbox/src/chatterbox/models/s3gen/matcha/decoder.py
similarity index 100%
rename from orator/src/orator/models/s3gen/matcha/decoder.py
rename to chatterbox/src/chatterbox/models/s3gen/matcha/decoder.py
diff --git a/orator/src/orator/models/s3gen/matcha/flow_matching.py b/chatterbox/src/chatterbox/models/s3gen/matcha/flow_matching.py
similarity index 100%
rename from orator/src/orator/models/s3gen/matcha/flow_matching.py
rename to chatterbox/src/chatterbox/models/s3gen/matcha/flow_matching.py
diff --git a/orator/src/orator/models/s3gen/matcha/text_encoder.py b/chatterbox/src/chatterbox/models/s3gen/matcha/text_encoder.py
similarity index 100%
rename from orator/src/orator/models/s3gen/matcha/text_encoder.py
rename to chatterbox/src/chatterbox/models/s3gen/matcha/text_encoder.py
diff --git a/orator/src/orator/models/s3gen/matcha/transformer.py b/chatterbox/src/chatterbox/models/s3gen/matcha/transformer.py
similarity index 100%
rename from orator/src/orator/models/s3gen/matcha/transformer.py
rename to chatterbox/src/chatterbox/models/s3gen/matcha/transformer.py
diff --git a/orator/src/orator/models/s3gen/s3gen.py b/chatterbox/src/chatterbox/models/s3gen/s3gen.py
similarity index 100%
rename from orator/src/orator/models/s3gen/s3gen.py
rename to chatterbox/src/chatterbox/models/s3gen/s3gen.py
diff --git a/orator/src/orator/models/s3gen/transformer/__init__.py b/chatterbox/src/chatterbox/models/s3gen/transformer/__init__.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/__init__.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/__init__.py
diff --git a/orator/src/orator/models/s3gen/transformer/activation.py b/chatterbox/src/chatterbox/models/s3gen/transformer/activation.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/activation.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/activation.py
diff --git a/orator/src/orator/models/s3gen/transformer/attention.py b/chatterbox/src/chatterbox/models/s3gen/transformer/attention.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/attention.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/attention.py
diff --git a/orator/src/orator/models/s3gen/transformer/convolution.py b/chatterbox/src/chatterbox/models/s3gen/transformer/convolution.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/convolution.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/convolution.py
diff --git a/orator/src/orator/models/s3gen/transformer/embedding.py b/chatterbox/src/chatterbox/models/s3gen/transformer/embedding.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/embedding.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/embedding.py
diff --git a/orator/src/orator/models/s3gen/transformer/encoder_layer.py b/chatterbox/src/chatterbox/models/s3gen/transformer/encoder_layer.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/encoder_layer.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/encoder_layer.py
diff --git a/orator/src/orator/models/s3gen/transformer/positionwise_feed_forward.py b/chatterbox/src/chatterbox/models/s3gen/transformer/positionwise_feed_forward.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/positionwise_feed_forward.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/positionwise_feed_forward.py
diff --git a/orator/src/orator/models/s3gen/transformer/subsampling.py b/chatterbox/src/chatterbox/models/s3gen/transformer/subsampling.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/subsampling.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/subsampling.py
diff --git a/orator/src/orator/models/s3gen/transformer/upsample_encoder.py b/chatterbox/src/chatterbox/models/s3gen/transformer/upsample_encoder.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/upsample_encoder.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/upsample_encoder.py
diff --git a/orator/src/orator/models/s3gen/utils/class_utils.py b/chatterbox/src/chatterbox/models/s3gen/utils/class_utils.py
similarity index 100%
rename from orator/src/orator/models/s3gen/utils/class_utils.py
rename to chatterbox/src/chatterbox/models/s3gen/utils/class_utils.py
diff --git a/orator/src/orator/models/s3gen/utils/mask.py b/chatterbox/src/chatterbox/models/s3gen/utils/mask.py
similarity index 100%
rename from orator/src/orator/models/s3gen/utils/mask.py
rename to chatterbox/src/chatterbox/models/s3gen/utils/mask.py
diff --git a/orator/src/orator/models/s3gen/utils/mel.py b/chatterbox/src/chatterbox/models/s3gen/utils/mel.py
similarity index 100%
rename from orator/src/orator/models/s3gen/utils/mel.py
rename to chatterbox/src/chatterbox/models/s3gen/utils/mel.py
diff --git a/orator/src/orator/models/s3gen/xvector.py b/chatterbox/src/chatterbox/models/s3gen/xvector.py
similarity index 100%
rename from orator/src/orator/models/s3gen/xvector.py
rename to chatterbox/src/chatterbox/models/s3gen/xvector.py
diff --git a/orator/src/orator/models/s3tokenizer/__init__.py b/chatterbox/src/chatterbox/models/s3tokenizer/__init__.py
similarity index 100%
rename from orator/src/orator/models/s3tokenizer/__init__.py
rename to chatterbox/src/chatterbox/models/s3tokenizer/__init__.py
diff --git a/orator/src/orator/models/s3tokenizer/s3tokenizer.py b/chatterbox/src/chatterbox/models/s3tokenizer/s3tokenizer.py
similarity index 100%
rename from orator/src/orator/models/s3tokenizer/s3tokenizer.py
rename to chatterbox/src/chatterbox/models/s3tokenizer/s3tokenizer.py
diff --git a/orator/src/orator/models/t3/__init__.py b/chatterbox/src/chatterbox/models/t3/__init__.py
similarity index 100%
rename from orator/src/orator/models/t3/__init__.py
rename to chatterbox/src/chatterbox/models/t3/__init__.py
diff --git a/orator/src/orator/models/t3/inference/alignment_stream_analyzer.py b/chatterbox/src/chatterbox/models/t3/inference/alignment_stream_analyzer.py
similarity index 100%
rename from orator/src/orator/models/t3/inference/alignment_stream_analyzer.py
rename to chatterbox/src/chatterbox/models/t3/inference/alignment_stream_analyzer.py
diff --git a/orator/src/orator/models/t3/inference/t3_hf_backend.py b/chatterbox/src/chatterbox/models/t3/inference/t3_hf_backend.py
similarity index 100%
rename from orator/src/orator/models/t3/inference/t3_hf_backend.py
rename to chatterbox/src/chatterbox/models/t3/inference/t3_hf_backend.py
diff --git a/orator/src/orator/models/t3/llama_configs.py b/chatterbox/src/chatterbox/models/t3/llama_configs.py
similarity index 100%
rename from orator/src/orator/models/t3/llama_configs.py
rename to chatterbox/src/chatterbox/models/t3/llama_configs.py
diff --git a/orator/src/orator/models/t3/modules/cond_enc.py b/chatterbox/src/chatterbox/models/t3/modules/cond_enc.py
similarity index 100%
rename from orator/src/orator/models/t3/modules/cond_enc.py
rename to chatterbox/src/chatterbox/models/t3/modules/cond_enc.py
diff --git a/orator/src/orator/models/t3/modules/learned_pos_emb.py b/chatterbox/src/chatterbox/models/t3/modules/learned_pos_emb.py
similarity index 100%
rename from orator/src/orator/models/t3/modules/learned_pos_emb.py
rename to chatterbox/src/chatterbox/models/t3/modules/learned_pos_emb.py
diff --git a/orator/src/orator/models/t3/modules/perceiver.py b/chatterbox/src/chatterbox/models/t3/modules/perceiver.py
similarity index 100%
rename from orator/src/orator/models/t3/modules/perceiver.py
rename to chatterbox/src/chatterbox/models/t3/modules/perceiver.py
diff --git a/orator/src/orator/models/t3/modules/t3_config.py b/chatterbox/src/chatterbox/models/t3/modules/t3_config.py
similarity index 100%
rename from orator/src/orator/models/t3/modules/t3_config.py
rename to chatterbox/src/chatterbox/models/t3/modules/t3_config.py
diff --git a/orator/src/orator/models/t3/t3.py b/chatterbox/src/chatterbox/models/t3/t3.py
similarity index 100%
rename from orator/src/orator/models/t3/t3.py
rename to chatterbox/src/chatterbox/models/t3/t3.py
diff --git a/orator/src/orator/models/tokenizers/__init__.py b/chatterbox/src/chatterbox/models/tokenizers/__init__.py
similarity index 100%
rename from orator/src/orator/models/tokenizers/__init__.py
rename to chatterbox/src/chatterbox/models/tokenizers/__init__.py
diff --git a/orator/src/orator/models/tokenizers/tokenizer.py b/chatterbox/src/chatterbox/models/tokenizers/tokenizer.py
similarity index 100%
rename from orator/src/orator/models/tokenizers/tokenizer.py
rename to chatterbox/src/chatterbox/models/tokenizers/tokenizer.py
diff --git a/orator/src/orator/models/voice_encoder/__init__.py b/chatterbox/src/chatterbox/models/voice_encoder/__init__.py
similarity index 100%
rename from orator/src/orator/models/voice_encoder/__init__.py
rename to chatterbox/src/chatterbox/models/voice_encoder/__init__.py
diff --git a/orator/src/orator/models/voice_encoder/config.py b/chatterbox/src/chatterbox/models/voice_encoder/config.py
similarity index 100%
rename from orator/src/orator/models/voice_encoder/config.py
rename to chatterbox/src/chatterbox/models/voice_encoder/config.py
diff --git a/orator/src/orator/models/voice_encoder/melspec.py b/chatterbox/src/chatterbox/models/voice_encoder/melspec.py
similarity index 100%
rename from orator/src/orator/models/voice_encoder/melspec.py
rename to chatterbox/src/chatterbox/models/voice_encoder/melspec.py
diff --git a/orator/src/orator/models/voice_encoder/voice_encoder.py b/chatterbox/src/chatterbox/models/voice_encoder/voice_encoder.py
similarity index 100%
rename from orator/src/orator/models/voice_encoder/voice_encoder.py
rename to chatterbox/src/chatterbox/models/voice_encoder/voice_encoder.py
diff --git a/orator/src/orator/tts.py b/chatterbox/src/chatterbox/tts.py
similarity index 97%
rename from orator/src/orator/tts.py
rename to chatterbox/src/chatterbox/tts.py
index 5081c6075aa6c58003fd3116e85e35304dbbce40..8d2ceb64b2c7a033f7933d2272b01bd6e6652b1c 100644
--- a/orator/src/orator/tts.py
+++ b/chatterbox/src/chatterbox/tts.py
@@ -14,7 +14,7 @@
 from .models.voice_encoder import VoiceEncoder
 from .models.t3.modules.cond_enc import T3Cond
 
-REPO_ID = "ResembleAI/Orator"
+REPO_ID = "ResembleAI/chatterbox"
 
 
 def change_pace(speech_tokens: torch.Tensor, pace: float):
@@ -68,7 +68,7 @@ class Conditionals:
         return cls(T3Cond(**kwargs['t3']), kwargs['gen'])
 
 
-class OratorTTS:
+class ChatterboxTTS:
     ENC_COND_LEN = 6 * S3_SR
     DEC_COND_LEN = 10 * S3GEN_SR
 
@@ -90,7 +90,7 @@
         self.conds = conds
 
     @classmethod
-    def from_local(cls, ckpt_dir, device) -> 'OratorTTS':
+    def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
         ckpt_dir = Path(ckpt_dir)
 
         ve = VoiceEncoder()
@@ -122,7 +122,7 @@
         return cls(t3, s3gen, ve, tokenizer, device, conds=conds)
 
     @classmethod
-    def from_pretrained(cls, device) -> 'OratorTTS':
+    def from_pretrained(cls, device) -> 'ChatterboxTTS':
         for fpath in ["ve.pt", "t3.pt", "s3gen.pt", "tokenizer.json", "conds.pt"]:
             local_path = hf_hub_download(repo_id=REPO_ID, filename=fpath)
 
diff --git a/orator/src/orator/vc.py b/chatterbox/src/chatterbox/vc.py
similarity index 93%
rename from orator/src/orator/vc.py
rename to chatterbox/src/chatterbox/vc.py
index df140b3bcd5b383b0d9a4417c764f9a31ea379d8..ea5ec21e25f671188e439f33307e5890421ff4af 100644
--- a/orator/src/orator/vc.py
+++ b/chatterbox/src/chatterbox/vc.py
@@ -8,10 +8,10 @@
 from .models.s3tokenizer import S3_SR
 from .models.s3gen import S3GEN_SR, S3Gen
 
-REPO_ID = "ResembleAI/Orator"
+REPO_ID = "ResembleAI/chatterbox"
 
 
-class OratorVC:
+class ChatterboxVC:
     ENC_COND_LEN = 6 * S3_SR
     DEC_COND_LEN = 10 * S3GEN_SR
@@ -33,7 +33,7 @@
     }
 
     @classmethod
-    def from_local(cls, ckpt_dir, device) -> 'OratorVC':
+    def from_local(cls, ckpt_dir, device) -> 'ChatterboxVC':
         ckpt_dir = Path(ckpt_dir)
         ref_dict = None
         if (builtin_voice := ckpt_dir / "conds.pt").exists():
@@ -49,7 +49,7 @@
         return cls(s3gen, device, ref_dict=ref_dict)
 
     @classmethod
-    def from_pretrained(cls, device) -> 'OratorVC':
+    def from_pretrained(cls, device) -> 'ChatterboxVC':
         for fpath in ["s3gen.pt", "conds.pt"]:
             local_path = hf_hub_download(repo_id=REPO_ID, filename=fpath)
 
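Note: after this rename, the public entry point is `ChatterboxTTS`, loaded from the new `ResembleAI/chatterbox` repo id. A minimal usage sketch follows; the `generate()` call, the `sr` attribute, and the torchaudio save step are assumptions for illustration, since only the constructors appear in this diff.

```python
# Minimal sketch, assuming the renamed package is installed and exposes
# ChatterboxTTS as in chatterbox/__init__.py above. generate() and model.sr
# are assumed -- only from_pretrained()/from_local() are shown in this diff.
import torchaudio

from chatterbox import ChatterboxTTS

model = ChatterboxTTS.from_pretrained(device="cuda")
# model = ChatterboxTTS.from_local("path/to/ckpt_dir", "cuda")  # local checkpoint dir
wav = model.generate("Hello from the renamed project.")  # assumed synthesis entry point
torchaudio.save("out.wav", wav, model.sr)                # assumed sample-rate attribute
```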
"conds.pt").exists(): @@ -49,7 +49,7 @@ class OratorVC: return cls(s3gen, device, ref_dict=ref_dict) @classmethod - def from_pretrained(cls, device) -> 'OratorVC': + def from_pretrained(cls, device) -> 'ChatterboxVC': for fpath in ["s3gen.pt", "conds.pt"]: local_path = hf_hub_download(repo_id=REPO_ID, filename=fpath) diff --git a/orator/src/orator.egg-info/PKG-INFO b/orator/src/orator.egg-info/PKG-INFO deleted file mode 100644 index 8a2974074c3fca0ccd532f1159641db692426a02..0000000000000000000000000000000000000000 --- a/orator/src/orator.egg-info/PKG-INFO +++ /dev/null @@ -1,17 +0,0 @@ -Metadata-Version: 2.4 -Name: orator -Version: 0.1 -Description-Content-Type: text/markdown -Requires-Dist: numpy==1.26.0 -Requires-Dist: resampy==0.4.3 -Requires-Dist: librosa==0.10.0 -Requires-Dist: s3tokenizer -Requires-Dist: torch==2.6.0 -Requires-Dist: torchaudio==2.6.0 -Requires-Dist: transformers==4.46.3 -Requires-Dist: diffusers==0.29.0 -Requires-Dist: omegaconf==2.3.0 -Requires-Dist: conformer==0.3.2 - -# orator -Open source TTS model diff --git a/orator/src/orator.egg-info/SOURCES.txt b/orator/src/orator.egg-info/SOURCES.txt deleted file mode 100644 index cada9c6ef8507252610ee20e70be2314eeb74cca..0000000000000000000000000000000000000000 --- a/orator/src/orator.egg-info/SOURCES.txt +++ /dev/null @@ -1,52 +0,0 @@ -README.md -pyproject.toml -src/orator/__init__.py -src/orator/model_checkpoints.py -src/orator/tts.py -src/orator.egg-info/PKG-INFO -src/orator.egg-info/SOURCES.txt -src/orator.egg-info/dependency_links.txt -src/orator.egg-info/requires.txt -src/orator.egg-info/top_level.txt -src/orator/models/s3gen/__init__.py -src/orator/models/s3gen/const.py -src/orator/models/s3gen/decoder.py -src/orator/models/s3gen/f0_predictor.py -src/orator/models/s3gen/flow.py -src/orator/models/s3gen/flow_matching.py -src/orator/models/s3gen/hifigan.py -src/orator/models/s3gen/s3gen.py -src/orator/models/s3gen/xvector.py -src/orator/models/s3gen/matcha/decoder.py -src/orator/models/s3gen/matcha/flow_matching.py -src/orator/models/s3gen/matcha/text_encoder.py -src/orator/models/s3gen/matcha/transformer.py -src/orator/models/s3gen/transformer/__init__.py -src/orator/models/s3gen/transformer/activation.py -src/orator/models/s3gen/transformer/attention.py -src/orator/models/s3gen/transformer/convolution.py -src/orator/models/s3gen/transformer/embedding.py -src/orator/models/s3gen/transformer/encoder_layer.py -src/orator/models/s3gen/transformer/positionwise_feed_forward.py -src/orator/models/s3gen/transformer/subsampling.py -src/orator/models/s3gen/transformer/upsample_encoder.py -src/orator/models/s3gen/utils/class_utils.py -src/orator/models/s3gen/utils/mask.py -src/orator/models/s3gen/utils/mel.py -src/orator/models/s3tokenizer/__init__.py -src/orator/models/s3tokenizer/s3tokenizer.py -src/orator/models/t3/__init__.py -src/orator/models/t3/llama_configs.py -src/orator/models/t3/t3.py -src/orator/models/t3/inference/t3_hf_backend.py -src/orator/models/t3/modules/cond_enc.py -src/orator/models/t3/modules/learned_pos_emb.py -src/orator/models/t3/modules/perceiver.py -src/orator/models/t3/modules/t3_config.py -src/orator/models/tokenizers/__init__.py -src/orator/models/tokenizers/tokenizer.py -src/orator/models/voice_encoder/__init__.py -src/orator/models/voice_encoder/voice_encoder.py -src/orator/transforms/spectrogram.py -src/orator/transforms/syn_transforms.py -src/orator/transforms/webrtc.py \ No newline at end of file diff --git a/orator/src/orator.egg-info/dependency_links.txt 
b/orator/src/orator.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891791fe96927ad78e64b0aad7bded08bdc..0000000000000000000000000000000000000000 --- a/orator/src/orator.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/orator/src/orator.egg-info/requires.txt b/orator/src/orator.egg-info/requires.txt deleted file mode 100644 index d5214bc4cfaf3ee370117d092878abedde29e924..0000000000000000000000000000000000000000 --- a/orator/src/orator.egg-info/requires.txt +++ /dev/null @@ -1,10 +0,0 @@ -numpy==1.26.0 -resampy==0.4.3 -librosa==0.10.0 -s3tokenizer -torch==2.6.0 -torchaudio==2.6.0 -transformers==4.46.3 -diffusers==0.29.0 -omegaconf==2.3.0 -conformer==0.3.2 diff --git a/orator/src/orator.egg-info/top_level.txt b/orator/src/orator.egg-info/top_level.txt deleted file mode 100644 index 043cdf6363c1d150134985603544ec5a33b0d53a..0000000000000000000000000000000000000000 --- a/orator/src/orator.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -orator diff --git a/orator/src/orator/__init__.py b/orator/src/orator/__init__.py deleted file mode 100644 index e2b13b095b627341de9c4a1e63ca91b180910ee3..0000000000000000000000000000000000000000 --- a/orator/src/orator/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .tts import OratorTTS \ No newline at end of file diff --git a/orator/src/orator/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 4d73364ffc4777d01681ff959635ee6123e075a6..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/__pycache__/tts.cpython-311.pyc b/orator/src/orator/__pycache__/tts.cpython-311.pyc deleted file mode 100644 index 090fe10720565fb5bc2b44a709c10af6512786bb..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/__pycache__/tts.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/model_checkpoints.py b/orator/src/orator/model_checkpoints.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/orator/src/orator/models/bigvgan/__pycache__/activations.cpython-311.pyc b/orator/src/orator/models/bigvgan/__pycache__/activations.cpython-311.pyc deleted file mode 100644 index abdfafb4abc0f4c125638ebe9d4f456039bf68fc..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/models/bigvgan/__pycache__/activations.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/models/bigvgan/__pycache__/bigvgan.cpython-311.pyc b/orator/src/orator/models/bigvgan/__pycache__/bigvgan.cpython-311.pyc deleted file mode 100644 index 9bc39b3811393b032fad3f25eb8c822ad831b6c0..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/models/bigvgan/__pycache__/bigvgan.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/models/bigvgan/activations.py b/orator/src/orator/models/bigvgan/activations.py deleted file mode 100644 index 30a3c85145eb147e61331f9dbd5d2b3650146851..0000000000000000000000000000000000000000 --- a/orator/src/orator/models/bigvgan/activations.py +++ /dev/null @@ -1,120 +0,0 @@ -# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. -# LICENSE is in incl_licenses directory. 
- -import torch -from torch import nn, sin, pow -from torch.nn import Parameter - - -class Snake(nn.Module): - ''' - Implementation of a sine-based periodic activation function - Shape: - - Input: (B, C, T) - - Output: (B, C, T), same shape as the input - Parameters: - - alpha - trainable parameter - References: - - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: - https://arxiv.org/abs/2006.08195 - Examples: - >>> a1 = snake(256) - >>> x = torch.randn(256) - >>> x = a1(x) - ''' - def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): - ''' - Initialization. - INPUT: - - in_features: shape of the input - - alpha: trainable parameter - alpha is initialized to 1 by default, higher values = higher-frequency. - alpha will be trained along with the rest of your model. - ''' - super(Snake, self).__init__() - self.in_features = in_features - - # initialize alpha - self.alpha_logscale = alpha_logscale - if self.alpha_logscale: # log scale alphas initialized to zeros - self.alpha = Parameter(torch.zeros(in_features) * alpha) - else: # linear scale alphas initialized to ones - self.alpha = Parameter(torch.ones(in_features) * alpha) - - self.alpha.requires_grad = alpha_trainable - - self.no_div_by_zero = 0.000000001 - - def forward(self, x): - ''' - Forward pass of the function. - Applies the function to the input elementwise. - Snake ∶= x + 1/a * sin^2 (xa) - ''' - alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] - if self.alpha_logscale: - alpha = torch.exp(alpha) - x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) - - return x - - -class SnakeBeta(nn.Module): - ''' - A modified Snake function which uses separate parameters for the magnitude of the periodic components - Shape: - - Input: (B, C, T) - - Output: (B, C, T), same shape as the input - Parameters: - - alpha - trainable parameter that controls frequency - - beta - trainable parameter that controls magnitude - References: - - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: - https://arxiv.org/abs/2006.08195 - Examples: - >>> a1 = snakebeta(256) - >>> x = torch.randn(256) - >>> x = a1(x) - ''' - def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): - ''' - Initialization. - INPUT: - - in_features: shape of the input - - alpha - trainable parameter that controls frequency - - beta - trainable parameter that controls magnitude - alpha is initialized to 1 by default, higher values = higher-frequency. - beta is initialized to 1 by default, higher values = higher-magnitude. - alpha will be trained along with the rest of your model. - ''' - super(SnakeBeta, self).__init__() - self.in_features = in_features - - # initialize alpha - self.alpha_logscale = alpha_logscale - if self.alpha_logscale: # log scale alphas initialized to zeros - self.alpha = Parameter(torch.zeros(in_features) * alpha) - self.beta = Parameter(torch.zeros(in_features) * alpha) - else: # linear scale alphas initialized to ones - self.alpha = Parameter(torch.ones(in_features) * alpha) - self.beta = Parameter(torch.ones(in_features) * alpha) - - self.alpha.requires_grad = alpha_trainable - self.beta.requires_grad = alpha_trainable - - self.no_div_by_zero = 0.000000001 - - def forward(self, x): - ''' - Forward pass of the function. - Applies the function to the input elementwise. 
- SnakeBeta ∶= x + 1/b * sin^2 (xa) - ''' - alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] - beta = self.beta.unsqueeze(0).unsqueeze(-1) - if self.alpha_logscale: - alpha = torch.exp(alpha) - beta = torch.exp(beta) - x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) - - return x diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/__init__.py b/orator/src/orator/models/bigvgan/alias_free_torch/__init__.py deleted file mode 100644 index 8f756ed83f87f9839e457b240f60469bc187707d..0000000000000000000000000000000000000000 --- a/orator/src/orator/models/bigvgan/alias_free_torch/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 -# LICENSE is in incl_licenses directory. - -from .filter import * -from .resample import * -from .act import * diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index fdf57d13c2b1e94a2c20321d6fcab00ee86ba913..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/act.cpython-311.pyc b/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/act.cpython-311.pyc deleted file mode 100644 index 7e4a139e20f899bddacf05d467861e2857286268..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/act.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/filter.cpython-311.pyc b/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/filter.cpython-311.pyc deleted file mode 100644 index b6416602224f887fa03f8bce27fc952f8f6ff23a..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/filter.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/resample.cpython-311.pyc b/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/resample.cpython-311.pyc deleted file mode 100644 index af56e62e2e4bffcd9444f653101a91af4241494b..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/resample.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/act.py b/orator/src/orator/models/bigvgan/alias_free_torch/act.py deleted file mode 100644 index ef231b01506f01c2b66d2dc4f3f0891219b3b41a..0000000000000000000000000000000000000000 --- a/orator/src/orator/models/bigvgan/alias_free_torch/act.py +++ /dev/null @@ -1,28 +0,0 @@ -# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 -# LICENSE is in incl_licenses directory. 
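For context on the activations deleted above: Snake computes `x + (1/α)·sin²(αx)`, i.e. the identity plus a bounded periodic term whose frequency grows with α. A standalone numeric sketch of the formula (not the project's module, just the math):

```python
# Standalone sketch of the Snake activation from the deleted activations.py:
# snake(x) = x + (1/alpha) * sin^2(alpha * x).
import torch

def snake(x: torch.Tensor, alpha: float = 1.0) -> torch.Tensor:
    return x + (1.0 / alpha) * torch.sin(alpha * x) ** 2

x = torch.linspace(-3, 3, 7)
print(snake(x, alpha=1.0))   # identity plus a bounded periodic ripple
print(snake(x, alpha=10.0))  # same trend, higher-frequency ripple
```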
diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/filter.py b/orator/src/orator/models/bigvgan/alias_free_torch/filter.py
deleted file mode 100644
index 066dce8eef9f31a868554f08efbef7c3f4422b7b..0000000000000000000000000000000000000000
--- a/orator/src/orator/models/bigvgan/alias_free_torch/filter.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
-# LICENSE is in incl_licenses directory.
-
-import math
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-if 'sinc' in dir(torch):
-    sinc = torch.sinc
-else:
-    # This code is adopted from adefossez's julius.core.sinc under the MIT License
-    # https://adefossez.github.io/julius/julius/core.html
-    # LICENSE is in incl_licenses directory.
-    def sinc(x: torch.Tensor):
-        """
-        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
-        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
-        """
-        return torch.where(x == 0,
-                           torch.tensor(1., device=x.device, dtype=x.dtype),
-                           torch.sin(math.pi * x) / math.pi / x)
-
-
-# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
-# https://adefossez.github.io/julius/julius/lowpass.html
-# LICENSE is in incl_licenses directory.
-def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):  # return filter [1,1,kernel_size]
-    even = (kernel_size % 2 == 0)
-    half_size = kernel_size // 2
-
-    #For kaiser window
-    delta_f = 4 * half_width
-    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
-    if A > 50.:
-        beta = 0.1102 * (A - 8.7)
-    elif A >= 21.:
-        beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.)
-    else:
-        beta = 0.
-    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
-
-    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
-    if even:
-        time = (torch.arange(-half_size, half_size) + 0.5)
-    else:
-        time = torch.arange(kernel_size) - half_size
-    if cutoff == 0:
-        filter_ = torch.zeros_like(time)
-    else:
-        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
-        # Normalize filter to have sum = 1, otherwise we will have a small leakage
-        # of the constant component in the input signal.
-        filter_ /= filter_.sum()
-    filter = filter_.view(1, 1, kernel_size)
-
-    return filter
-
-
-class LowPassFilter1d(nn.Module):
-    def __init__(self,
-                 cutoff=0.5,
-                 half_width=0.6,
-                 stride: int = 1,
-                 padding: bool = True,
-                 padding_mode: str = 'replicate',
-                 kernel_size: int = 12):
-        # kernel_size should be even number for stylegan3 setup,
-        # in this implementation, odd number is also possible.
-        super().__init__()
-        if cutoff < -0.:
-            raise ValueError("Minimum cutoff must be larger than zero.")
-        if cutoff > 0.5:
-            raise ValueError("A cutoff above 0.5 does not make sense.")
-        self.kernel_size = kernel_size
-        self.even = (kernel_size % 2 == 0)
-        self.pad_left = kernel_size // 2 - int(self.even)
-        self.pad_right = kernel_size // 2
-        self.stride = stride
-        self.padding = padding
-        self.padding_mode = padding_mode
-        filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
-        self.register_buffer("filter", filter)
-
-    #input [B, C, T]
-    def forward(self, x):
-        _, C, _ = x.shape
-
-        if self.padding:
-            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
-        out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
-
-        return out
diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/resample.py b/orator/src/orator/models/bigvgan/alias_free_torch/resample.py
deleted file mode 100644
index 73670db9735504a51231fbe93cb812f722fb74ae..0000000000000000000000000000000000000000
--- a/orator/src/orator/models/bigvgan/alias_free_torch/resample.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
-# LICENSE is in incl_licenses directory.
-
-import torch.nn as nn
-from torch.nn import functional as F
-
-from .filter import LowPassFilter1d
-from .filter import kaiser_sinc_filter1d
-
-
-class UpSample1d(nn.Module):
-    def __init__(self, ratio=2, kernel_size=None):
-        super().__init__()
-        self.ratio = ratio
-        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
-        self.stride = ratio
-        self.pad = self.kernel_size // ratio - 1
-        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
-        self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
-        filter = kaiser_sinc_filter1d(
-            cutoff=0.5 / ratio,
-            half_width=0.6 / ratio,
-            kernel_size=self.kernel_size
-        )
-        self.register_buffer("filter", filter)
-
-    # x: [B, C, T]
-    def forward(self, x):
-        _, C, _ = x.shape
-
-        x = F.pad(x, (self.pad, self.pad), mode='replicate')
-        x = self.ratio * F.conv_transpose1d(
-            x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C
-        )
-        x = x[..., self.pad_left:-self.pad_right]
-
-        return x
-
-
-class DownSample1d(nn.Module):
-    def __init__(self, ratio=2, kernel_size=None):
-        super().__init__()
-        self.ratio = ratio
-        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
-        self.lowpass = LowPassFilter1d(
-            cutoff=0.5 / ratio,
-            half_width=0.6 / ratio,
-            stride=ratio,
-            kernel_size=self.kernel_size
-        )
-
-    def forward(self, x):
-        xx = self.lowpass(x)
-
-        return xx
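For context on the resampling pair deleted above: a 2x anti-aliased upsample followed by a 2x low-pass-filtered downsample should approximately reconstruct the input, apart from edge effects. A sketch, assuming the deleted `UpSample1d`/`DownSample1d` classes are importable from a copy of this module:

```python
# Round-trip sketch for the deleted UpSample1d/DownSample1d (alias_free_torch).
import torch

up = UpSample1d(ratio=2)      # Kaiser-windowed sinc interpolation
down = DownSample1d(ratio=2)  # low-pass filter + strided decimation

t = torch.linspace(0, 1, 256)
x = torch.sin(2 * torch.pi * 8 * t).view(1, 1, -1)  # [B, C, T] = [1, 1, 256]
y = down(up(x))
print(x.shape, y.shape)      # both torch.Size([1, 1, 256])
print((x - y).abs().max())   # small reconstruction error away from the edges
```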
diff --git a/orator/src/orator/models/bigvgan/bigvgan.py b/orator/src/orator/models/bigvgan/bigvgan.py
deleted file mode 100644
index 356142106f6c91b0cd4c8db4ec28e04811e8e1ef..0000000000000000000000000000000000000000
--- a/orator/src/orator/models/bigvgan/bigvgan.py
+++ /dev/null
@@ -1,212 +0,0 @@
-# Copyright (c) 2022 NVIDIA CORPORATION.
-# Licensed under the MIT license.
-# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
-# LICENSE is in incl_licenses directory.
-
-import logging
-from torch.nn import Conv1d, ConvTranspose1d
-from torch.nn.utils import weight_norm, remove_weight_norm
-from torch.nn.utils.weight_norm import WeightNorm
-
-from .activations import SnakeBeta
-from .alias_free_torch import *
-
-
-LRELU_SLOPE = 0.1
-
-logger = logging.getLogger(__name__)
-
-
-def get_padding(kernel_size, dilation=1):
-    return int((kernel_size*dilation - dilation)/2)
-
-
-def init_weights(m, mean=0.0, std=0.01):
-    classname = m.__class__.__name__
-    if classname.find("Conv") != -1:
-        m.weight.data.normal_(mean, std)
-
-
-class AMPBlock1(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
-        super(AMPBlock1, self).__init__()
-
-        self.convs1 = nn.ModuleList([
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
-                               padding=get_padding(kernel_size, dilation[0]))),
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
-                               padding=get_padding(kernel_size, dilation[1]))),
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
-                               padding=get_padding(kernel_size, dilation[2])))
-        ])
-        self.convs1.apply(init_weights)
-
-        self.convs2 = nn.ModuleList([
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))),
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))),
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)))
-        ])
-        self.convs2.apply(init_weights)
-
-        self.num_layers = len(self.convs1) + len(self.convs2)  # total number of conv layers
-
-        self.activations = nn.ModuleList([
-            Activation1d(activation=SnakeBeta(channels, alpha_logscale=True))
-            for _ in range(self.num_layers)
-        ])
-
-    def forward(self, x):
-        acts1, acts2 = self.activations[::2], self.activations[1::2]
-        for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
-            xt = a1(x)
-            xt = c1(xt)
-            xt = a2(xt)
-            xt = c2(xt)
-            x = xt + x
-
-        return x
-
-    def set_weight_norm(self, enabled: bool):
-        weight_norm_fn = weight_norm if enabled else remove_weight_norm
-        for l in self.convs1:
-            weight_norm_fn(l)
-        for l in self.convs2:
-            weight_norm_fn(l)
-
-
-class BigVGAN(nn.Module):
-    # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks.
-
-    # We've got a model in prod that has the wrong hparams for this. It's simpler to add this check than to
-    # redistribute the model.
-    ignore_state_dict_unexpected = ("cond_layer.*",)
-
-    def __init__(self):
-        super().__init__()
-
-        input_dims = 80
-
-        upsample_rates = [10, 8, 4, 2]
-        upsample_kernel_sizes = [x * 2 for x in upsample_rates]
-        upsample_initial_channel = 1024
-
-        resblock_kernel_sizes = [3, 7, 11]
-        resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-        self.num_kernels = len(resblock_kernel_sizes)
-        self.num_upsamples = len(upsample_rates)
-
-        # pre conv
-        self.conv_pre = weight_norm(Conv1d(input_dims, upsample_initial_channel, 7, 1, padding=3))
-        self.cond_layer = None
-
-        # transposed conv-based upsamplers. does not apply anti-aliasing
-        self.ups = nn.ModuleList()
-        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-            self.ups.append(nn.ModuleList([
-                weight_norm(ConvTranspose1d(upsample_initial_channel // (2 ** i),
-                                            upsample_initial_channel // (2 ** (i + 1)),
-                                            k, u, padding=(k - u) // 2))
-            ]))
-
-        # residual blocks using anti-aliased multi-periodicity composition modules (AMP)
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = upsample_initial_channel // (2 ** (i + 1))
-            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
-                self.resblocks.append(AMPBlock1(ch, k, d))
-
-        # post conv
-        activation_post = SnakeBeta(ch, alpha_logscale=True)
-        self.activation_post = Activation1d(activation=activation_post)
-        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
-
-        # weight initialization
-        for i in range(len(self.ups)):
-            self.ups[i].apply(init_weights)
-        self.conv_post.apply(init_weights)
-
-    def forward(self, x) -> torch.Tensor:
-        """
-        Args
-        ----
-        x: torch.Tensor of shape [B, T, C]
-        """
-        with torch.inference_mode():
-            x = self.conv_pre(x)
-
-            for i in range(self.num_upsamples):
-                # upsampling
-                for i_up in range(len(self.ups[i])):
-                    x = self.ups[i][i_up](x)
-                # AMP blocks
-                xs = None
-                for j in range(self.num_kernels):
-                    if xs is None:
-                        xs = self.resblocks[i * self.num_kernels + j](x)
-                    else:
-                        xs += self.resblocks[i * self.num_kernels + j](x)
-                x = xs / self.num_kernels
-
-            # post conv
-            x = self.activation_post(x)
-            x = self.conv_post(x)
-
-            # Bound the output to [-1, 1]
-            x = torch.tanh(x)
-
-        return x
-
-    @property
-    def weight_norm_enabled(self) -> bool:
-        return any(
-            isinstance(hook, WeightNorm) and hook.name == "weight"
-            for k, hook in self.conv_pre._forward_pre_hooks.items()
-        )
-
-    def set_weight_norm(self, enabled: bool):
-        """
-        N.B.: weight norm modifies the state dict, causing incompatibilities. Conventions:
-        - BigVGAN runs with weight norm for training, without for inference (done automatically by instantiate())
-        - All checkpoints are saved with weight norm (allows resuming training)
-        """
-        if enabled != self.weight_norm_enabled:
-            weight_norm_fn = weight_norm if enabled else remove_weight_norm
-            logger.debug(f"{'Applying' if enabled else 'Removing'} weight norm...")
-
-            for l in self.ups:
-                for l_i in l:
-                    weight_norm_fn(l_i)
-            for l in self.resblocks:
-                l.set_weight_norm(enabled)
-            weight_norm_fn(self.conv_pre)
-            weight_norm_fn(self.conv_post)
-
-    def train_mode(self):
-        self.train()
-        self.set_weight_norm(enabled=True)
-
-    def inference_mode(self):
-        self.eval()
-        self.set_weight_norm(enabled=False)
-
-
-if __name__ == '__main__':
-    import sys
-    import soundfile as sf
-
-    model = BigVGAN()
-
-    state_dict = torch.load("bigvgan32k.pt")
-    msg = model.load_state_dict(state_dict)
-    model.eval()
-    model.set_weight_norm(enabled=False)
-
-    print(msg)
-    mels = torch.load("mels.pt")
-    with torch.inference_mode():
-        y = model(mels.cpu())
-
-    for i, wav in enumerate(y):
-        wav = wav.view(-1).detach().numpy()
-        sf.write(f"bigvgan_test{i}.flac", wav, samplerate=32_000, format="FLAC")
diff --git a/orator/src/orator/models/s3gen/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index 5d342fa7f91de42eb90c2f718e96aee92e8a508b..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/__pycache__/const.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/const.cpython-311.pyc
deleted file mode 100644
index e48f54f1c8576d6a38cc373a299e3db210217574..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/const.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/__pycache__/decoder.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/decoder.cpython-311.pyc
deleted file mode 100644
index ec011f3ab10de9b0af8106bdc40f7d899bdb0ea2..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/decoder.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc
deleted file mode 100644
index 7d48cc3d1d19db9d44c7181067ab5603ec06554d..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/__pycache__/flow.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/flow.cpython-311.pyc
deleted file mode 100644
index 87974e30b64cf53f11a44dbfd7e98b9e9aedfb92..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/flow.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/__pycache__/flow_matching.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/flow_matching.cpython-311.pyc
deleted file mode 100644
index 595dfad5532ed2b6585ed8a7c0a63ab3de713f74..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/flow_matching.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/__pycache__/hifigan.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/hifigan.cpython-311.pyc
deleted file mode 100644
index 2efcda58d717d84fa19895f595c60569bd871aae..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/hifigan.cpython-311.pyc and /dev/null differ
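For context on the `set_weight_norm` convention in the deleted bigvgan.py above: applying weight norm replaces a module's `weight` entry with `weight_g`/`weight_v` in the state dict, which is why training and inference checkpoints must agree on the convention. A minimal demonstration using the same `torch.nn.utils` API the file imports:

```python
# Weight norm changes a module's state-dict keys, as described above.
import torch.nn as nn
from torch.nn.utils import weight_norm, remove_weight_norm

conv = nn.Conv1d(80, 80, 7, padding=3)
print(sorted(conv.state_dict()))  # ['bias', 'weight']
conv = weight_norm(conv)
print(sorted(conv.state_dict()))  # ['bias', 'weight_g', 'weight_v']
remove_weight_norm(conv)          # folds g * v / ||v|| back into a plain weight
print(sorted(conv.state_dict()))  # ['bias', 'weight']
```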
diff --git a/orator/src/orator/models/s3gen/__pycache__/s3gen.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/s3gen.cpython-311.pyc
deleted file mode 100644
index a9c95f44491c7e486960719e2ca6e1bc81e4896d..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/s3gen.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/__pycache__/xvector.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/xvector.cpython-311.pyc
deleted file mode 100644
index 75038ede7bcb6e855476701b9b49005babe03412..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/xvector.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc b/orator/src/orator/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc
deleted file mode 100644
index a4720cffdda43eddaa412a82e32a28b2c4da0fd9..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc b/orator/src/orator/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc
deleted file mode 100644
index cc8c38ede378987f616ba398ba51b6856825de33..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc b/orator/src/orator/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc
deleted file mode 100644
index cd888d39388ea076c44e8209dba94d354d760d97..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index fb5c4d5f281b7e7a94b55d890414723c172333cd..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc
deleted file mode 100644
index b8388f1425fc6faddad43b710f0c58d5c374f58a..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc
deleted file mode 100644
index cfcd257b733783a40035b0c8fbe62d0df2409c20..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc
deleted file mode 100644
index d7cbc1355320d0b1cdcbb187e80b7b78d9802453..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc
deleted file mode 100644
index 619023d20d5831a83d777d768eb5876bebfff72b..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc
deleted file mode 100644
index 051a22fa4053a02ce51f1e6768a97acb3a8bd6f1..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc
deleted file mode 100644
index ad9e6ed4c9925c561c6c1f7026478bffa7ab2ab2..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc
deleted file mode 100644
index 44b67d483ae3c70b9b144f5030b68877eb075393..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc
deleted file mode 100644
index fcf1cec2735b03a6b2442bcf57003ac624ea552f..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc b/orator/src/orator/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc
deleted file mode 100644
index 4b8cc4ebe7e8779f376df268307fc0fa20d4fb20..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/utils/__pycache__/mask.cpython-311.pyc b/orator/src/orator/models/s3gen/utils/__pycache__/mask.cpython-311.pyc
deleted file mode 100644
index d3bd7c9f20a0c4b75cff9d1ccc723c2839045706..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/utils/__pycache__/mask.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/utils/__pycache__/mel.cpython-311.pyc b/orator/src/orator/models/s3gen/utils/__pycache__/mel.cpython-311.pyc
deleted file mode 100644
index 42e63bec8db32bb9a5349b24e752bbbfb9c288f9..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/utils/__pycache__/mel.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index 21105ed2e9931207e586ab4020974f13613385f9..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc b/orator/src/orator/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc
deleted file mode 100644
index ec1a2cb71573c30c0b17137a662f4d22009f804d..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/models/t3/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index 4955aa017c54594b34bd714a7ab635edc6c7f0a3..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/__pycache__/llama_configs.cpython-311.pyc b/orator/src/orator/models/t3/__pycache__/llama_configs.cpython-311.pyc
deleted file mode 100644
index c2e37f3930829ce356a9e01fd75a51a856624f0f..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/__pycache__/llama_configs.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/__pycache__/t3.cpython-311.pyc b/orator/src/orator/models/t3/__pycache__/t3.cpython-311.pyc
deleted file mode 100644
index d70491dab33ea926ac13d89f18800ca491b1fa0e..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/__pycache__/t3.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc b/orator/src/orator/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc
deleted file mode 100644
index e72820205c9dc263715ccbcdef1321f7644b7d32..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc b/orator/src/orator/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc
deleted file mode 100644
index 4b99ee1fb64a8d54d165437d566e7cf64223e3cb..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc b/orator/src/orator/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc
deleted file mode 100644
index 31256b321e52e439fc42a8d4bcccad9a76eeb7c6..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/modules/__pycache__/perceiver.cpython-311.pyc b/orator/src/orator/models/t3/modules/__pycache__/perceiver.cpython-311.pyc
deleted file mode 100644
index 935c51d564eab2bf7d6870592d7b4020a1d8f9d5..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/modules/__pycache__/perceiver.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/modules/__pycache__/t3_config.cpython-311.pyc b/orator/src/orator/models/t3/modules/__pycache__/t3_config.cpython-311.pyc
deleted file mode 100644
index 31beca9bd0a70041f29c21a2145f09cb06426211..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/modules/__pycache__/t3_config.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/tokenizers/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/models/tokenizers/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index fad1839ea6e3aba23d1aece28f34443d95f441c8..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/tokenizers/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc b/orator/src/orator/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc
deleted file mode 100644
index 22fff1b2973015e0489db75db537633e880c2c98..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/voice_encoder/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/models/voice_encoder/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index f3a8c6b767936a3466e9ede2182d17a5e87de5ff..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/voice_encoder/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc b/orator/src/orator/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc
deleted file mode 100644
index f7e8414b8ab2656a531547f37a18d85c88628fc1..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/transforms/__pycache__/spectrogram.cpython-311.pyc b/orator/src/orator/transforms/__pycache__/spectrogram.cpython-311.pyc
deleted file mode 100644
index b5ec09740da2846bda8787e4ae5c8d64d4d9989a..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/transforms/__pycache__/spectrogram.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/transforms/__pycache__/syn_transforms.cpython-311.pyc b/orator/src/orator/transforms/__pycache__/syn_transforms.cpython-311.pyc
deleted file mode 100644
index 2f220a91e3eec386e62a21db3d03ca6648cb527f..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/transforms/__pycache__/syn_transforms.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/transforms/spectrogram.py b/orator/src/orator/transforms/spectrogram.py
deleted file mode 100644
index 69147fc8c591c9364ff829a157af0ea3fcbd5770..0000000000000000000000000000000000000000
--- a/orator/src/orator/transforms/spectrogram.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from functools import lru_cache
-
-from scipy import signal
-import numpy as np
-import librosa
-
-
-@lru_cache()
-def mel_basis(hp):
-    assert hp.fmax <= hp.sample_rate // 2
-    return librosa.filters.mel(
-        sr=hp.sample_rate,
-        n_fft=hp.n_fft,
-        n_mels=hp.num_mels,
-        fmin=hp.fmin,
-        fmax=hp.fmax)  # -> (nmel, nfreq)
-
-
-def preemphasis(wav, hp):
-    assert hp.preemphasis != 0
-    wav = signal.lfilter([1, -hp.preemphasis], [1], wav)
-    wav = np.clip(wav, -1, 1)
-    return wav
-
-
-def melspectrogram(wav, hp, pad=True):
-    # Run through pre-emphasis
-    if hp.preemphasis > 0:
-        wav = preemphasis(wav, hp)
-        assert np.abs(wav).max() - 1 < 1e-07
-
-    # Do the stft
-    spec_complex = _stft(wav, hp, pad=pad)
-
-    # Get the magnitudes
-    spec_magnitudes = np.abs(spec_complex)
-
-    if hp.mel_power != 1.0:
-        spec_magnitudes **= hp.mel_power
-
-    # Get the mel and convert magnitudes->db
-    mel = np.dot(mel_basis(hp), spec_magnitudes)
-    if hp.mel_type == "db":
-        mel = _amp_to_db(mel, hp)
-
-    # Normalise the mel from db to 0,1
-    if hp.normalized_mels:
-        mel = _normalize(mel, hp).astype(np.float32)
-
-    assert not pad or mel.shape[1] == 1 + len(wav) // hp.hop_size  # Sanity check
-    return mel  # (M, T)
-
-
-def _stft(y, hp, pad=True):
-    # NOTE: after 0.8, pad mode defaults to constant, setting this to reflect for
-    # historical consistency and streaming-version consistency
-    return librosa.stft(
-        y,
-        n_fft=hp.n_fft,
-        hop_length=hp.hop_size,
-        win_length=hp.win_size,
-        center=pad,
-        pad_mode="reflect",
-    )
-
-
-def _amp_to_db(x, hp):
-    return 20 * np.log10(np.maximum(hp.stft_magnitude_min, x))
-
-
-def _db_to_amp(x):
-    return np.power(10.0, x * 0.05)
-
-
-def _normalize(s, hp, headroom_db=15):
-    min_level_db = 20 * np.log10(hp.stft_magnitude_min)
-    s = (s - min_level_db) / (-min_level_db + headroom_db)
-    return s
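For context on the deleted `melspectrogram` pipeline above: `mel_basis(hp)` is wrapped in `lru_cache`, so the hparams object must be hashable. A sketch with an illustrative, assumed `HParams` tuple (the field values below are not the project's settings), assuming the deleted module is in scope:

```python
# Hypothetical hparams for the deleted melspectrogram(); values are assumptions.
from typing import NamedTuple
import numpy as np

class HParams(NamedTuple):  # NamedTuple is hashable, as lru_cache requires
    sample_rate: int = 16000
    n_fft: int = 1024
    hop_size: int = 256
    win_size: int = 1024
    num_mels: int = 80
    fmin: int = 0
    fmax: int = 8000
    preemphasis: float = 0.97
    mel_power: float = 1.0
    mel_type: str = "db"
    stft_magnitude_min: float = 1e-5
    normalized_mels: bool = True

hp = HParams()
wav = np.random.uniform(-0.5, 0.5, 16000).astype(np.float32)
mel = melspectrogram(wav, hp)  # -> (num_mels, 1 + len(wav) // hop_size)
print(mel.shape)               # (80, 63)
```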
diff --git a/orator/src/orator/transforms/syn_transforms.py b/orator/src/orator/transforms/syn_transforms.py
deleted file mode 100644
index 13ce597ae05503ef618b2de9b6c7b833f94409cb..0000000000000000000000000000000000000000
--- a/orator/src/orator/transforms/syn_transforms.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Common transformations used by synthesizers
-import logging
-
-import numpy as np
-import torch
-
-
-logger = logging.getLogger(__name__)
-
-
-def pack(arrays, seq_len: int=None, pad_value=0):
-    """
-    Given a list of length B of array-like objects of shapes (Ti, ...), packs them in a single tensor of
-    shape (B, T, ...) by padding each individual array on the right.
-
-    :param arrays: a list of array-like objects of matching shapes except for the first axis.
-    :param seq_len: the value of T. It must be the maximum of the lengths Ti of the arrays at
-    minimum. Will default to that value if None.
-    :param pad_value: the value to pad the arrays with.
-    :return: a (B, T, ...) tensor
-    """
-    if seq_len is None:
-        seq_len = max(len(array) for array in arrays)
-    else:
-        assert seq_len >= max(len(array) for array in arrays)
-
-    # Convert lists to np.array
-    if isinstance(arrays[0], list):
-        arrays = [np.array(array) for array in arrays]
-
-    # Convert to tensor and handle device
-    device = None
-    if isinstance(arrays[0], torch.Tensor):
-        tensors = arrays
-        device = tensors[0].device
-    else:
-        tensors = [torch.as_tensor(array) for array in arrays]
-
-    # Fill the packed tensor with the array data
-    packed_shape = (len(tensors), seq_len, *tensors[0].shape[1:])
-    packed_tensor = torch.full(packed_shape, pad_value, dtype=tensors[0].dtype, device=device)
-
-    for i, tensor in enumerate(tensors):
-        packed_tensor[i, :tensor.size(0)] = tensor
-
-    return packed_tensor
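For context on the deleted `pack` helper above: it right-pads a ragged batch of sequences into one `(B, T, ...)` tensor. A usage sketch, assuming the deleted function is in scope:

```python
# Usage sketch for the deleted pack() from syn_transforms.py.
import torch

batch = [torch.tensor([1, 2, 3]), torch.tensor([4, 5]), torch.tensor([6])]
packed = pack(batch, pad_value=0)
print(packed)
# tensor([[1, 2, 3],
#         [4, 5, 0],
#         [6, 0, 0]])
```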
diff --git a/orator/src/orator/transforms/syn_transforms.py b/orator/src/orator/transforms/syn_transforms.py
deleted file mode 100644
index 13ce597ae05503ef618b2de9b6c7b833f94409cb..0000000000000000000000000000000000000000
--- a/orator/src/orator/transforms/syn_transforms.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Common transformations used by synthesizers
-import logging
-
-import numpy as np
-import torch
-
-
-logger = logging.getLogger(__name__)
-
-
-def pack(arrays, seq_len: int=None, pad_value=0):
-    """
-    Given a list of length B of array-like objects of shapes (Ti, ...), packs them in a single tensor of
-    shape (B, T, ...) by padding each individual array on the right.
-
-    :param arrays: a list of array-like objects of matching shapes except for the first axis.
-    :param seq_len: the value of T. It must be at least the maximum of the lengths Ti of the
-    arrays. Will default to that value if None.
-    :param pad_value: the value to pad the arrays with.
-    :return: a (B, T, ...) tensor
-    """
-    if seq_len is None:
-        seq_len = max(len(array) for array in arrays)
-    else:
-        assert seq_len >= max(len(array) for array in arrays)
-
-    # Convert lists to np.array
-    if isinstance(arrays[0], list):
-        arrays = [np.array(array) for array in arrays]
-
-    # Convert to tensor and handle device
-    device = None
-    if isinstance(arrays[0], torch.Tensor):
-        tensors = arrays
-        device = tensors[0].device
-    else:
-        tensors = [torch.as_tensor(array) for array in arrays]
-
-    # Fill the packed tensor with the array data
-    packed_shape = (len(tensors), seq_len, *tensors[0].shape[1:])
-    packed_tensor = torch.full(packed_shape, pad_value, dtype=tensors[0].dtype, device=device)
-
-    for i, tensor in enumerate(tensors):
-        packed_tensor[i, :tensor.size(0)] = tensor
-
-    return packed_tensor
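The removed pack helper is easiest to follow from its shapes; a small usage sketch with arbitrary values:

import torch

# Three sequences of lengths 3, 5 and 2 become one right-padded (B=3, T=5) batch.
a = torch.tensor([1, 2, 3])
b = torch.tensor([4, 5, 6, 7, 8])
c = torch.tensor([9, 10])

# With the deleted helper:
#   packed = pack([a, b, c], pad_value=0)
#   packed.shape  -> torch.Size([3, 5])
#   packed[2]     -> tensor([ 9, 10,  0,  0,  0])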
diff --git a/orator/src/orator/transforms/webrtc.py b/orator/src/orator/transforms/webrtc.py
deleted file mode 100644
index c3d3abf97f27ac7f5c51ea7228920a0c522ed934..0000000000000000000000000000000000000000
--- a/orator/src/orator/transforms/webrtc.py
+++ /dev/null
@@ -1,181 +0,0 @@
-from itertools import groupby
-
-import numpy as np
-import webrtcvad as _webrtcvad
-
-from transforms.vad.vad_stream import VADStream
-from transforms.wav_encoding import encode_pcm16
-
-# The sample rates the algo can operate at
-_WEBRTC_SAMPLE_RATES = np.array([8000, 16000, 32000, 48000])
-# The algo operates with window sizes of 10, 20 and 30ms
-_WEBRTC_WINDOW_SIZES_MS = (10, 20, 30)
-# Greatest common divisor and lowest common multiple of the above
-_WEBRTC_WINDOW_SIZES_MS_GCD = 10
-_WEBRTC_WINDOW_SIZES_MS_LCM = 60
-
-
-class WebRTCVADStream(VADStream):
-    def __init__(self, sample_rate: int, aggressiveness=2, dilation_ms=40, min_voiced_region_ms=125):
-        """
-        :param sample_rate: sample rate of the wavs that will be passed
-        :param aggressiveness: parameter controlling the aggressiveness of the VAD algo. Possible values are 1,
-        2 and 3. Higher means fewer regions will be detected as voiced.
-        :param dilation_ms: pass a value greater than 0 to include regions directly preceding or succeeding voiced
-        regions. Voiced regions will be expanded left and right by this value, in milliseconds.
-        N.B.: this is a best-effort parameter. When the output is requested as fast as the input is produced,
-        it's impossible to foresee an upcoming voiced region. In that case, the dilation on the left of that region
-        may not appear.
-        :param min_voiced_region_ms: to exclude regions detected as speech that are considered too short, pass a value
-        greater than 0. Voiced regions shorter than this value (prior to dilation) will be set as unvoiced.
-        N.B.: this is also a best-effort parameter. A region may be too short, but if VAD has not finished
-        being computed at the end of that region, it won't be removed, as it could still turn out to be long enough.
-        """
-        webrtc_sr = int(_WEBRTC_SAMPLE_RATES[np.argmin(np.abs(_WEBRTC_SAMPLE_RATES - sample_rate))])
-        lcm_win_size = (_WEBRTC_WINDOW_SIZES_MS_LCM * webrtc_sr) // 1000
-        self._gcd_win_size = (_WEBRTC_WINDOW_SIZES_MS_GCD * webrtc_sr) // 1000
-
-        # webrtcvad.Vad is stateful: predictions would be impacted if a new instance were created halfway
-        # through an audio. This is why we create them all up front.
-        self._detectors = {win_size: _webrtcvad.Vad(mode=aggressiveness) for win_size in _WEBRTC_WINDOW_SIZES_MS}
-
-        super().__init__(sample_rate, webrtc_sr, lcm_win_size, dilation_ms, min_voiced_region_ms)
-
-    def _wav_vad(self, wav: np.ndarray) -> np.ndarray:
-        pcm = encode_pcm16(wav)
-
-        # Perform the VAD by ensembling the different window sizes
-        win_vad = np.zeros(len(wav) // self._gcd_win_size, dtype=np.int32)
-        for sub_win_size_ms in _WEBRTC_WINDOW_SIZES_MS:
-            detector = self._detectors[sub_win_size_ms]
-            sub_win_size_pcm = (2 * sub_win_size_ms * self.vad_sr) // 1000
-            factor = sub_win_size_ms // _WEBRTC_WINDOW_SIZES_MS_GCD
-
-            for i, win_start in enumerate(range(0, len(pcm), sub_win_size_pcm)):
-                win_i_vad = detector.is_speech(pcm[win_start:win_start + sub_win_size_pcm], self.vad_sr)
-                win_vad[i * factor:(i + 1) * factor] += win_i_vad
-        win_vad = win_vad > (len(_WEBRTC_WINDOW_SIZES_MS) // 2)
-
-        # Convert the output to regions
-        regions = np.diff(win_vad, prepend=0, append=0).nonzero()[0].reshape(-1, 2)
-        regions = regions * (len(wav) // len(win_vad))
-
-        return regions
-
-
-def webrtc_vad(wav: np.ndarray, source_sr: int, aggressiveness=2, dilation_ms=40, min_voiced_region_ms=125):
-    """
-    Performs Voice Activity Detection on a single audio. See WebRTCVADStream for more details.
-
-    :return vad: a boolean numpy array of length equal to len(wav)
-    """
-    vad_stream = WebRTCVADStream(source_sr, aggressiveness, dilation_ms, min_voiced_region_ms)
-    vad_stream.feed(wav, close_input=True)
-    if vad_stream.can_step():
-        return vad_stream.step(len(wav))
-    else:
-        return np.zeros_like(wav, dtype=bool)
-
-
-def split_on_silence(
-    wav, sr, vad, thresholds_ms=[500, 300, 200, 100, 50], min_dur_s=1.5, max_split_dur_s=20, max_dur_s=30,
-):
-    """
-    Split a wav into chunks, splitting on silence when the length of the silence exceeds a threshold.
-
-    Args:
-        wav: 1d-array
-        sr: sample rate
-        vad: boolean voice activity array aligned with wav, e.g. the output of webrtc_vad
-        thresholds_ms: min length of silence to split on; clips are recursively split using values from this list until
-            the resulting chunks are all within the min / max duration bounds
-        min_dur_s: minimum duration of a chunk in seconds
-        max_split_dur_s: segments above this length continue to be split down with smaller thresholds
-        max_dur_s: maximum duration of a chunk in seconds
-    """
-    assert isinstance(wav, np.ndarray) and wav.ndim == 1
-
-    # unpack silence length thresholds
-    thresh_ms, next_thresh_ms = (thresholds_ms + [0, 0])[:2]
-    if thresh_ms <= 0:
-        return [wav]
-
-    # convert thresholds to samples
-    max_split_dur_s = min(max_split_dur_s, max_dur_s)
-    thresh = int(thresh_ms * sr / 1000)
-    min_len = int(min_dur_s * sr)
-    max_split_len = int(max_split_dur_s * sr)
-    max_len = int(max_dur_s * sr)
-    wav_len = len(wav)
-
-    # detect regions of silence using groupby
-    sil_regions = []
-    for is_voiced, idxs in groupby(range(wav_len), key=vad.__getitem__):
-        idxs = list(idxs)
-        i = idxs[0]
-        j = idxs[-1]
-        j += 1
-        n = j - i
-        mid = (i + j) // 2
-
-        # record split point if this is a long silence region
-        if (not is_voiced) and n > thresh:
-            sil_regions += [(
-                min(mid, i + (0 if i == 0 else thresh // 2)),
-                max(mid, j - (0 if j == wav_len else thresh // 2)),
-            )]
-
-    # invert silence regions to get voiced regions
-    ptr = 0
-    voiced_regions = []
-    for i, j in sil_regions:
-        if i > 0:
-            voiced_regions += [(ptr, i)]
-        ptr = j
-    if ptr < wav_len:
-        voiced_regions += [(ptr, wav_len)]
-
-    # split the waveform into chunks using the detected content bounds and silence split points
-    chunks = []
-    for i, j in voiced_regions:
-        chunk = wav[i:j]
-        chunklen = len(chunk)
-
-        # chunk is within bounds
-        if chunklen < max_split_len:
-            chunks += [chunk]
-
-        # chunk is too long, attempt to split it recursively using the threshold list
-        elif next_thresh_ms > 0:
-            chunks += split_on_silence(
-                chunk, sr, vad[i:j], thresholds_ms=thresholds_ms[1:],
-                min_dur_s=min_dur_s, max_dur_s=max_dur_s,
-            )
-
-        # NOTE: keeping chunks longer than `max_len` here, filtering is done below
-        else:
-            chunks += [chunk]
-
-    # merge short chunks
-    merged_chunks = []
-    for chunk in chunks:
-        chunklen = len(chunk)
-
-        # chunk is too short, add it to the previous chunk if possible
-        if chunklen == 0:
-            continue
-
-        elif chunklen < min_len:
-            # NOTE: ignore the edge case where this would make the previous chunk too long, by just dropping this chunk
-            if len(merged_chunks) > 0 and len(merged_chunks[-1]) + chunklen < max_len:
-                merged_chunks[-1] = np.concatenate([merged_chunks[-1], chunk])
-
-        elif chunklen < max_len:
-            merged_chunks += [chunk]
-
-        else:
-            # TODO: keep long chunks as well? one benefit is to keep the adjacent ordering of chunks, for
-            # building paragraph-level datasets. However, this should rarely drop any clips, so it's probably okay.
-            # merged_chunks += [chunk]
-            pass
-    chunks = merged_chunks
-
-    return chunks
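A usage sketch for the removed webrtc_vad wrapper; the file path is a placeholder and loading via librosa is an assumption (any mono float waveform in [-1, 1] works):

import numpy as np
import librosa  # assumption: used only to obtain a mono float waveform

wav, sr = librosa.load("speech.wav", sr=None, mono=True)  # placeholder path

# With the deleted module: a per-sample boolean voiced mask. Internally the
# stream snaps sr to the nearest WebRTC-supported rate (8/16/32/48 kHz) and
# majority-votes the 10/20/30 ms detectors.
#   vad = webrtc_vad(wav, sr, aggressiveness=2)
#   assert vad.shape == wav.shape and vad.dtype == bool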
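And a sketch of how split_on_silence composes with it to segment a long recording; the output file names are hypothetical, and the duration bounds quoted are simply the defaults from the signature above:

import soundfile as sf  # assumption: any wav writer would do

# With the deleted helpers: split on silences of decreasing length (500 ms
# down to 50 ms) until every kept chunk is between min_dur_s and max_dur_s.
#   vad = webrtc_vad(wav, sr)
#   chunks = split_on_silence(wav, sr, vad)
#   for n, chunk in enumerate(chunks):
#       sf.write(f"chunk_{n:03d}.wav", chunk, sr)  # hypothetical output names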