diff --git a/chatterbox/src/chatterbox/__init__.py b/chatterbox/src/chatterbox/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ad4d015e2d30bb7e1f053f5d692f8bdf67014ed
--- /dev/null
+++ b/chatterbox/src/chatterbox/__init__.py
@@ -0,0 +1 @@
+from .tts import ChatterboxTTS
\ No newline at end of file
diff --git a/orator/src/orator/models/s3gen/__init__.py b/chatterbox/src/chatterbox/models/s3gen/__init__.py
similarity index 100%
rename from orator/src/orator/models/s3gen/__init__.py
rename to chatterbox/src/chatterbox/models/s3gen/__init__.py
diff --git a/orator/src/orator/models/s3gen/const.py b/chatterbox/src/chatterbox/models/s3gen/const.py
similarity index 100%
rename from orator/src/orator/models/s3gen/const.py
rename to chatterbox/src/chatterbox/models/s3gen/const.py
diff --git a/orator/src/orator/models/s3gen/decoder.py b/chatterbox/src/chatterbox/models/s3gen/decoder.py
similarity index 100%
rename from orator/src/orator/models/s3gen/decoder.py
rename to chatterbox/src/chatterbox/models/s3gen/decoder.py
diff --git a/orator/src/orator/models/s3gen/f0_predictor.py b/chatterbox/src/chatterbox/models/s3gen/f0_predictor.py
similarity index 100%
rename from orator/src/orator/models/s3gen/f0_predictor.py
rename to chatterbox/src/chatterbox/models/s3gen/f0_predictor.py
diff --git a/orator/src/orator/models/s3gen/flow.py b/chatterbox/src/chatterbox/models/s3gen/flow.py
similarity index 100%
rename from orator/src/orator/models/s3gen/flow.py
rename to chatterbox/src/chatterbox/models/s3gen/flow.py
diff --git a/orator/src/orator/models/s3gen/flow_matching.py b/chatterbox/src/chatterbox/models/s3gen/flow_matching.py
similarity index 100%
rename from orator/src/orator/models/s3gen/flow_matching.py
rename to chatterbox/src/chatterbox/models/s3gen/flow_matching.py
diff --git a/orator/src/orator/models/s3gen/hifigan.py b/chatterbox/src/chatterbox/models/s3gen/hifigan.py
similarity index 100%
rename from orator/src/orator/models/s3gen/hifigan.py
rename to chatterbox/src/chatterbox/models/s3gen/hifigan.py
diff --git a/orator/src/orator/models/s3gen/matcha/decoder.py b/chatterbox/src/chatterbox/models/s3gen/matcha/decoder.py
similarity index 100%
rename from orator/src/orator/models/s3gen/matcha/decoder.py
rename to chatterbox/src/chatterbox/models/s3gen/matcha/decoder.py
diff --git a/orator/src/orator/models/s3gen/matcha/flow_matching.py b/chatterbox/src/chatterbox/models/s3gen/matcha/flow_matching.py
similarity index 100%
rename from orator/src/orator/models/s3gen/matcha/flow_matching.py
rename to chatterbox/src/chatterbox/models/s3gen/matcha/flow_matching.py
diff --git a/orator/src/orator/models/s3gen/matcha/text_encoder.py b/chatterbox/src/chatterbox/models/s3gen/matcha/text_encoder.py
similarity index 100%
rename from orator/src/orator/models/s3gen/matcha/text_encoder.py
rename to chatterbox/src/chatterbox/models/s3gen/matcha/text_encoder.py
diff --git a/orator/src/orator/models/s3gen/matcha/transformer.py b/chatterbox/src/chatterbox/models/s3gen/matcha/transformer.py
similarity index 100%
rename from orator/src/orator/models/s3gen/matcha/transformer.py
rename to chatterbox/src/chatterbox/models/s3gen/matcha/transformer.py
diff --git a/orator/src/orator/models/s3gen/s3gen.py b/chatterbox/src/chatterbox/models/s3gen/s3gen.py
similarity index 100%
rename from orator/src/orator/models/s3gen/s3gen.py
rename to chatterbox/src/chatterbox/models/s3gen/s3gen.py
diff --git a/orator/src/orator/models/s3gen/transformer/__init__.py b/chatterbox/src/chatterbox/models/s3gen/transformer/__init__.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/__init__.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/__init__.py
diff --git a/orator/src/orator/models/s3gen/transformer/activation.py b/chatterbox/src/chatterbox/models/s3gen/transformer/activation.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/activation.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/activation.py
diff --git a/orator/src/orator/models/s3gen/transformer/attention.py b/chatterbox/src/chatterbox/models/s3gen/transformer/attention.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/attention.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/attention.py
diff --git a/orator/src/orator/models/s3gen/transformer/convolution.py b/chatterbox/src/chatterbox/models/s3gen/transformer/convolution.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/convolution.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/convolution.py
diff --git a/orator/src/orator/models/s3gen/transformer/embedding.py b/chatterbox/src/chatterbox/models/s3gen/transformer/embedding.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/embedding.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/embedding.py
diff --git a/orator/src/orator/models/s3gen/transformer/encoder_layer.py b/chatterbox/src/chatterbox/models/s3gen/transformer/encoder_layer.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/encoder_layer.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/encoder_layer.py
diff --git a/orator/src/orator/models/s3gen/transformer/positionwise_feed_forward.py b/chatterbox/src/chatterbox/models/s3gen/transformer/positionwise_feed_forward.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/positionwise_feed_forward.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/positionwise_feed_forward.py
diff --git a/orator/src/orator/models/s3gen/transformer/subsampling.py b/chatterbox/src/chatterbox/models/s3gen/transformer/subsampling.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/subsampling.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/subsampling.py
diff --git a/orator/src/orator/models/s3gen/transformer/upsample_encoder.py b/chatterbox/src/chatterbox/models/s3gen/transformer/upsample_encoder.py
similarity index 100%
rename from orator/src/orator/models/s3gen/transformer/upsample_encoder.py
rename to chatterbox/src/chatterbox/models/s3gen/transformer/upsample_encoder.py
diff --git a/orator/src/orator/models/s3gen/utils/class_utils.py b/chatterbox/src/chatterbox/models/s3gen/utils/class_utils.py
similarity index 100%
rename from orator/src/orator/models/s3gen/utils/class_utils.py
rename to chatterbox/src/chatterbox/models/s3gen/utils/class_utils.py
diff --git a/orator/src/orator/models/s3gen/utils/mask.py b/chatterbox/src/chatterbox/models/s3gen/utils/mask.py
similarity index 100%
rename from orator/src/orator/models/s3gen/utils/mask.py
rename to chatterbox/src/chatterbox/models/s3gen/utils/mask.py
diff --git a/orator/src/orator/models/s3gen/utils/mel.py b/chatterbox/src/chatterbox/models/s3gen/utils/mel.py
similarity index 100%
rename from orator/src/orator/models/s3gen/utils/mel.py
rename to chatterbox/src/chatterbox/models/s3gen/utils/mel.py
diff --git a/orator/src/orator/models/s3gen/xvector.py b/chatterbox/src/chatterbox/models/s3gen/xvector.py
similarity index 100%
rename from orator/src/orator/models/s3gen/xvector.py
rename to chatterbox/src/chatterbox/models/s3gen/xvector.py
diff --git a/orator/src/orator/models/s3tokenizer/__init__.py b/chatterbox/src/chatterbox/models/s3tokenizer/__init__.py
similarity index 100%
rename from orator/src/orator/models/s3tokenizer/__init__.py
rename to chatterbox/src/chatterbox/models/s3tokenizer/__init__.py
diff --git a/orator/src/orator/models/s3tokenizer/s3tokenizer.py b/chatterbox/src/chatterbox/models/s3tokenizer/s3tokenizer.py
similarity index 100%
rename from orator/src/orator/models/s3tokenizer/s3tokenizer.py
rename to chatterbox/src/chatterbox/models/s3tokenizer/s3tokenizer.py
diff --git a/orator/src/orator/models/t3/__init__.py b/chatterbox/src/chatterbox/models/t3/__init__.py
similarity index 100%
rename from orator/src/orator/models/t3/__init__.py
rename to chatterbox/src/chatterbox/models/t3/__init__.py
diff --git a/orator/src/orator/models/t3/inference/alignment_stream_analyzer.py b/chatterbox/src/chatterbox/models/t3/inference/alignment_stream_analyzer.py
similarity index 100%
rename from orator/src/orator/models/t3/inference/alignment_stream_analyzer.py
rename to chatterbox/src/chatterbox/models/t3/inference/alignment_stream_analyzer.py
diff --git a/orator/src/orator/models/t3/inference/t3_hf_backend.py b/chatterbox/src/chatterbox/models/t3/inference/t3_hf_backend.py
similarity index 100%
rename from orator/src/orator/models/t3/inference/t3_hf_backend.py
rename to chatterbox/src/chatterbox/models/t3/inference/t3_hf_backend.py
diff --git a/orator/src/orator/models/t3/llama_configs.py b/chatterbox/src/chatterbox/models/t3/llama_configs.py
similarity index 100%
rename from orator/src/orator/models/t3/llama_configs.py
rename to chatterbox/src/chatterbox/models/t3/llama_configs.py
diff --git a/orator/src/orator/models/t3/modules/cond_enc.py b/chatterbox/src/chatterbox/models/t3/modules/cond_enc.py
similarity index 100%
rename from orator/src/orator/models/t3/modules/cond_enc.py
rename to chatterbox/src/chatterbox/models/t3/modules/cond_enc.py
diff --git a/orator/src/orator/models/t3/modules/learned_pos_emb.py b/chatterbox/src/chatterbox/models/t3/modules/learned_pos_emb.py
similarity index 100%
rename from orator/src/orator/models/t3/modules/learned_pos_emb.py
rename to chatterbox/src/chatterbox/models/t3/modules/learned_pos_emb.py
diff --git a/orator/src/orator/models/t3/modules/perceiver.py b/chatterbox/src/chatterbox/models/t3/modules/perceiver.py
similarity index 100%
rename from orator/src/orator/models/t3/modules/perceiver.py
rename to chatterbox/src/chatterbox/models/t3/modules/perceiver.py
diff --git a/orator/src/orator/models/t3/modules/t3_config.py b/chatterbox/src/chatterbox/models/t3/modules/t3_config.py
similarity index 100%
rename from orator/src/orator/models/t3/modules/t3_config.py
rename to chatterbox/src/chatterbox/models/t3/modules/t3_config.py
diff --git a/orator/src/orator/models/t3/t3.py b/chatterbox/src/chatterbox/models/t3/t3.py
similarity index 100%
rename from orator/src/orator/models/t3/t3.py
rename to chatterbox/src/chatterbox/models/t3/t3.py
diff --git a/orator/src/orator/models/tokenizers/__init__.py b/chatterbox/src/chatterbox/models/tokenizers/__init__.py
similarity index 100%
rename from orator/src/orator/models/tokenizers/__init__.py
rename to chatterbox/src/chatterbox/models/tokenizers/__init__.py
diff --git a/orator/src/orator/models/tokenizers/tokenizer.py b/chatterbox/src/chatterbox/models/tokenizers/tokenizer.py
similarity index 100%
rename from orator/src/orator/models/tokenizers/tokenizer.py
rename to chatterbox/src/chatterbox/models/tokenizers/tokenizer.py
diff --git a/orator/src/orator/models/voice_encoder/__init__.py b/chatterbox/src/chatterbox/models/voice_encoder/__init__.py
similarity index 100%
rename from orator/src/orator/models/voice_encoder/__init__.py
rename to chatterbox/src/chatterbox/models/voice_encoder/__init__.py
diff --git a/orator/src/orator/models/voice_encoder/config.py b/chatterbox/src/chatterbox/models/voice_encoder/config.py
similarity index 100%
rename from orator/src/orator/models/voice_encoder/config.py
rename to chatterbox/src/chatterbox/models/voice_encoder/config.py
diff --git a/orator/src/orator/models/voice_encoder/melspec.py b/chatterbox/src/chatterbox/models/voice_encoder/melspec.py
similarity index 100%
rename from orator/src/orator/models/voice_encoder/melspec.py
rename to chatterbox/src/chatterbox/models/voice_encoder/melspec.py
diff --git a/orator/src/orator/models/voice_encoder/voice_encoder.py b/chatterbox/src/chatterbox/models/voice_encoder/voice_encoder.py
similarity index 100%
rename from orator/src/orator/models/voice_encoder/voice_encoder.py
rename to chatterbox/src/chatterbox/models/voice_encoder/voice_encoder.py
diff --git a/orator/src/orator/tts.py b/chatterbox/src/chatterbox/tts.py
similarity index 97%
rename from orator/src/orator/tts.py
rename to chatterbox/src/chatterbox/tts.py
index 5081c6075aa6c58003fd3116e85e35304dbbce40..8d2ceb64b2c7a033f7933d2272b01bd6e6652b1c 100644
--- a/orator/src/orator/tts.py
+++ b/chatterbox/src/chatterbox/tts.py
@@ -14,7 +14,7 @@
 from .models.voice_encoder import VoiceEncoder
 from .models.t3.modules.cond_enc import T3Cond
 
-REPO_ID = "ResembleAI/Orator"
+REPO_ID = "ResembleAI/chatterbox"
 
 
 def change_pace(speech_tokens: torch.Tensor, pace: float):
@@ -68,7 +68,7 @@ class Conditionals:
         return cls(T3Cond(**kwargs['t3']), kwargs['gen'])
 
 
-class OratorTTS:
+class ChatterboxTTS:
     ENC_COND_LEN = 6 * S3_SR
     DEC_COND_LEN = 10 * S3GEN_SR
 
@@ -90,7 +90,7 @@
         self.conds = conds
 
     @classmethod
-    def from_local(cls, ckpt_dir, device) -> 'OratorTTS':
+    def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
         ckpt_dir = Path(ckpt_dir)
 
         ve = VoiceEncoder()
@@ -122,7 +122,7 @@
         return cls(t3, s3gen, ve, tokenizer, device, conds=conds)
 
     @classmethod
-    def from_pretrained(cls, device) -> 'OratorTTS':
+    def from_pretrained(cls, device) -> 'ChatterboxTTS':
         for fpath in ["ve.pt", "t3.pt", "s3gen.pt", "tokenizer.json", "conds.pt"]:
             local_path = hf_hub_download(repo_id=REPO_ID, filename=fpath)
 
diff --git a/orator/src/orator/vc.py b/chatterbox/src/chatterbox/vc.py
similarity index 93%
rename from orator/src/orator/vc.py
rename to chatterbox/src/chatterbox/vc.py
index df140b3bcd5b383b0d9a4417c764f9a31ea379d8..ea5ec21e25f671188e439f33307e5890421ff4af 100644
--- a/orator/src/orator/vc.py
+++ b/chatterbox/src/chatterbox/vc.py
@@ -8,10 +8,10 @@
 from .models.s3tokenizer import S3_SR
 from .models.s3gen import S3GEN_SR, S3Gen
 
-REPO_ID = "ResembleAI/Orator"
+REPO_ID = "ResembleAI/chatterbox"
 
 
-class OratorVC:
+class ChatterboxVC:
     ENC_COND_LEN = 6 * S3_SR
     DEC_COND_LEN = 10 * S3GEN_SR
@@ -33,7 +33,7 @@
     }
 
     @classmethod
-    def from_local(cls, ckpt_dir, device) -> 'OratorVC':
+    def from_local(cls, ckpt_dir, device) -> 'ChatterboxVC':
         ckpt_dir = Path(ckpt_dir)
         ref_dict = None
         if (builtin_voice := ckpt_dir / "conds.pt").exists():
@@ -49,7 +49,7 @@
         return cls(s3gen, device, ref_dict=ref_dict)
 
     @classmethod
-    def from_pretrained(cls, device) -> 'OratorVC':
+    def from_pretrained(cls, device) -> 'ChatterboxVC':
         for fpath in ["s3gen.pt", "conds.pt"]:
             local_path = hf_hub_download(repo_id=REPO_ID, filename=fpath)
 
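Note: after this rename, the public entry point is `ChatterboxTTS`, loaded from the new `ResembleAI/chatterbox` repo id. A minimal usage sketch follows; the `generate()` call, the `sr` attribute, and the torchaudio save step are assumptions for illustration, since only the constructors appear in this diff.

```python
# Minimal sketch, assuming the renamed package is installed and exposes
# ChatterboxTTS as in chatterbox/__init__.py above. generate() and model.sr
# are assumed -- only from_pretrained()/from_local() are shown in this diff.
import torchaudio

from chatterbox import ChatterboxTTS

model = ChatterboxTTS.from_pretrained(device="cuda")
# model = ChatterboxTTS.from_local("path/to/ckpt_dir", "cuda")  # local checkpoint dir
wav = model.generate("Hello from the renamed project.")  # assumed synthesis entry point
torchaudio.save("out.wav", wav, model.sr)                # assumed sample-rate attribute
```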
"conds.pt").exists(): @@ -49,7 +49,7 @@ class OratorVC: return cls(s3gen, device, ref_dict=ref_dict) @classmethod - def from_pretrained(cls, device) -> 'OratorVC': + def from_pretrained(cls, device) -> 'ChatterboxVC': for fpath in ["s3gen.pt", "conds.pt"]: local_path = hf_hub_download(repo_id=REPO_ID, filename=fpath) diff --git a/orator/src/orator.egg-info/PKG-INFO b/orator/src/orator.egg-info/PKG-INFO deleted file mode 100644 index 8a2974074c3fca0ccd532f1159641db692426a02..0000000000000000000000000000000000000000 --- a/orator/src/orator.egg-info/PKG-INFO +++ /dev/null @@ -1,17 +0,0 @@ -Metadata-Version: 2.4 -Name: orator -Version: 0.1 -Description-Content-Type: text/markdown -Requires-Dist: numpy==1.26.0 -Requires-Dist: resampy==0.4.3 -Requires-Dist: librosa==0.10.0 -Requires-Dist: s3tokenizer -Requires-Dist: torch==2.6.0 -Requires-Dist: torchaudio==2.6.0 -Requires-Dist: transformers==4.46.3 -Requires-Dist: diffusers==0.29.0 -Requires-Dist: omegaconf==2.3.0 -Requires-Dist: conformer==0.3.2 - -# orator -Open source TTS model diff --git a/orator/src/orator.egg-info/SOURCES.txt b/orator/src/orator.egg-info/SOURCES.txt deleted file mode 100644 index cada9c6ef8507252610ee20e70be2314eeb74cca..0000000000000000000000000000000000000000 --- a/orator/src/orator.egg-info/SOURCES.txt +++ /dev/null @@ -1,52 +0,0 @@ -README.md -pyproject.toml -src/orator/__init__.py -src/orator/model_checkpoints.py -src/orator/tts.py -src/orator.egg-info/PKG-INFO -src/orator.egg-info/SOURCES.txt -src/orator.egg-info/dependency_links.txt -src/orator.egg-info/requires.txt -src/orator.egg-info/top_level.txt -src/orator/models/s3gen/__init__.py -src/orator/models/s3gen/const.py -src/orator/models/s3gen/decoder.py -src/orator/models/s3gen/f0_predictor.py -src/orator/models/s3gen/flow.py -src/orator/models/s3gen/flow_matching.py -src/orator/models/s3gen/hifigan.py -src/orator/models/s3gen/s3gen.py -src/orator/models/s3gen/xvector.py -src/orator/models/s3gen/matcha/decoder.py -src/orator/models/s3gen/matcha/flow_matching.py -src/orator/models/s3gen/matcha/text_encoder.py -src/orator/models/s3gen/matcha/transformer.py -src/orator/models/s3gen/transformer/__init__.py -src/orator/models/s3gen/transformer/activation.py -src/orator/models/s3gen/transformer/attention.py -src/orator/models/s3gen/transformer/convolution.py -src/orator/models/s3gen/transformer/embedding.py -src/orator/models/s3gen/transformer/encoder_layer.py -src/orator/models/s3gen/transformer/positionwise_feed_forward.py -src/orator/models/s3gen/transformer/subsampling.py -src/orator/models/s3gen/transformer/upsample_encoder.py -src/orator/models/s3gen/utils/class_utils.py -src/orator/models/s3gen/utils/mask.py -src/orator/models/s3gen/utils/mel.py -src/orator/models/s3tokenizer/__init__.py -src/orator/models/s3tokenizer/s3tokenizer.py -src/orator/models/t3/__init__.py -src/orator/models/t3/llama_configs.py -src/orator/models/t3/t3.py -src/orator/models/t3/inference/t3_hf_backend.py -src/orator/models/t3/modules/cond_enc.py -src/orator/models/t3/modules/learned_pos_emb.py -src/orator/models/t3/modules/perceiver.py -src/orator/models/t3/modules/t3_config.py -src/orator/models/tokenizers/__init__.py -src/orator/models/tokenizers/tokenizer.py -src/orator/models/voice_encoder/__init__.py -src/orator/models/voice_encoder/voice_encoder.py -src/orator/transforms/spectrogram.py -src/orator/transforms/syn_transforms.py -src/orator/transforms/webrtc.py \ No newline at end of file diff --git a/orator/src/orator.egg-info/dependency_links.txt 
b/orator/src/orator.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891791fe96927ad78e64b0aad7bded08bdc..0000000000000000000000000000000000000000 --- a/orator/src/orator.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/orator/src/orator.egg-info/requires.txt b/orator/src/orator.egg-info/requires.txt deleted file mode 100644 index d5214bc4cfaf3ee370117d092878abedde29e924..0000000000000000000000000000000000000000 --- a/orator/src/orator.egg-info/requires.txt +++ /dev/null @@ -1,10 +0,0 @@ -numpy==1.26.0 -resampy==0.4.3 -librosa==0.10.0 -s3tokenizer -torch==2.6.0 -torchaudio==2.6.0 -transformers==4.46.3 -diffusers==0.29.0 -omegaconf==2.3.0 -conformer==0.3.2 diff --git a/orator/src/orator.egg-info/top_level.txt b/orator/src/orator.egg-info/top_level.txt deleted file mode 100644 index 043cdf6363c1d150134985603544ec5a33b0d53a..0000000000000000000000000000000000000000 --- a/orator/src/orator.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -orator diff --git a/orator/src/orator/__init__.py b/orator/src/orator/__init__.py deleted file mode 100644 index e2b13b095b627341de9c4a1e63ca91b180910ee3..0000000000000000000000000000000000000000 --- a/orator/src/orator/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .tts import OratorTTS \ No newline at end of file diff --git a/orator/src/orator/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 4d73364ffc4777d01681ff959635ee6123e075a6..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/__pycache__/tts.cpython-311.pyc b/orator/src/orator/__pycache__/tts.cpython-311.pyc deleted file mode 100644 index 090fe10720565fb5bc2b44a709c10af6512786bb..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/__pycache__/tts.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/model_checkpoints.py b/orator/src/orator/model_checkpoints.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/orator/src/orator/models/bigvgan/__pycache__/activations.cpython-311.pyc b/orator/src/orator/models/bigvgan/__pycache__/activations.cpython-311.pyc deleted file mode 100644 index abdfafb4abc0f4c125638ebe9d4f456039bf68fc..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/models/bigvgan/__pycache__/activations.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/models/bigvgan/__pycache__/bigvgan.cpython-311.pyc b/orator/src/orator/models/bigvgan/__pycache__/bigvgan.cpython-311.pyc deleted file mode 100644 index 9bc39b3811393b032fad3f25eb8c822ad831b6c0..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/models/bigvgan/__pycache__/bigvgan.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/models/bigvgan/activations.py b/orator/src/orator/models/bigvgan/activations.py deleted file mode 100644 index 30a3c85145eb147e61331f9dbd5d2b3650146851..0000000000000000000000000000000000000000 --- a/orator/src/orator/models/bigvgan/activations.py +++ /dev/null @@ -1,120 +0,0 @@ -# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. -# LICENSE is in incl_licenses directory. 
- -import torch -from torch import nn, sin, pow -from torch.nn import Parameter - - -class Snake(nn.Module): - ''' - Implementation of a sine-based periodic activation function - Shape: - - Input: (B, C, T) - - Output: (B, C, T), same shape as the input - Parameters: - - alpha - trainable parameter - References: - - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: - https://arxiv.org/abs/2006.08195 - Examples: - >>> a1 = snake(256) - >>> x = torch.randn(256) - >>> x = a1(x) - ''' - def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): - ''' - Initialization. - INPUT: - - in_features: shape of the input - - alpha: trainable parameter - alpha is initialized to 1 by default, higher values = higher-frequency. - alpha will be trained along with the rest of your model. - ''' - super(Snake, self).__init__() - self.in_features = in_features - - # initialize alpha - self.alpha_logscale = alpha_logscale - if self.alpha_logscale: # log scale alphas initialized to zeros - self.alpha = Parameter(torch.zeros(in_features) * alpha) - else: # linear scale alphas initialized to ones - self.alpha = Parameter(torch.ones(in_features) * alpha) - - self.alpha.requires_grad = alpha_trainable - - self.no_div_by_zero = 0.000000001 - - def forward(self, x): - ''' - Forward pass of the function. - Applies the function to the input elementwise. - Snake ∶= x + 1/a * sin^2 (xa) - ''' - alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] - if self.alpha_logscale: - alpha = torch.exp(alpha) - x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) - - return x - - -class SnakeBeta(nn.Module): - ''' - A modified Snake function which uses separate parameters for the magnitude of the periodic components - Shape: - - Input: (B, C, T) - - Output: (B, C, T), same shape as the input - Parameters: - - alpha - trainable parameter that controls frequency - - beta - trainable parameter that controls magnitude - References: - - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: - https://arxiv.org/abs/2006.08195 - Examples: - >>> a1 = snakebeta(256) - >>> x = torch.randn(256) - >>> x = a1(x) - ''' - def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): - ''' - Initialization. - INPUT: - - in_features: shape of the input - - alpha - trainable parameter that controls frequency - - beta - trainable parameter that controls magnitude - alpha is initialized to 1 by default, higher values = higher-frequency. - beta is initialized to 1 by default, higher values = higher-magnitude. - alpha will be trained along with the rest of your model. - ''' - super(SnakeBeta, self).__init__() - self.in_features = in_features - - # initialize alpha - self.alpha_logscale = alpha_logscale - if self.alpha_logscale: # log scale alphas initialized to zeros - self.alpha = Parameter(torch.zeros(in_features) * alpha) - self.beta = Parameter(torch.zeros(in_features) * alpha) - else: # linear scale alphas initialized to ones - self.alpha = Parameter(torch.ones(in_features) * alpha) - self.beta = Parameter(torch.ones(in_features) * alpha) - - self.alpha.requires_grad = alpha_trainable - self.beta.requires_grad = alpha_trainable - - self.no_div_by_zero = 0.000000001 - - def forward(self, x): - ''' - Forward pass of the function. - Applies the function to the input elementwise. 
- SnakeBeta ∶= x + 1/b * sin^2 (xa) - ''' - alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] - beta = self.beta.unsqueeze(0).unsqueeze(-1) - if self.alpha_logscale: - alpha = torch.exp(alpha) - beta = torch.exp(beta) - x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) - - return x diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/__init__.py b/orator/src/orator/models/bigvgan/alias_free_torch/__init__.py deleted file mode 100644 index 8f756ed83f87f9839e457b240f60469bc187707d..0000000000000000000000000000000000000000 --- a/orator/src/orator/models/bigvgan/alias_free_torch/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 -# LICENSE is in incl_licenses directory. - -from .filter import * -from .resample import * -from .act import * diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index fdf57d13c2b1e94a2c20321d6fcab00ee86ba913..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/act.cpython-311.pyc b/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/act.cpython-311.pyc deleted file mode 100644 index 7e4a139e20f899bddacf05d467861e2857286268..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/act.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/filter.cpython-311.pyc b/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/filter.cpython-311.pyc deleted file mode 100644 index b6416602224f887fa03f8bce27fc952f8f6ff23a..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/filter.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/resample.cpython-311.pyc b/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/resample.cpython-311.pyc deleted file mode 100644 index af56e62e2e4bffcd9444f653101a91af4241494b..0000000000000000000000000000000000000000 Binary files a/orator/src/orator/models/bigvgan/alias_free_torch/__pycache__/resample.cpython-311.pyc and /dev/null differ diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/act.py b/orator/src/orator/models/bigvgan/alias_free_torch/act.py deleted file mode 100644 index ef231b01506f01c2b66d2dc4f3f0891219b3b41a..0000000000000000000000000000000000000000 --- a/orator/src/orator/models/bigvgan/alias_free_torch/act.py +++ /dev/null @@ -1,28 +0,0 @@ -# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0 -# LICENSE is in incl_licenses directory. 
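For context on the activations deleted above: Snake computes `x + (1/α)·sin²(αx)`, i.e. the identity plus a bounded periodic term whose frequency grows with α. A standalone numeric sketch of the formula (not the project's module, just the math):

```python
# Standalone sketch of the Snake activation from the deleted activations.py:
# snake(x) = x + (1/alpha) * sin^2(alpha * x).
import torch

def snake(x: torch.Tensor, alpha: float = 1.0) -> torch.Tensor:
    return x + (1.0 / alpha) * torch.sin(alpha * x) ** 2

x = torch.linspace(-3, 3, 7)
print(snake(x, alpha=1.0))   # identity plus a bounded periodic ripple
print(snake(x, alpha=10.0))  # same trend, higher-frequency ripple
```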
diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/filter.py b/orator/src/orator/models/bigvgan/alias_free_torch/filter.py
deleted file mode 100644
index 066dce8eef9f31a868554f08efbef7c3f4422b7b..0000000000000000000000000000000000000000
--- a/orator/src/orator/models/bigvgan/alias_free_torch/filter.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
-# LICENSE is in incl_licenses directory.
-
-import math
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-if 'sinc' in dir(torch):
-    sinc = torch.sinc
-else:
-    # This code is adopted from adefossez's julius.core.sinc under the MIT License
-    # https://adefossez.github.io/julius/julius/core.html
-    # LICENSE is in incl_licenses directory.
-    def sinc(x: torch.Tensor):
-        """
-        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
-        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
-        """
-        return torch.where(x == 0,
-                           torch.tensor(1., device=x.device, dtype=x.dtype),
-                           torch.sin(math.pi * x) / math.pi / x)
-
-
-# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
-# https://adefossez.github.io/julius/julius/lowpass.html
-# LICENSE is in incl_licenses directory.
-def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):  # return filter [1,1,kernel_size]
-    even = (kernel_size % 2 == 0)
-    half_size = kernel_size // 2
-
-    #For kaiser window
-    delta_f = 4 * half_width
-    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
-    if A > 50.:
-        beta = 0.1102 * (A - 8.7)
-    elif A >= 21.:
-        beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.)
-    else:
-        beta = 0.
-    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
-
-    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
-    if even:
-        time = (torch.arange(-half_size, half_size) + 0.5)
-    else:
-        time = torch.arange(kernel_size) - half_size
-    if cutoff == 0:
-        filter_ = torch.zeros_like(time)
-    else:
-        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
-        # Normalize filter to have sum = 1, otherwise we will have a small leakage
-        # of the constant component in the input signal.
-        filter_ /= filter_.sum()
-    filter = filter_.view(1, 1, kernel_size)
-
-    return filter
-
-
-class LowPassFilter1d(nn.Module):
-    def __init__(self,
-                 cutoff=0.5,
-                 half_width=0.6,
-                 stride: int = 1,
-                 padding: bool = True,
-                 padding_mode: str = 'replicate',
-                 kernel_size: int = 12):
-        # kernel_size should be even number for stylegan3 setup,
-        # in this implementation, odd number is also possible.
-        super().__init__()
-        if cutoff < -0.:
-            raise ValueError("Minimum cutoff must be larger than zero.")
-        if cutoff > 0.5:
-            raise ValueError("A cutoff above 0.5 does not make sense.")
-        self.kernel_size = kernel_size
-        self.even = (kernel_size % 2 == 0)
-        self.pad_left = kernel_size // 2 - int(self.even)
-        self.pad_right = kernel_size // 2
-        self.stride = stride
-        self.padding = padding
-        self.padding_mode = padding_mode
-        filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
-        self.register_buffer("filter", filter)
-
-    #input [B, C, T]
-    def forward(self, x):
-        _, C, _ = x.shape
-
-        if self.padding:
-            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
-        out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
-
-        return out
diff --git a/orator/src/orator/models/bigvgan/alias_free_torch/resample.py b/orator/src/orator/models/bigvgan/alias_free_torch/resample.py
deleted file mode 100644
index 73670db9735504a51231fbe93cb812f722fb74ae..0000000000000000000000000000000000000000
--- a/orator/src/orator/models/bigvgan/alias_free_torch/resample.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
-# LICENSE is in incl_licenses directory.
-
-import torch.nn as nn
-from torch.nn import functional as F
-
-from .filter import LowPassFilter1d
-from .filter import kaiser_sinc_filter1d
-
-
-class UpSample1d(nn.Module):
-    def __init__(self, ratio=2, kernel_size=None):
-        super().__init__()
-        self.ratio = ratio
-        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
-        self.stride = ratio
-        self.pad = self.kernel_size // ratio - 1
-        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
-        self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
-        filter = kaiser_sinc_filter1d(
-            cutoff=0.5 / ratio,
-            half_width=0.6 / ratio,
-            kernel_size=self.kernel_size
-        )
-        self.register_buffer("filter", filter)
-
-    # x: [B, C, T]
-    def forward(self, x):
-        _, C, _ = x.shape
-
-        x = F.pad(x, (self.pad, self.pad), mode='replicate')
-        x = self.ratio * F.conv_transpose1d(
-            x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C
-        )
-        x = x[..., self.pad_left:-self.pad_right]
-
-        return x
-
-
-class DownSample1d(nn.Module):
-    def __init__(self, ratio=2, kernel_size=None):
-        super().__init__()
-        self.ratio = ratio
-        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
-        self.lowpass = LowPassFilter1d(
-            cutoff=0.5 / ratio,
-            half_width=0.6 / ratio,
-            stride=ratio,
-            kernel_size=self.kernel_size
-        )
-
-    def forward(self, x):
-        xx = self.lowpass(x)
-
-        return xx
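For context on the resampling pair deleted above: a 2x anti-aliased upsample followed by a 2x low-pass-filtered downsample should approximately reconstruct the input, apart from edge effects. A sketch, assuming the deleted `UpSample1d`/`DownSample1d` classes are importable from a copy of this module:

```python
# Round-trip sketch for the deleted UpSample1d/DownSample1d (alias_free_torch).
import torch

up = UpSample1d(ratio=2)      # Kaiser-windowed sinc interpolation
down = DownSample1d(ratio=2)  # low-pass filter + strided decimation

t = torch.linspace(0, 1, 256)
x = torch.sin(2 * torch.pi * 8 * t).view(1, 1, -1)  # [B, C, T] = [1, 1, 256]
y = down(up(x))
print(x.shape, y.shape)      # both torch.Size([1, 1, 256])
print((x - y).abs().max())   # small reconstruction error away from the edges
```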
diff --git a/orator/src/orator/models/bigvgan/bigvgan.py b/orator/src/orator/models/bigvgan/bigvgan.py
deleted file mode 100644
index 356142106f6c91b0cd4c8db4ec28e04811e8e1ef..0000000000000000000000000000000000000000
--- a/orator/src/orator/models/bigvgan/bigvgan.py
+++ /dev/null
@@ -1,212 +0,0 @@
-# Copyright (c) 2022 NVIDIA CORPORATION.
-# Licensed under the MIT license.
-# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
-# LICENSE is in incl_licenses directory.
-
-import logging
-from torch.nn import Conv1d, ConvTranspose1d
-from torch.nn.utils import weight_norm, remove_weight_norm
-from torch.nn.utils.weight_norm import WeightNorm
-
-from .activations import SnakeBeta
-from .alias_free_torch import *
-
-
-LRELU_SLOPE = 0.1
-
-logger = logging.getLogger(__name__)
-
-
-def get_padding(kernel_size, dilation=1):
-    return int((kernel_size*dilation - dilation)/2)
-
-
-def init_weights(m, mean=0.0, std=0.01):
-    classname = m.__class__.__name__
-    if classname.find("Conv") != -1:
-        m.weight.data.normal_(mean, std)
-
-
-class AMPBlock1(torch.nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
-        super(AMPBlock1, self).__init__()
-
-        self.convs1 = nn.ModuleList([
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
-                               padding=get_padding(kernel_size, dilation[0]))),
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
-                               padding=get_padding(kernel_size, dilation[1]))),
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
-                               padding=get_padding(kernel_size, dilation[2])))
-        ])
-        self.convs1.apply(init_weights)
-
-        self.convs2 = nn.ModuleList([
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))),
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))),
-            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)))
-        ])
-        self.convs2.apply(init_weights)
-
-        self.num_layers = len(self.convs1) + len(self.convs2)  # total number of conv layers
-
-        self.activations = nn.ModuleList([
-            Activation1d(activation=SnakeBeta(channels, alpha_logscale=True))
-            for _ in range(self.num_layers)
-        ])
-
-    def forward(self, x):
-        acts1, acts2 = self.activations[::2], self.activations[1::2]
-        for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
-            xt = a1(x)
-            xt = c1(xt)
-            xt = a2(xt)
-            xt = c2(xt)
-            x = xt + x
-
-        return x
-
-    def set_weight_norm(self, enabled: bool):
-        weight_norm_fn = weight_norm if enabled else remove_weight_norm
-        for l in self.convs1:
-            weight_norm_fn(l)
-        for l in self.convs2:
-            weight_norm_fn(l)
-
-
-class BigVGAN(nn.Module):
-    # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks.
-
-    # We've got a model in prod that has the wrong hparams for this. It's simpler to add this check than to
-    # redistribute the model.
-    ignore_state_dict_unexpected = ("cond_layer.*",)
-
-    def __init__(self):
-        super().__init__()
-
-        input_dims = 80
-
-        upsample_rates = [10, 8, 4, 2]
-        upsample_kernel_sizes = [x * 2 for x in upsample_rates]
-        upsample_initial_channel = 1024
-
-        resblock_kernel_sizes = [3, 7, 11]
-        resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
-        self.num_kernels = len(resblock_kernel_sizes)
-        self.num_upsamples = len(upsample_rates)
-
-        # pre conv
-        self.conv_pre = weight_norm(Conv1d(input_dims, upsample_initial_channel, 7, 1, padding=3))
-        self.cond_layer = None
-
-        # transposed conv-based upsamplers. does not apply anti-aliasing
-        self.ups = nn.ModuleList()
-        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-            self.ups.append(nn.ModuleList([
-                weight_norm(ConvTranspose1d(upsample_initial_channel // (2 ** i),
-                                            upsample_initial_channel // (2 ** (i + 1)),
-                                            k, u, padding=(k - u) // 2))
-            ]))
-
-        # residual blocks using anti-aliased multi-periodicity composition modules (AMP)
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = upsample_initial_channel // (2 ** (i + 1))
-            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
-                self.resblocks.append(AMPBlock1(ch, k, d))
-
-        # post conv
-        activation_post = SnakeBeta(ch, alpha_logscale=True)
-        self.activation_post = Activation1d(activation=activation_post)
-        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
-
-        # weight initialization
-        for i in range(len(self.ups)):
-            self.ups[i].apply(init_weights)
-        self.conv_post.apply(init_weights)
-
-    def forward(self, x) -> torch.Tensor:
-        """
-        Args
-        ----
-        x: torch.Tensor of shape [B, T, C]
-        """
-        with torch.inference_mode():
-            x = self.conv_pre(x)
-
-            for i in range(self.num_upsamples):
-                # upsampling
-                for i_up in range(len(self.ups[i])):
-                    x = self.ups[i][i_up](x)
-                # AMP blocks
-                xs = None
-                for j in range(self.num_kernels):
-                    if xs is None:
-                        xs = self.resblocks[i * self.num_kernels + j](x)
-                    else:
-                        xs += self.resblocks[i * self.num_kernels + j](x)
-                x = xs / self.num_kernels
-
-            # post conv
-            x = self.activation_post(x)
-            x = self.conv_post(x)
-
-            # Bound the output to [-1, 1]
-            x = torch.tanh(x)
-
-        return x
-
-    @property
-    def weight_norm_enabled(self) -> bool:
-        return any(
-            isinstance(hook, WeightNorm) and hook.name == "weight"
-            for k, hook in self.conv_pre._forward_pre_hooks.items()
-        )
-
-    def set_weight_norm(self, enabled: bool):
-        """
-        N.B.: weight norm modifies the state dict, causing incompatibilities. Conventions:
-        - BigVGAN runs with weight norm for training, without for inference (done automatically by instantiate())
-        - All checkpoints are saved with weight norm (allows resuming training)
-        """
-        if enabled != self.weight_norm_enabled:
-            weight_norm_fn = weight_norm if enabled else remove_weight_norm
-            logger.debug(f"{'Applying' if enabled else 'Removing'} weight norm...")
-
-            for l in self.ups:
-                for l_i in l:
-                    weight_norm_fn(l_i)
-            for l in self.resblocks:
-                l.set_weight_norm(enabled)
-            weight_norm_fn(self.conv_pre)
-            weight_norm_fn(self.conv_post)
-
-    def train_mode(self):
-        self.train()
-        self.set_weight_norm(enabled=True)
-
-    def inference_mode(self):
-        self.eval()
-        self.set_weight_norm(enabled=False)
-
-
-if __name__ == '__main__':
-    import sys
-    import soundfile as sf
-
-    model = BigVGAN()
-
-    state_dict = torch.load("bigvgan32k.pt")
-    msg = model.load_state_dict(state_dict)
-    model.eval()
-    model.set_weight_norm(enabled=False)
-
-    print(msg)
-    mels = torch.load("mels.pt")
-    with torch.inference_mode():
-        y = model(mels.cpu())
-
-    for i, wav in enumerate(y):
-        wav = wav.view(-1).detach().numpy()
-        sf.write(f"bigvgan_test{i}.flac", wav, samplerate=32_000, format="FLAC")
diff --git a/orator/src/orator/models/s3gen/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index 5d342fa7f91de42eb90c2f718e96aee92e8a508b..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/__pycache__/const.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/const.cpython-311.pyc
deleted file mode 100644
index e48f54f1c8576d6a38cc373a299e3db210217574..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/const.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/__pycache__/decoder.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/decoder.cpython-311.pyc
deleted file mode 100644
index ec011f3ab10de9b0af8106bdc40f7d899bdb0ea2..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/decoder.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc
deleted file mode 100644
index 7d48cc3d1d19db9d44c7181067ab5603ec06554d..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/f0_predictor.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/__pycache__/flow.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/flow.cpython-311.pyc
deleted file mode 100644
index 87974e30b64cf53f11a44dbfd7e98b9e9aedfb92..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/flow.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/__pycache__/flow_matching.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/flow_matching.cpython-311.pyc
deleted file mode 100644
index 595dfad5532ed2b6585ed8a7c0a63ab3de713f74..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/flow_matching.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/__pycache__/hifigan.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/hifigan.cpython-311.pyc
deleted file mode 100644
index 2efcda58d717d84fa19895f595c60569bd871aae..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/hifigan.cpython-311.pyc and /dev/null differ
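For context on the `set_weight_norm` convention in the deleted bigvgan.py above: applying weight norm replaces a module's `weight` entry with `weight_g`/`weight_v` in the state dict, which is why training and inference checkpoints must agree on the convention. A minimal demonstration using the same `torch.nn.utils` API the file imports:

```python
# Weight norm changes a module's state-dict keys, as described above.
import torch.nn as nn
from torch.nn.utils import weight_norm, remove_weight_norm

conv = nn.Conv1d(80, 80, 7, padding=3)
print(sorted(conv.state_dict()))  # ['bias', 'weight']
conv = weight_norm(conv)
print(sorted(conv.state_dict()))  # ['bias', 'weight_g', 'weight_v']
remove_weight_norm(conv)          # folds g * v / ||v|| back into a plain weight
print(sorted(conv.state_dict()))  # ['bias', 'weight']
```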
diff --git a/orator/src/orator/models/s3gen/__pycache__/s3gen.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/s3gen.cpython-311.pyc
deleted file mode 100644
index a9c95f44491c7e486960719e2ca6e1bc81e4896d..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/s3gen.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/__pycache__/xvector.cpython-311.pyc b/orator/src/orator/models/s3gen/__pycache__/xvector.cpython-311.pyc
deleted file mode 100644
index 75038ede7bcb6e855476701b9b49005babe03412..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/__pycache__/xvector.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc b/orator/src/orator/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc
deleted file mode 100644
index a4720cffdda43eddaa412a82e32a28b2c4da0fd9..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/matcha/__pycache__/decoder.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc b/orator/src/orator/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc
deleted file mode 100644
index cc8c38ede378987f616ba398ba51b6856825de33..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/matcha/__pycache__/flow_matching.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc b/orator/src/orator/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc
deleted file mode 100644
index cd888d39388ea076c44e8209dba94d354d760d97..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/matcha/__pycache__/transformer.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index fb5c4d5f281b7e7a94b55d890414723c172333cd..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc
deleted file mode 100644
index b8388f1425fc6faddad43b710f0c58d5c374f58a..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/activation.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc
deleted file mode 100644
index cfcd257b733783a40035b0c8fbe62d0df2409c20..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/attention.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc
deleted file mode 100644
index d7cbc1355320d0b1cdcbb187e80b7b78d9802453..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/convolution.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc
deleted file mode 100644
index 619023d20d5831a83d777d768eb5876bebfff72b..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/embedding.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc
deleted file mode 100644
index 051a22fa4053a02ce51f1e6768a97acb3a8bd6f1..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/encoder_layer.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc
deleted file mode 100644
index ad9e6ed4c9925c561c6c1f7026478bffa7ab2ab2..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/positionwise_feed_forward.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc
deleted file mode 100644
index 44b67d483ae3c70b9b144f5030b68877eb075393..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/subsampling.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc b/orator/src/orator/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc
deleted file mode 100644
index fcf1cec2735b03a6b2442bcf57003ac624ea552f..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/transformer/__pycache__/upsample_encoder.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc b/orator/src/orator/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc
deleted file mode 100644
index 4b8cc4ebe7e8779f376df268307fc0fa20d4fb20..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/utils/__pycache__/class_utils.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/utils/__pycache__/mask.cpython-311.pyc b/orator/src/orator/models/s3gen/utils/__pycache__/mask.cpython-311.pyc
deleted file mode 100644
index d3bd7c9f20a0c4b75cff9d1ccc723c2839045706..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/utils/__pycache__/mask.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3gen/utils/__pycache__/mel.cpython-311.pyc b/orator/src/orator/models/s3gen/utils/__pycache__/mel.cpython-311.pyc
deleted file mode 100644
index 42e63bec8db32bb9a5349b24e752bbbfb9c288f9..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3gen/utils/__pycache__/mel.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index 21105ed2e9931207e586ab4020974f13613385f9..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3tokenizer/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc b/orator/src/orator/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc
deleted file mode 100644
index ec1a2cb71573c30c0b17137a662f4d22009f804d..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/s3tokenizer/__pycache__/s3tokenizer.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/models/t3/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index 4955aa017c54594b34bd714a7ab635edc6c7f0a3..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/__pycache__/llama_configs.cpython-311.pyc b/orator/src/orator/models/t3/__pycache__/llama_configs.cpython-311.pyc
deleted file mode 100644
index c2e37f3930829ce356a9e01fd75a51a856624f0f..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/__pycache__/llama_configs.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/__pycache__/t3.cpython-311.pyc b/orator/src/orator/models/t3/__pycache__/t3.cpython-311.pyc
deleted file mode 100644
index d70491dab33ea926ac13d89f18800ca491b1fa0e..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/__pycache__/t3.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc b/orator/src/orator/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc
deleted file mode 100644
index e72820205c9dc263715ccbcdef1321f7644b7d32..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc b/orator/src/orator/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc
deleted file mode 100644
index 4b99ee1fb64a8d54d165437d566e7cf64223e3cb..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/modules/__pycache__/cond_enc.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc b/orator/src/orator/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc
deleted file mode 100644
index 31256b321e52e439fc42a8d4bcccad9a76eeb7c6..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/modules/__pycache__/learned_pos_emb.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/modules/__pycache__/perceiver.cpython-311.pyc b/orator/src/orator/models/t3/modules/__pycache__/perceiver.cpython-311.pyc
deleted file mode 100644
index 935c51d564eab2bf7d6870592d7b4020a1d8f9d5..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/modules/__pycache__/perceiver.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/t3/modules/__pycache__/t3_config.cpython-311.pyc b/orator/src/orator/models/t3/modules/__pycache__/t3_config.cpython-311.pyc
deleted file mode 100644
index 31beca9bd0a70041f29c21a2145f09cb06426211..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/t3/modules/__pycache__/t3_config.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/tokenizers/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/models/tokenizers/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index fad1839ea6e3aba23d1aece28f34443d95f441c8..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/tokenizers/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc b/orator/src/orator/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc
deleted file mode 100644
index 22fff1b2973015e0489db75db537633e880c2c98..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/tokenizers/__pycache__/tokenizer.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/voice_encoder/__pycache__/__init__.cpython-311.pyc b/orator/src/orator/models/voice_encoder/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index f3a8c6b767936a3466e9ede2182d17a5e87de5ff..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/voice_encoder/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc b/orator/src/orator/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc
deleted file mode 100644
index f7e8414b8ab2656a531547f37a18d85c88628fc1..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/models/voice_encoder/__pycache__/voice_encoder.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/transforms/__pycache__/spectrogram.cpython-311.pyc b/orator/src/orator/transforms/__pycache__/spectrogram.cpython-311.pyc
deleted file mode 100644
index b5ec09740da2846bda8787e4ae5c8d64d4d9989a..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/transforms/__pycache__/spectrogram.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/transforms/__pycache__/syn_transforms.cpython-311.pyc b/orator/src/orator/transforms/__pycache__/syn_transforms.cpython-311.pyc
deleted file mode 100644
index 2f220a91e3eec386e62a21db3d03ca6648cb527f..0000000000000000000000000000000000000000
Binary files a/orator/src/orator/transforms/__pycache__/syn_transforms.cpython-311.pyc and /dev/null differ
diff --git a/orator/src/orator/transforms/spectrogram.py b/orator/src/orator/transforms/spectrogram.py
deleted file mode 100644
index 69147fc8c591c9364ff829a157af0ea3fcbd5770..0000000000000000000000000000000000000000
--- a/orator/src/orator/transforms/spectrogram.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from functools import lru_cache
-
-from scipy import signal
-import numpy as np
-import librosa
-
-
-@lru_cache()
-def mel_basis(hp):
-    assert hp.fmax <= hp.sample_rate // 2
-    return librosa.filters.mel(
-        sr=hp.sample_rate,
-        n_fft=hp.n_fft,
-        n_mels=hp.num_mels,
-        fmin=hp.fmin,
-        fmax=hp.fmax)  # -> (nmel, nfreq)
-
-
-def preemphasis(wav, hp):
-    assert hp.preemphasis != 0
-    wav = signal.lfilter([1, -hp.preemphasis], [1], wav)
-    wav = np.clip(wav, -1, 1)
-    return wav
-
-
-def melspectrogram(wav, hp, pad=True):
-    # Run through pre-emphasis
-    if hp.preemphasis > 0:
-        wav = preemphasis(wav, hp)
-        assert np.abs(wav).max() - 1 < 1e-07
-
-    # Do the stft
-    spec_complex = _stft(wav, hp, pad=pad)
-
-    # Get the magnitudes
-    spec_magnitudes = np.abs(spec_complex)
-
-    if hp.mel_power != 1.0:
-        spec_magnitudes **= hp.mel_power
-
-    # Get the mel and convert magnitudes->db
-    mel = np.dot(mel_basis(hp), spec_magnitudes)
-    if hp.mel_type == "db":
-        mel = _amp_to_db(mel, hp)
-
-    # Normalise the mel from db to 0,1
-    if hp.normalized_mels:
-        mel = _normalize(mel, hp).astype(np.float32)
-
-    assert not pad or mel.shape[1] == 1 + len(wav) // hp.hop_size  # Sanity check
-    return mel  # (M, T)
-
-
-def _stft(y, hp, pad=True):
-    # NOTE: after 0.8, pad mode defaults to constant, setting this to reflect for
-    # historical consistency and streaming-version consistency
-    return librosa.stft(
-        y,
-        n_fft=hp.n_fft,
-        hop_length=hp.hop_size,
-        win_length=hp.win_size,
-        center=pad,
-        pad_mode="reflect",
-    )
-
-
-def _amp_to_db(x, hp):
-    return 20 * np.log10(np.maximum(hp.stft_magnitude_min, x))
-
-
-def _db_to_amp(x):
-    return np.power(10.0, x * 0.05)
-
-
-def _normalize(s, hp, headroom_db=15):
-    min_level_db = 20 * np.log10(hp.stft_magnitude_min)
-    s = (s - min_level_db) / (-min_level_db + headroom_db)
-    return s
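For context on the deleted `melspectrogram` pipeline above: `mel_basis(hp)` is wrapped in `lru_cache`, so the hparams object must be hashable. A sketch with an illustrative, assumed `HParams` tuple (the field values below are not the project's settings), assuming the deleted module is in scope:

```python
# Hypothetical hparams for the deleted melspectrogram(); values are assumptions.
from typing import NamedTuple
import numpy as np

class HParams(NamedTuple):  # NamedTuple is hashable, as lru_cache requires
    sample_rate: int = 16000
    n_fft: int = 1024
    hop_size: int = 256
    win_size: int = 1024
    num_mels: int = 80
    fmin: int = 0
    fmax: int = 8000
    preemphasis: float = 0.97
    mel_power: float = 1.0
    mel_type: str = "db"
    stft_magnitude_min: float = 1e-5
    normalized_mels: bool = True

hp = HParams()
wav = np.random.uniform(-0.5, 0.5, 16000).astype(np.float32)
mel = melspectrogram(wav, hp)  # -> (num_mels, 1 + len(wav) // hop_size)
print(mel.shape)               # (80, 63)
```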
diff --git a/orator/src/orator/transforms/syn_transforms.py b/orator/src/orator/transforms/syn_transforms.py
deleted file mode 100644
index 13ce597ae05503ef618b2de9b6c7b833f94409cb..0000000000000000000000000000000000000000
--- a/orator/src/orator/transforms/syn_transforms.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Common transformations used by synthesizers
-import logging
-
-import numpy as np
-import torch
-
-
-logger = logging.getLogger(__name__)
-
-
-def pack(arrays, seq_len: int=None, pad_value=0):
-    """
-    Given a list of length B of array-like objects of shapes (Ti, ...), packs them in a single tensor of
-    shape (B, T, ...) by padding each individual array on the right.
-
-    :param arrays: a list of array-like objects of matching shapes except for the first axis.
-    :param seq_len: the value of T. It must be the maximum of the lengths Ti of the arrays at
-    minimum. Will default to that value if None.
-    :param pad_value: the value to pad the arrays with.
-    :return: a (B, T, ...) tensor
-    """
-    if seq_len is None:
-        seq_len = max(len(array) for array in arrays)
-    else:
-        assert seq_len >= max(len(array) for array in arrays)
-
-    # Convert lists to np.array
-    if isinstance(arrays[0], list):
-        arrays = [np.array(array) for array in arrays]
-
-    # Convert to tensor and handle device
-    device = None
-    if isinstance(arrays[0], torch.Tensor):
-        tensors = arrays
-        device = tensors[0].device
-    else:
-        tensors = [torch.as_tensor(array) for array in arrays]
-
-    # Fill the packed tensor with the array data
-    packed_shape = (len(tensors), seq_len, *tensors[0].shape[1:])
-    packed_tensor = torch.full(packed_shape, pad_value, dtype=tensors[0].dtype, device=device)
-
-    for i, tensor in enumerate(tensors):
-        packed_tensor[i, :tensor.size(0)] = tensor
-
-    return packed_tensor
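For context on the deleted `pack` helper above: it right-pads a ragged batch of sequences into one `(B, T, ...)` tensor. A usage sketch, assuming the deleted function is in scope:

```python
# Usage sketch for the deleted pack() from syn_transforms.py.
import torch

batch = [torch.tensor([1, 2, 3]), torch.tensor([4, 5]), torch.tensor([6])]
packed = pack(batch, pad_value=0)
print(packed)
# tensor([[1, 2, 3],
#         [4, 5, 0],
#         [6, 0, 0]])
```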
diff --git a/orator/src/orator/transforms/syn_transforms.py b/orator/src/orator/transforms/syn_transforms.py
deleted file mode 100644
index 13ce597ae05503ef618b2de9b6c7b833f94409cb..0000000000000000000000000000000000000000
--- a/orator/src/orator/transforms/syn_transforms.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Common transformations used by synthesizers
-import logging
-
-import numpy as np
-import torch
-
-
-logger = logging.getLogger(__name__)
-
-
-def pack(arrays, seq_len: int=None, pad_value=0):
-    """
-    Given a list of length B of array-like objects of shapes (Ti, ...), packs them in a single tensor of
-    shape (B, T, ...) by padding each individual array on the right.
-
-    :param arrays: a list of array-like objects of matching shapes except for the first axis.
-    :param seq_len: the value of T. It must be at least the maximum of the lengths Ti of the
-    arrays. Will default to that value if None.
-    :param pad_value: the value to pad the arrays with.
-    :return: a (B, T, ...) tensor
-    """
-    if seq_len is None:
-        seq_len = max(len(array) for array in arrays)
-    else:
-        assert seq_len >= max(len(array) for array in arrays)
-
-    # Convert lists to np.array
-    if isinstance(arrays[0], list):
-        arrays = [np.array(array) for array in arrays]
-
-    # Convert to tensor and handle device
-    device = None
-    if isinstance(arrays[0], torch.Tensor):
-        tensors = arrays
-        device = tensors[0].device
-    else:
-        tensors = [torch.as_tensor(array) for array in arrays]
-
-    # Fill the packed tensor with the array data
-    packed_shape = (len(tensors), seq_len, *tensors[0].shape[1:])
-    packed_tensor = torch.full(packed_shape, pad_value, dtype=tensors[0].dtype, device=device)
-
-    for i, tensor in enumerate(tensors):
-        packed_tensor[i, :tensor.size(0)] = tensor
-
-    return packed_tensor
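The removed pack helper is easiest to follow from its shapes; a small usage sketch with arbitrary values:

import torch

# Three sequences of lengths 3, 5 and 2 become one right-padded (B=3, T=5) batch.
a = torch.tensor([1, 2, 3])
b = torch.tensor([4, 5, 6, 7, 8])
c = torch.tensor([9, 10])

# With the deleted helper:
#   packed = pack([a, b, c], pad_value=0)
#   packed.shape  -> torch.Size([3, 5])
#   packed[2]     -> tensor([ 9, 10,  0,  0,  0])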
diff --git a/orator/src/orator/transforms/webrtc.py b/orator/src/orator/transforms/webrtc.py
deleted file mode 100644
index c3d3abf97f27ac7f5c51ea7228920a0c522ed934..0000000000000000000000000000000000000000
--- a/orator/src/orator/transforms/webrtc.py
+++ /dev/null
@@ -1,181 +0,0 @@
-from itertools import groupby
-
-import numpy as np
-import webrtcvad as _webrtcvad
-
-from transforms.vad.vad_stream import VADStream
-from transforms.wav_encoding import encode_pcm16
-
-# The sample rates the algo can operate at
-_WEBRTC_SAMPLE_RATES = np.array([8000, 16000, 32000, 48000])
-# The algo operates with window sizes of 10, 20 and 30ms
-_WEBRTC_WINDOW_SIZES_MS = (10, 20, 30)
-# Greatest common divisor and lowest common multiple of the above
-_WEBRTC_WINDOW_SIZES_MS_GCD = 10
-_WEBRTC_WINDOW_SIZES_MS_LCM = 60
-
-
-class WebRTCVADStream(VADStream):
-    def __init__(self, sample_rate: int, aggressiveness=2, dilation_ms=40, min_voiced_region_ms=125):
-        """
-        :param sample_rate: sample rate of the wavs that will be passed
-        :param aggressiveness: parameter controlling the aggressiveness of the VAD algo. Possible values are 1,
-        2 and 3. Higher means fewer regions will be detected as voiced.
-        :param dilation_ms: pass a value greater than 0 to include regions directly preceding or succeeding voiced
-        regions. Voiced regions will be expanded left and right by this value, in milliseconds.
-        N.B.: this is a best-effort parameter. When the output is requested as fast as the input is produced,
-        it's impossible to foresee an upcoming voiced region. In that case, the dilation on the left of that region
-        may not appear.
-        :param min_voiced_region_ms: to exclude regions detected as speech that are considered too short, pass a value
-        greater than 0. Voiced regions shorter than this value (prior to dilation) will be set as unvoiced.
-        N.B.: this is also a best-effort parameter. A region may be too short, but if VAD has not finished
-        being computed at the end of that region, it won't be removed, as it could still turn out to be long enough.
-        """
-        webrtc_sr = int(_WEBRTC_SAMPLE_RATES[np.argmin(np.abs(_WEBRTC_SAMPLE_RATES - sample_rate))])
-        lcm_win_size = (_WEBRTC_WINDOW_SIZES_MS_LCM * webrtc_sr) // 1000
-        self._gcd_win_size = (_WEBRTC_WINDOW_SIZES_MS_GCD * webrtc_sr) // 1000
-
-        # webrtcvad.Vad is stateful: predictions would be impacted if a new instance were created halfway
-        # through an audio. This is why we create them all up front.
-        self._detectors = {win_size: _webrtcvad.Vad(mode=aggressiveness) for win_size in _WEBRTC_WINDOW_SIZES_MS}
-
-        super().__init__(sample_rate, webrtc_sr, lcm_win_size, dilation_ms, min_voiced_region_ms)
-
-    def _wav_vad(self, wav: np.ndarray) -> np.ndarray:
-        pcm = encode_pcm16(wav)
-
-        # Perform the VAD by ensembling the different window sizes
-        win_vad = np.zeros(len(wav) // self._gcd_win_size, dtype=np.int32)
-        for sub_win_size_ms in _WEBRTC_WINDOW_SIZES_MS:
-            detector = self._detectors[sub_win_size_ms]
-            sub_win_size_pcm = (2 * sub_win_size_ms * self.vad_sr) // 1000
-            factor = sub_win_size_ms // _WEBRTC_WINDOW_SIZES_MS_GCD
-
-            for i, win_start in enumerate(range(0, len(pcm), sub_win_size_pcm)):
-                win_i_vad = detector.is_speech(pcm[win_start:win_start + sub_win_size_pcm], self.vad_sr)
-                win_vad[i * factor:(i + 1) * factor] += win_i_vad
-        win_vad = win_vad > (len(_WEBRTC_WINDOW_SIZES_MS) // 2)
-
-        # Convert the output to regions
-        regions = np.diff(win_vad, prepend=0, append=0).nonzero()[0].reshape(-1, 2)
-        regions = regions * (len(wav) // len(win_vad))
-
-        return regions
-
-
-def webrtc_vad(wav: np.ndarray, source_sr: int, aggressiveness=2, dilation_ms=40, min_voiced_region_ms=125):
-    """
-    Performs Voice Activity Detection on a single audio. See WebRTCVADStream for more details.
-
-    :return vad: a boolean numpy array of length equal to len(wav)
-    """
-    vad_stream = WebRTCVADStream(source_sr, aggressiveness, dilation_ms, min_voiced_region_ms)
-    vad_stream.feed(wav, close_input=True)
-    if vad_stream.can_step():
-        return vad_stream.step(len(wav))
-    else:
-        return np.zeros_like(wav, dtype=bool)
-
-
-def split_on_silence(
-    wav, sr, vad, thresholds_ms=[500, 300, 200, 100, 50], min_dur_s=1.5, max_split_dur_s=20, max_dur_s=30,
-):
-    """
-    Split a wav into chunks, splitting on silence when the length of the silence exceeds a threshold.
-
-    Args:
-        wav: 1d-array
-        sr: sample rate
-        vad: boolean voice activity array aligned with wav, e.g. the output of webrtc_vad
-        thresholds_ms: min length of silence to split on; clips are recursively split using values from this list until
-            the resulting chunks are all within the min / max duration bounds
-        min_dur_s: minimum duration of a chunk in seconds
-        max_split_dur_s: segments above this length continue to be split down with smaller thresholds
-        max_dur_s: maximum duration of a chunk in seconds
-    """
-    assert isinstance(wav, np.ndarray) and wav.ndim == 1
-
-    # unpack silence length thresholds
-    thresh_ms, next_thresh_ms = (thresholds_ms + [0, 0])[:2]
-    if thresh_ms <= 0:
-        return [wav]
-
-    # convert thresholds to samples
-    max_split_dur_s = min(max_split_dur_s, max_dur_s)
-    thresh = int(thresh_ms * sr / 1000)
-    min_len = int(min_dur_s * sr)
-    max_split_len = int(max_split_dur_s * sr)
-    max_len = int(max_dur_s * sr)
-    wav_len = len(wav)
-
-    # detect regions of silence using groupby
-    sil_regions = []
-    for is_voiced, idxs in groupby(range(wav_len), key=vad.__getitem__):
-        idxs = list(idxs)
-        i = idxs[0]
-        j = idxs[-1]
-        j += 1
-        n = j - i
-        mid = (i + j) // 2
-
-        # record split point if this is a long silence region
-        if (not is_voiced) and n > thresh:
-            sil_regions += [(
-                min(mid, i + (0 if i == 0 else thresh // 2)),
-                max(mid, j - (0 if j == wav_len else thresh // 2)),
-            )]
-
-    # invert silence regions to get voiced regions
-    ptr = 0
-    voiced_regions = []
-    for i, j in sil_regions:
-        if i > 0:
-            voiced_regions += [(ptr, i)]
-        ptr = j
-    if ptr < wav_len:
-        voiced_regions += [(ptr, wav_len)]
-
-    # split the waveform into chunks using the detected content bounds and silence split points
-    chunks = []
-    for i, j in voiced_regions:
-        chunk = wav[i:j]
-        chunklen = len(chunk)
-
-        # chunk is within bounds
-        if chunklen < max_split_len:
-            chunks += [chunk]
-
-        # chunk is too long, attempt to split it recursively using the threshold list
-        elif next_thresh_ms > 0:
-            chunks += split_on_silence(
-                chunk, sr, vad[i:j], thresholds_ms=thresholds_ms[1:],
-                min_dur_s=min_dur_s, max_dur_s=max_dur_s,
-            )
-
-        # NOTE: keeping chunks longer than `max_len` here, filtering is done below
-        else:
-            chunks += [chunk]
-
-    # merge short chunks
-    merged_chunks = []
-    for chunk in chunks:
-        chunklen = len(chunk)
-
-        # chunk is too short, add it to the previous chunk if possible
-        if chunklen == 0:
-            continue
-
-        elif chunklen < min_len:
-            # NOTE: ignore the edge case where this would make the previous chunk too long, by just dropping this chunk
-            if len(merged_chunks) > 0 and len(merged_chunks[-1]) + chunklen < max_len:
-                merged_chunks[-1] = np.concatenate([merged_chunks[-1], chunk])
-
-        elif chunklen < max_len:
-            merged_chunks += [chunk]
-
-        else:
-            # TODO: keep long chunks as well? one benefit is to keep the adjacent ordering of chunks, for
-            # building paragraph-level datasets. However, this should rarely drop any clips, so it's probably okay.
-            # merged_chunks += [chunk]
-            pass
-    chunks = merged_chunks
-
-    return chunks
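A usage sketch for the removed webrtc_vad wrapper; the file path is a placeholder and loading via librosa is an assumption (any mono float waveform in [-1, 1] works):

import numpy as np
import librosa  # assumption: used only to obtain a mono float waveform

wav, sr = librosa.load("speech.wav", sr=None, mono=True)  # placeholder path

# With the deleted module: a per-sample boolean voiced mask. Internally the
# stream snaps sr to the nearest WebRTC-supported rate (8/16/32/48 kHz) and
# majority-votes the 10/20/30 ms detectors.
#   vad = webrtc_vad(wav, sr, aggressiveness=2)
#   assert vad.shape == wav.shape and vad.dtype == bool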
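And a sketch of how split_on_silence composes with it to segment a long recording; the output file names are hypothetical, and the duration bounds quoted are simply the defaults from the signature above:

import soundfile as sf  # assumption: any wav writer would do

# With the deleted helpers: split on silences of decreasing length (500 ms
# down to 50 ms) until every kept chunk is between min_dur_s and max_dur_s.
#   vad = webrtc_vad(wav, sr)
#   chunks = split_on_silence(wav, sr, vad)
#   for n, chunk in enumerate(chunks):
#       sf.write(f"chunk_{n:03d}.wav", chunk, sr)  # hypothetical output names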