Spaces:

chenxie95
/

MeanAudio

Running on Zero

App Files Files Community

junxiliu commited on 28 days ago

Commit

3a1da90

1 Parent(s): 6ec9214

add needed model with proper LFS tracking

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +2 -0
LICENSE +21 -0
MeanAudio +0 -1
config/__init__.py +0 -0
config/base_config.yaml +65 -0
config/data/t5_clap.yaml +58 -0
config/eval_config.yaml +23 -0
config/hydra/job_logging/custom-eval.yaml +32 -0
config/hydra/job_logging/custom-no-rank.yaml +32 -0
config/hydra/job_logging/custom-simplest.yaml +26 -0
config/hydra/job_logging/custom.yaml +33 -0
config/train_config.yaml +46 -0
data/.gitkeep +0 -0
eval.py +151 -0
infer.py +143 -0
meanaudio/__init__.py +0 -0
meanaudio/data/__init__.py +0 -0
meanaudio/data/av_utils.py +162 -0
meanaudio/data/data_setup.py +137 -0
meanaudio/data/eval/__init__.py +0 -0
meanaudio/data/eval/audiocaps.py +39 -0
meanaudio/data/eval/moviegen.py +131 -0
meanaudio/data/eval/video_dataset.py +197 -0
meanaudio/data/extracted_audio.py +175 -0
meanaudio/data/extraction/__init__.py +0 -0
meanaudio/data/extraction/vgg_sound.py +195 -0
meanaudio/data/extraction/wav_dataset.py +153 -0
meanaudio/data/mm_dataset.py +50 -0
meanaudio/data/utils.py +148 -0
meanaudio/eval_utils.py +167 -0
meanaudio/ext/__init__.py +1 -0
meanaudio/ext/autoencoder/__init__.py +1 -0
meanaudio/ext/autoencoder/autoencoder.py +52 -0
meanaudio/ext/autoencoder/edm2_utils.py +168 -0
meanaudio/ext/autoencoder/vae.py +369 -0
meanaudio/ext/autoencoder/vae_modules.py +117 -0
meanaudio/ext/bigvgan/LICENSE +21 -0
meanaudio/ext/bigvgan/__init__.py +1 -0
meanaudio/ext/bigvgan/activations.py +120 -0
meanaudio/ext/bigvgan/alias_free_torch/__init__.py +6 -0
meanaudio/ext/bigvgan/alias_free_torch/act.py +28 -0
meanaudio/ext/bigvgan/alias_free_torch/filter.py +95 -0
meanaudio/ext/bigvgan/alias_free_torch/resample.py +49 -0
meanaudio/ext/bigvgan/bigvgan.py +32 -0
meanaudio/ext/bigvgan/bigvgan_vocoder.yml +63 -0
meanaudio/ext/bigvgan/env.py +18 -0
meanaudio/ext/bigvgan/incl_licenses/LICENSE_1 +21 -0
meanaudio/ext/bigvgan/incl_licenses/LICENSE_2 +21 -0
meanaudio/ext/bigvgan/incl_licenses/LICENSE_3 +201 -0
meanaudio/ext/bigvgan/incl_licenses/LICENSE_4 +29 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 Sony Research Inc.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

MeanAudio DELETED Viewed

	@@ -1 +0,0 @@
1	- Subproject commit 5f221b4b30ba3f89e8711c54961461c48d4999b8

config/__init__.py ADDED Viewed

File without changes

config/base_config.yaml ADDED Viewed

	@@ -0,0 +1,65 @@

+defaults:
+  - data: t5_clap  # change here to load different data in testing (data.AudioCaps_test)
+  - override hydra/job_logging: custom-simplest
+  - _self_
+hydra:
+  run:
+    dir: ./exps/${exp_id}
+  output_subdir: ${now:%Y-%m-%d_%H-%M-%S}-hydra
+enable_email: False
+## model
+model: meanaudio_mf
+text_encoder_name: t5_clap  # [t5, clip, t5_clap, t5_clap_cat]: change here for different feature utils (only for runner-FeatureUtils/infer, not used for using pre-computed dataset)
+concat_text_fc: False
+exp_id: default
+debug: False
+cudnn_benchmark: True
+compile: False  # set compile to false by default
+amp: True
+weights: null
+# weights: null
+checkpoint: null
+seed: 14159265
+num_workers: 10 # per-GPU
+pin_memory: False # set to True if your system can handle it, i.e., have enough memory
+# NOTE: This DOES NOT affect the model during inference in any way
+# they are just for the dataloader to fill in the missing data in multi-modal loading
+# to change the sequence length for the model, see networks.py
+data_dim:
+  text_seq_len: 77
+  text_dim: 1024
+  text_c_dim: 512  # 1024 for pooled T5, 512 for CLAP
+# ema configuration
+ema:
+  enable: True
+  sigma_rels: [0.05, 0.1]
+  update_every: 1
+  checkpoint_every: 10_000
+  checkpoint_folder: ${hydra:run.dir}/ema_ckpts
+  default_output_sigma: 0.05
+# sampling, only for flow matching
+sampling:
+  mean: 0.0
+  scale: 1.0
+  min_sigma: 0.0
+  method: euler
+  num_steps: 25
+# classifier-free guidance
+null_condition_probability: 0.1
+cfg_strength: 1
+# checkpoint paths to external modules
+vae_16k_ckpt: ./weights/v1-16.pth
+vae_44k_ckpt: ./weights/v1-44.pth
+bigvgan_vocoder_ckpt: ./weights/best_netG.pt

config/data/t5_clap.yaml ADDED Viewed

	@@ -0,0 +1,58 @@

+# AudioCaps
+AudioCaps_npz:
+  tag: train
+  tsv: data/audiocaps/train-memmap.tsv
+  npz_dir: data/audiocaps/train-npz-t5-clap
+  output_subdir: null
+  repa_npz_dir: null
+AudioCaps_val_npz:
+  tag: val
+  tsv: data/audiocaps/val-memmap.tsv
+  npz_dir: data/audiocaps/val-npz-t5-clap
+  output_subdir: null
+  repa_npz_dir: null
+  gt_cache: data/audiocaps/val-features
+AudioCaps_test_npz:
+  tag: test
+  tsv: data/audiocaps/test-memmap.tsv
+  npz_dir: data/audiocaps/test-npz-t5-clap
+  output_subdir: null
+  repa_npz_dir: null
+  gt_cache: data/audiocaps/test-features
+latent_mean: 'sets/latent_mean.pt'
+latent_std: 'sets/latent_std.pt'
+# Clotho
+Clotho_npz:
+  tsv: /hpc_stor03/sjtu_home/xiquan.li/data/MMAudio/clotho/dev-memmap-t5-clap.tsv
+  npz_dir: /hpc_stor03/sjtu_home/xiquan.li/data/MMAudio/clotho/dev-npz-t5-clap
+  repa_npz_dir: null
+# WavCaps
+AudioSetSL_npz:
+  tsv: /hpc_stor03/sjtu_home/xiquan.li/data/MMAudio/wavcaps/audioset-sl-memmap-t5-clap.tsv
+  npz_dir: /hpc_stor03/sjtu_home/xiquan.li/data/MMAudio/wavcaps/audioset-sl-npz-t5-clap
+  repa_npz_dir: null
+BBCSound_npz:
+  tsv: /hpc_stor03/sjtu_home/xiquan.li/data/MMAudio/wavcaps/bbc-sound-effects-memmap-t5-clap.tsv
+  npz_dir: /hpc_stor03/sjtu_home/xiquan.li/data/MMAudio/wavcaps/bbc-sound-effects-npz-t5-clap
+  repa_npz_dir: null
+FreeSound1_npz:
+  tsv: /hpc_stor03/sjtu_home/junxi.liu/shared/freesound-memmap-t5-clap-1.tsv
+  npz_dir: /hpc_stor03/sjtu_home/junxi.liu/shared/freesound-npz-t5-clap-1
+  repa_npz_dir: null
+FreeSound2_npz:
+  tsv: /hpc_stor03/sjtu_home/junxi.liu/shared/freesound-memmap-t5-clap-2.tsv
+  npz_dir: /hpc_stor03/sjtu_home/junxi.liu/shared/freesound-npz-t5-clap-2
+  repa_npz_dir: null
+FreeSound3_npz:
+  tsv: /hpc_stor03/sjtu_home/junxi.liu/shared/freesound-memmap-t5-clap-3.tsv
+  npz_dir: /hpc_stor03/sjtu_home/junxi.liu/shared/freesound-npz-t5-clap-3
+  repa_npz_dir: null

config/eval_config.yaml ADDED Viewed

	@@ -0,0 +1,23 @@

+## This config file is no longer used
+## We pass everything by train_config to ensure training/eval consistency
+defaults:
+  - base_config_at
+  - override hydra/job_logging: custom-simplest
+  - _self_
+hydra:
+  run:
+    dir: ./exps/${exp_id}
+  output_subdir: eval-${now:%Y-%m-%d_%H-%M-%S}-hydra
+exp_id: ${model}
+dataset: audiocaps
+duration_s: 10.0
+# for inference, this is the per-GPU batch size
+batch_size: 16  # eval batch size
+output_name: null
+enable_grad_scaler: False

config/hydra/job_logging/custom-eval.yaml ADDED Viewed

	@@ -0,0 +1,32 @@

+# python logging configuration for tasks
+version: 1
+formatters:
+  simple:
+    format: '[%(asctime)s][%(levelname)s][r${oc.env:LOCAL_RANK}] - %(message)s'
+    datefmt: '%Y-%m-%d %H:%M:%S'
+  colorlog:
+    '()': 'colorlog.ColoredFormatter'
+    format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
+    datefmt: '%Y-%m-%d %H:%M:%S'
+    log_colors:
+      DEBUG: purple
+      INFO: green
+      WARNING: yellow
+      ERROR: red
+      CRITICAL: red
+handlers:
+  console:
+    class: logging.StreamHandler
+    formatter: colorlog
+    stream: ext://sys.stdout
+  file:
+    class: logging.FileHandler
+    formatter: simple
+    # absolute file path
+    filename: ${hydra.runtime.output_dir}/eval-${now:%Y-%m-%d_%H-%M-%S}-rank${oc.env:LOCAL_RANK}.log
+    mode: w
+root:
+  level: INFO
+  handlers: [console, file]
+disable_existing_loggers: false

config/hydra/job_logging/custom-no-rank.yaml ADDED Viewed

	@@ -0,0 +1,32 @@

+# python logging configuration for tasks
+version: 1
+formatters:
+  simple:
+    format: '[%(asctime)s][%(levelname)s] - %(message)s'
+    datefmt: '%Y-%m-%d %H:%M:%S'
+  colorlog:
+    '()': 'colorlog.ColoredFormatter'
+    format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
+    datefmt: '%Y-%m-%d %H:%M:%S'
+    log_colors:
+      DEBUG: purple
+      INFO: green
+      WARNING: yellow
+      ERROR: red
+      CRITICAL: red
+handlers:
+  console:
+    class: logging.StreamHandler
+    formatter: colorlog
+    stream: ext://sys.stdout
+  file:
+    class: logging.FileHandler
+    formatter: simple
+    # absolute file path
+    filename: ${hydra.runtime.output_dir}/${now:%Y-%m-%d_%H-%M-%S}-eval.log
+    mode: w
+root:
+  level: INFO
+  handlers: [console, file]
+disable_existing_loggers: false

config/hydra/job_logging/custom-simplest.yaml ADDED Viewed

	@@ -0,0 +1,26 @@

+# python logging configuration for tasks
+version: 1
+formatters:
+  simple:
+    format: '[%(asctime)s][%(levelname)s] - %(message)s'
+    datefmt: '%Y-%m-%d %H:%M:%S'
+  colorlog:
+    '()': 'colorlog.ColoredFormatter'
+    format: '[%(cyan)s%(asctime)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
+    datefmt: '%Y-%m-%d %H:%M:%S'
+    log_colors:
+      DEBUG: purple
+      INFO: green
+      WARNING: yellow
+      ERROR: red
+      CRITICAL: red
+handlers:
+  console:
+    class: logging.StreamHandler
+    formatter: colorlog
+    stream: ext://sys.stdout
+root:
+  level: INFO
+  handlers: [console]
+disable_existing_loggers: false

config/hydra/job_logging/custom.yaml ADDED Viewed

	@@ -0,0 +1,33 @@

+# @package hydra.job_logging
+# python logging configuration for tasks
+version: 1
+formatters:
+  simple:
+    format: '[%(asctime)s][%(levelname)s][r${oc.env:LOCAL_RANK}] - %(message)s'
+    datefmt: '%Y-%m-%d %H:%M:%S'
+  colorlog:
+    '()': 'colorlog.ColoredFormatter'
+    format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)sr${oc.env:LOCAL_RANK}%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s'
+    datefmt: '%Y-%m-%d %H:%M:%S'
+    log_colors:
+      DEBUG: purple
+      INFO: green
+      WARNING: yellow
+      ERROR: red
+      CRITICAL: red
+handlers:
+  console:
+    class: logging.StreamHandler
+    formatter: colorlog
+    stream: ext://sys.stdout
+  file:
+    class: logging.FileHandler
+    formatter: simple
+    # absolute file path
+    filename: ${hydra.runtime.output_dir}/train-${now:%Y-%m-%d_%H-%M-%S}-rank${oc.env:LOCAL_RANK}.log
+    mode: w
+root:
+  level: INFO
+  handlers: [console, file]
+disable_existing_loggers: false

config/train_config.yaml ADDED Viewed

	@@ -0,0 +1,46 @@

+defaults:
+  - base_config
+  - override data: t5_clap    # change here for loading different text features in training/evaluation
+  - override hydra/job_logging: custom
+  - _self_
+hydra:
+  run:
+    dir: ./exps/${exp_id}
+  output_subdir: train-${now:%Y-%m-%d_%H-%M-%S}-hydra
+ema:
+  start: 0
+mini_train: False
+example_train: False
+enable_grad_scaler: True
+ac_oversample_rate: 5
+log_text_interval: 50
+log_extra_interval: 10_000
+val_interval: 10_000
+eval_interval: 10_000
+save_eval_interval: 10_000
+save_weights_interval: 5_000
+save_checkpoint_interval: 10_000
+save_copy_iterations: []
+batch_size: 128
+eval_batch_size: 4
+num_iterations: 100_000
+learning_rate: 1e-4
+linear_warmup_steps: 1_000
+lr_schedule: step
+lr_schedule_steps: [40_000, 45_000]  # this is not used, lr_schedule_steps will be determined by the number of iterations
+lr_schedule_gamma: 0.1
+clip_grad_norm: 1.0
+weight_decay: 1.0e-6
+output_name: null   # for eval
+use_meanflow: True
+use_repa: False

data/.gitkeep ADDED Viewed

File without changes

eval.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import logging
+from argparse import ArgumentParser
+from pathlib import Path
+import os
+import torch
+import torchaudio
+import csv
+from meanaudio.eval_utils import (ModelConfig, all_model_cfg, generate_fm, generate_mf, setup_eval_logging)
+from meanaudio.model.flow_matching import FlowMatching
+from meanaudio.model.mean_flow import MeanFlow
+from meanaudio.model.networks import MeanAudio, get_mean_audio
+from meanaudio.model.utils.features_utils import FeaturesUtils
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+from tqdm import tqdm
+log = logging.getLogger()
+@torch.inference_mode()
+def main():
+    setup_eval_logging()
+    parser = ArgumentParser()
+    parser.add_argument('--variant',
+                        type=str,
+                        default='meanaudio_mf',
+                        help='meanaudio_mf, fluxaudio_fm')
+    parser.add_argument('--audio_path', type=str, help='Input audio', default='')
+    parser.add_argument('--duration', type=float, default=9.975)  # for 312 latents, seq_config should has a duration of 9.975s
+    parser.add_argument('--cfg_strength', type=float, default=4.5,
+                        help='If you use meanflow, CFG is integrated in model training. So simply set this <1 to avoid an additional unconditional infer.')
+    parser.add_argument('--num_steps', type=int, default=25)
+    parser.add_argument('--output', type=Path, help='Output directory', default='./output')
+    parser.add_argument('--seed', type=int, help='Random seed', default=42)
+    parser.add_argument('--full_precision', action='store_true')
+    parser.add_argument('--model_path', type=str, help='Ckpt path of trained model')
+    parser.add_argument('--encoder_name', choices=['clip', 't5', 't5_clap'], type=str, help='text encoder name')
+    parser.add_argument('--use_rope', action='store_true', help='Whether or not use position embedding for model')
+    parser.add_argument('--text_c_dim', type=int, default=512,
+                        help='Dim of the text_features_c, 1024 for pooled T5 and 512 for CLAP')
+    parser.add_argument('--debug', action='store_true')
+    parser.add_argument('--use_meanflow', action='store_true', help='Whether or not use mean flow for inference')
+    args = parser.parse_args()
+    if args.debug:
+        import debugpy
+        debugpy.listen(6665)
+        print("Waiting for debugger attach (rank 0)...")
+        debugpy.wait_for_client()
+    if args.variant not in all_model_cfg:
+        raise ValueError(f'Unknown model variant: {args.variant}')
+    model: ModelConfig = all_model_cfg[args.variant]  # model is just the model config
+    # model.download_if_needed()
+    seq_cfg = model.seq_cfg
+    negative_prompt: str = ''
+    output_dir: str = args.output.expanduser()
+    seed: int = args.seed
+    num_steps: int = args.num_steps
+    duration: float = args.duration
+    cfg_strength: float = args.cfg_strength
+    device = 'cpu'
+    if torch.cuda.is_available():
+        device = 'cuda'
+    elif torch.backends.mps.is_available():
+        device = 'mps'
+    else:
+        log.warning('CUDA/MPS are not available, running on CPU')
+    dtype = torch.float32 if args.full_precision else torch.bfloat16
+    output_dir.mkdir(parents=True, exist_ok=True)
+    print(model.model_name)
+    # load a pretrained model
+    net: MeanAudio = get_mean_audio(model.model_name,
+                                    use_rope=args.use_rope,
+                                    text_c_dim=args.text_c_dim).to(device, dtype).eval()
+    net.load_weights(torch.load(args.model_path, map_location=device, weights_only=True))
+    log.info(f'Loaded weights from {args.model_path}')
+    # misc setup
+    rng = torch.Generator(device=device)
+    rng.manual_seed(seed)
+    if args.use_meanflow:
+        mf = MeanFlow(steps=num_steps)
+    else:
+        fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
+    feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
+                                    enable_conditions=True,
+                                    encoder_name=args.encoder_name,
+                                    mode=model.mode,
+                                    bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
+                                    need_vae_encoder=False)
+    feature_utils = feature_utils.to(device, dtype).eval()
+    seq_cfg.duration = duration
+    net.update_seq_lengths(seq_cfg.latent_seq_len)
+    eval_file =  './sets/test-audiocaps.tsv'
+    audio_ids=[]
+    text_prompts=[]
+    with open(eval_file, 'r') as f:
+            reader = csv.DictReader(f, delimiter='\t')
+            for row in reader:
+                audio_ids.append(row['id'])
+                text_prompts.append(row['caption'])
+    for k in tqdm(range(0, len(text_prompts))):
+        prompt = text_prompts[k]
+        if args.use_meanflow:
+            log.info(f'Prompt: {prompt}')
+            log.info(f'Negative prompt: {negative_prompt}')
+            audios = generate_mf([prompt],
+                                negative_text=[negative_prompt],
+                                feature_utils=feature_utils,
+                                net=net,
+                                mf=mf,
+                                rng=rng,
+                                cfg_strength=cfg_strength)
+            audio = audios.float().cpu()[0]
+            save_paths = output_dir / f'{audio_ids[k]}.wav'
+            torchaudio.save(save_paths, audio, seq_cfg.sampling_rate)
+            log.info(f'Audio saved to {save_paths}')
+            log.info('Memory usage: %.2f GB', torch.cuda.max_memory_allocated() / (2**30))
+        else:
+            prompt = text_prompts[k]
+            log.info(f'Prompt: {prompt}')
+            log.info(f'Negative prompt: {negative_prompt}')
+            audios = generate_fm([prompt],
+                                negative_text=[negative_prompt],
+                                feature_utils=feature_utils,
+                                net=net,
+                                fm=fm,
+                                rng=rng,
+                                cfg_strength=cfg_strength)
+            audio = audios.float().cpu()[0]
+            save_paths = output_dir / f'{audio_ids[k]}.wav'
+            torchaudio.save(save_paths, audio, seq_cfg.sampling_rate)
+            log.info(f'Audio saved to {save_paths}')
+            log.info('Memory usage: %.2f GB', torch.cuda.max_memory_allocated() / (2**30))
+if __name__ == '__main__':
+    main()

infer.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import warnings
+warnings.filterwarnings("ignore", category=FutureWarning)
+import logging
+from argparse import ArgumentParser
+from pathlib import Path
+import torch
+import torchaudio
+from meanaudio.eval_utils import (ModelConfig, all_model_cfg, generate_mf, generate_fm, setup_eval_logging)
+from meanaudio.model.flow_matching import FlowMatching
+from meanaudio.model.mean_flow import MeanFlow
+from meanaudio.model.networks import MeanAudio, get_mean_audio
+from meanaudio.model.utils.features_utils import FeaturesUtils
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+from tqdm import tqdm
+log = logging.getLogger()
+@torch.inference_mode()
+def main():
+    setup_eval_logging()
+    parser = ArgumentParser()
+    parser.add_argument('--variant',
+                        type=str,
+                        default='small_16k_mf',
+                        help='small_16k_mf, small_16k_fm')
+    parser.add_argument('--prompt', type=str, help='Input prompt', default='')
+    parser.add_argument('--negative_prompt', type=str, help='Negative prompt', default='')
+    parser.add_argument('--duration', type=float, default=9.975)  # for 312 latents, seq_config should has a duration of 9.975s
+    parser.add_argument('--cfg_strength', type=float, default=4.5)
+    parser.add_argument('--num_steps', type=int, default=25)
+    parser.add_argument('--output', type=Path, help='Output directory', default='./output')
+    parser.add_argument('--seed', type=int, help='Random seed', default=42)
+    parser.add_argument('--full_precision', action='store_true')
+    parser.add_argument('--model_path', type=str, help='Ckpt path of trained model')
+    parser.add_argument('--encoder_name', choices=['clip', 't5', 't5_clap'], type=str, help='text encoder name')
+    parser.add_argument('--use_rope', action='store_true', help='Whether or not use position embedding for model')
+    parser.add_argument('--text_c_dim', type=int, default=512,
+                        help='Dim of the text_features_c, 1024 for pooled T5 and 512 for CLAP')
+    parser.add_argument('--debug', action='store_true')
+    parser.add_argument('--use_meanflow', action='store_true', help='Whether or not use mean flow for inference')
+    args = parser.parse_args()
+    if args.debug:
+        import debugpy
+        debugpy.listen(6666)
+        print("Waiting for debugger attach (rank 0)...")
+        debugpy.wait_for_client()
+    if args.variant not in all_model_cfg:
+        raise ValueError(f'Unknown model variant: {args.variant}')
+    model: ModelConfig = all_model_cfg[args.variant]  # model is just the model config
+    seq_cfg = model.seq_cfg
+    negative_prompt: str = args.negative_prompt
+    output_dir: str = args.output.expanduser()
+    seed: int = args.seed
+    num_steps: int = args.num_steps
+    duration: float = args.duration
+    cfg_strength: float = args.cfg_strength
+    device = 'cpu'
+    if torch.cuda.is_available():
+        device = 'cuda'
+    elif torch.backends.mps.is_available():
+        device = 'mps'
+    else:
+        log.warning('CUDA/MPS are not available, running on CPU')
+    dtype = torch.float32 if args.full_precision else torch.bfloat16
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # load a pretrained model
+    net: MeanAudio = get_mean_audio(model.model_name,
+                                    use_rope=args.use_rope,
+                                    text_c_dim=args.text_c_dim).to(device, dtype).eval()
+    net.load_weights(torch.load(args.model_path, map_location=device, weights_only=True))
+    log.info(f'Loaded weights from {args.model_path}')
+    # misc setup
+    rng = torch.Generator(device=device)
+    rng.manual_seed(seed)
+    if args.use_meanflow:
+        mf = MeanFlow(steps=num_steps)
+    else:
+        fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
+    feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
+                                  enable_conditions=True,
+                                  encoder_name=args.encoder_name,
+                                  mode=model.mode,
+                                  bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
+                                  need_vae_encoder=False)
+    feature_utils = feature_utils.to(device, dtype).eval()
+    seq_cfg.duration = duration
+    net.update_seq_lengths(seq_cfg.latent_seq_len)
+    prompts: str = [args.prompt]
+    if args.use_meanflow:
+        for prompt in tqdm(prompts):
+            log.info(f'Prompt: {prompt}')
+            log.info(f'Negative prompt: {negative_prompt}')
+            audios = generate_mf([prompt],
+                                  negative_text=[negative_prompt],
+                                  feature_utils=feature_utils,
+                                  net=net,
+                                  mf=mf,
+                                  rng=rng,
+                                  cfg_strength=cfg_strength)
+            audio = audios.float().cpu()[0]
+            safe_filename = prompt.replace(' ', '_').replace('/', '_').replace('.', '')
+            save_path = output_dir / f'{safe_filename}--numsteps{num_steps}--seed{args.seed}.wav'
+            torchaudio.save( save_path, audio, seq_cfg.sampling_rate)
+            log.info(f'Audio saved to {save_path}')
+        log.info('Memory usage: %.2f GB', torch.cuda.max_memory_allocated() / (2**30))
+    else:
+        for prompt in tqdm(prompts):
+            log.info(f'Prompt: {prompt}')
+            log.info(f'Negative prompt: {negative_prompt}')
+            audios = generate_fm([prompt],
+                                  negative_text=[negative_prompt],
+                                  feature_utils=feature_utils,
+                                  net=net,
+                                  fm=fm,
+                                  rng=rng,
+                                  cfg_strength=cfg_strength)
+            audio = audios.float().cpu()[0]
+            safe_filename = prompt.replace(' ', '_').replace('/', '_').replace('.', '')
+            save_path = output_dir / f'{safe_filename}--numsteps{num_steps}--seed{args.seed}.wav'
+            torchaudio.save(save_path, audio, seq_cfg.sampling_rate)
+            log.info(f'Audio saved to {save_path}')
+        log.info('Memory usage: %.2f GB', torch.cuda.max_memory_allocated() / (2**30))
+if __name__ == '__main__':
+    main()

meanaudio/__init__.py ADDED Viewed

File without changes

meanaudio/data/__init__.py ADDED Viewed

File without changes

meanaudio/data/av_utils.py ADDED Viewed

	@@ -0,0 +1,162 @@

+from dataclasses import dataclass
+from fractions import Fraction
+from pathlib import Path
+from typing import Optional
+import av
+import numpy as np
+import torch
+from av import AudioFrame
+@dataclass
+class VideoInfo:
+    """Container for the decoded frames and timing metadata of one video clip."""
+    # clip length in seconds
+    duration_sec: float
+    # frame rate, kept as an exact fraction
+    fps: Fraction
+    # presumably the frames sampled for the CLIP encoder — TODO confirm layout
+    clip_frames: torch.Tensor
+    # presumably the frames sampled for the sync encoder — TODO confirm layout
+    sync_frames: torch.Tensor
+    # every decoded frame as an array, or None when they were not kept
+    all_frames: Optional[list[np.ndarray]]
+    @property
+    def height(self):
+        """Return ``shape[0]`` of the first entry of ``all_frames`` (requires ``all_frames`` to be a non-empty list)."""
+        return self.all_frames[0].shape[0]
+    @property
+    def width(self):
+        """Return ``shape[1]`` of the first entry of ``all_frames`` (requires ``all_frames`` to be a non-empty list)."""
+        return self.all_frames[0].shape[1]
+    @classmethod
+    def from_image_info(cls, image_info: 'ImageInfo', duration_sec: float,
+                        fps: Fraction) -> 'VideoInfo':
+        """Build a VideoInfo that repeats one still image for ``duration_sec`` seconds at ``fps``."""
+        # truncate to a whole number of frames
+        num_frames = int(duration_sec * fps)
+        # the list holds num_frames references to the same array, not copies
+        all_frames = [image_info.original_frame] * num_frames
+        return cls(duration_sec=duration_sec,
+                   fps=fps,
+                   clip_frames=image_info.clip_frames,
+                   sync_frames=image_info.sync_frames,
+                   all_frames=all_frames)
+@dataclass
+class ImageInfo:
+    """Container for a single image and the tensors derived from it."""
+    # presumably the image prepared for the CLIP encoder — TODO confirm layout
+    clip_frames: torch.Tensor
+    # presumably the image prepared for the sync encoder — TODO confirm layout
+    sync_frames: torch.Tensor
+    # the source image as an array, or None when it was not kept
+    original_frame: Optional[np.ndarray]
+    @property
+    def height(self):
+        """Return ``shape[0]`` of ``original_frame`` (requires ``original_frame`` to be set)."""
+        return self.original_frame.shape[0]
+    @property
+    def width(self):
+        """Return ``shape[1]`` of ``original_frame`` (requires ``original_frame`` to be set)."""
+        return self.original_frame.shape[1]
+def read_frames(video_path: Path, list_of_fps: list[float], start_sec: float, end_sec: float,
+                need_all_frames: bool) -> tuple[list[np.ndarray], list[np.ndarray], Fraction]:
+    output_frames = [[] for _ in list_of_fps]
+    next_frame_time_for_each_fps = [0.0 for _ in list_of_fps]
+    time_delta_for_each_fps = [1 / fps for fps in list_of_fps]
+    all_frames = []
+    # container = av.open(video_path)
+    with av.open(video_path) as container:
+        stream = container.streams.video[0]
+        fps = stream.guessed_rate
+        stream.thread_type = 'AUTO'
+        for packet in container.demux(stream):
+            for frame in packet.decode():
+                frame_time = frame.time
+                if frame_time < start_sec:
+                    continue
+                if frame_time > end_sec:
+                    break
+                frame_np = None
+                if need_all_frames:
+                    frame_np = frame.to_ndarray(format='rgb24')
+                    all_frames.append(frame_np)
+                for i, _ in enumerate(list_of_fps):
+                    this_time = frame_time
+                    while this_time >= next_frame_time_for_each_fps[i]:
+                        if frame_np is None:
+                            frame_np = frame.to_ndarray(format='rgb24')
+                        output_frames[i].append(frame_np)
+                        next_frame_time_for_each_fps[i] += time_delta_for_each_fps[i]
+    output_frames = [np.stack(frames) for frames in output_frames]
+    return output_frames, all_frames, fps
+def reencode_with_audio(video_info: VideoInfo, output_path: Path, audio: torch.Tensor,
+                        sampling_rate: int):
+    container = av.open(output_path, 'w')
+    output_video_stream = container.add_stream('h264', video_info.fps)
+    output_video_stream.codec_context.bit_rate = 10 * 1e6  # 10 Mbps
+    output_video_stream.width = video_info.width
+    output_video_stream.height = video_info.height
+    output_video_stream.pix_fmt = 'yuv420p'
+    output_audio_stream = container.add_stream('aac', sampling_rate)
+    # encode video
+    for image in video_info.all_frames:
+        image = av.VideoFrame.from_ndarray(image)
+        packet = output_video_stream.encode(image)
+        container.mux(packet)
+    for packet in output_video_stream.encode():
+        container.mux(packet)
+    # convert float tensor audio to numpy array
+    audio_np = audio.numpy().astype(np.float32)
+    audio_frame = AudioFrame.from_ndarray(audio_np, format='flt', layout='mono')
+    audio_frame.sample_rate = sampling_rate
+    for packet in output_audio_stream.encode(audio_frame):
+        container.mux(packet)
+    for packet in output_audio_stream.encode():
+        container.mux(packet)
+    container.close()
+def remux_with_audio(video_path: Path, audio: torch.Tensor, output_path: Path, sampling_rate: int):
+    """
+    NOTE: I don't think we can get the exact video duration right without re-encoding
+    so we are not using this but keeping it here for reference
+    """
+    video = av.open(video_path)
+    output = av.open(output_path, 'w')
+    input_video_stream = video.streams.video[0]
+    output_video_stream = output.add_stream(template=input_video_stream)
+    output_audio_stream = output.add_stream('aac', sampling_rate)
+    duration_sec = audio.shape[-1] / sampling_rate
+    for packet in video.demux(input_video_stream):
+        # We need to skip the "flushing" packets that `demux` generates.
+        if packet.dts is None:
+            continue
+        # We need to assign the packet to the new stream.
+        packet.stream = output_video_stream
+        output.mux(packet)
+    # convert float tensor audio to numpy array
+    audio_np = audio.numpy().astype(np.float32)
+    audio_frame = av.AudioFrame.from_ndarray(audio_np, format='flt', layout='mono')
+    audio_frame.sample_rate = sampling_rate
+    for packet in output_audio_stream.encode(audio_frame):
+        output.mux(packet)
+    for packet in output_audio_stream.encode():
+        output.mux(packet)
+    video.close()
+    output.close()
+    output.close()

meanaudio/data/data_setup.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import logging
+import random
+import numpy as np
+import torch
+from omegaconf import DictConfig
+from torch.utils.data import DataLoader, Dataset
+from torch.utils.data.dataloader import default_collate
+from torch.utils.data.distributed import DistributedSampler
+from meanaudio.data.extracted_audio import ExtractedAudio
+from meanaudio.data.mm_dataset import MultiModalDataset
+from meanaudio.utils.dist_utils import local_rank
+log = logging.getLogger()
+# Re-seed randomness every time we start a worker
+def worker_init_fn(worker_id: int):
+    worker_seed = torch.initial_seed() % (2**31) + worker_id + local_rank * 1000
+    np.random.seed(worker_seed)
+    random.seed(worker_seed)
+    log.debug(f'Worker {worker_id} re-seeded with seed {worker_seed} in rank {local_rank}')
+def load_audio_data(cfg: DictConfig, data_cfg: DictConfig) -> Dataset:
+    dataset = ExtractedAudio(tsv_path=data_cfg.tsv,
+                            concat_text_fc=cfg.concat_text_fc,   # FIX here we determine usage of concat based on global config
+                            data_dim=cfg.data_dim,
+                            npz_dir=data_cfg.npz_dir,
+                            repa_npz_dir=data_cfg.repa_npz_dir,
+                            exclude_cls=cfg.get('exclude_cls', False),
+                            repa_version=cfg.get('repa_version', 1))
+    return dataset
+def setup_training_datasets(cfg: DictConfig) -> tuple[Dataset, DistributedSampler, DataLoader]:
+    if cfg.mini_train:
+        audiocaps_mini = load_audio_data(cfg, cfg.data.AudioCaps_val_npz)  # use val set as the miniset
+        dataset = MultiModalDataset([],
+                                    [audiocaps_mini])
+    else:
+        audiocaps_npz = load_audio_data(cfg, cfg.data.AudioCaps_npz)
+        # !TODO: think of a better way to handle different datasets
+        # freesound1_npz = load_audio_data_npz(cfg, cfg.data.FreeSound1_npz)
+        # freesound2_npz = load_audio_data_npz(cfg, cfg.data.FreeSound2_npz)
+        # freesound3_npz = load_audio_data_npz(cfg, cfg.data.FreeSound3_npz)
+        # audioset_sl_npz = load_audio_data_npz(cfg, cfg.data.AudioSetSL_npz)
+        # bbcsound_npz = load_audio_data_npz(cfg, cfg.data.BBCSound_npz)
+        # clotho_npz = load_audio_data_npz(cfg, cfg.data.Clotho_npz)
+        dataset = MultiModalDataset([], [audiocaps_npz])
+        # dataset = MultiModalDataset([], [audiocaps_npz]*cfg.ac_oversample_rate + [audioset_sl_npz, bbcsound_npz, clotho_npz,
+        #                                                                         freesound1_npz, freesound2_npz, freesound3_npz])
+    batch_size = cfg.batch_size  # per-gpu batch size
+    num_workers = cfg.num_workers
+    pin_memory = cfg.pin_memory
+    sampler, loader = construct_loader(dataset,
+                                       batch_size,
+                                       num_workers,
+                                       shuffle=True,
+                                       drop_last=True,
+                                       pin_memory=pin_memory)
+    return dataset, sampler, loader
+def setup_test_datasets(cfg):  # used in sample
+    dataset = load_audio_data(cfg, cfg.data.AudioCaps_test_npz)  # ALL with NPZ format
+    batch_size = cfg.eval_batch_size  # FIX: from train config
+    num_workers = cfg.num_workers
+    pin_memory = cfg.pin_memory
+    sampler, loader = construct_loader(dataset,
+                                       batch_size,
+                                       num_workers,
+                                       shuffle=False,
+                                       drop_last=False,
+                                       pin_memory=pin_memory)
+    return dataset, sampler, loader
+def setup_val_datasets(cfg: DictConfig) -> tuple[Dataset, DataLoader, DataLoader]:
+    dataset = load_audio_data(cfg, cfg.data.AudioCaps_val_npz)
+    val_batch_size = cfg.batch_size
+    val_eval_batch_size = cfg.eval_batch_size
+    num_workers = cfg.num_workers
+    pin_memory = cfg.pin_memory
+    _, val_loader = construct_loader(dataset,
+                                     val_batch_size,
+                                     num_workers,
+                                     shuffle=False,
+                                     drop_last=False,
+                                     pin_memory=pin_memory)
+    _, eval_loader = construct_loader(dataset,
+                                      val_eval_batch_size,
+                                      num_workers,
+                                      shuffle=False,
+                                      drop_last=False,
+                                      pin_memory=pin_memory)
+    return dataset, val_loader, eval_loader
+def error_avoidance_collate(batch):
+    batch = list(filter(lambda x: x is not None, batch))   # batch = [x for x in batch if x is not None]
+    return default_collate(batch)
+def construct_loader(dataset: Dataset,
+                     batch_size: int,
+                     num_workers: int,
+                     *,
+                     shuffle: bool = True,
+                     drop_last: bool = True,
+                     pin_memory: bool = False,
+                     error_avoidance: bool = False) -> tuple[DistributedSampler, DataLoader]:
+    train_sampler = DistributedSampler(dataset, rank=local_rank, shuffle=shuffle)
+    train_loader = DataLoader(dataset,
+                              batch_size,
+                              sampler=train_sampler,
+                              num_workers=num_workers,
+                              worker_init_fn=worker_init_fn,
+                              drop_last=drop_last,
+                              persistent_workers=num_workers > 0,
+                              pin_memory=pin_memory,
+                              collate_fn=error_avoidance_collate if error_avoidance else None)
+    return train_sampler, train_loader

meanaudio/data/eval/__init__.py ADDED Viewed

File without changes

meanaudio/data/eval/audiocaps.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import logging
+import os
+from collections import defaultdict
+from pathlib import Path
+from typing import Union
+import pandas as pd
+import torch
+from torch.utils.data.dataset import Dataset
+log = logging.getLogger()
+class AudioCapsData(Dataset):
+    def __init__(self, audio_path: Union[str, Path], csv_path: Union[str, Path]):
+        df = pd.read_csv(csv_path).to_dict(orient='records')
+        audio_files = sorted(os.listdir(audio_path))
+        audio_files = set(
+            [Path(f).stem for f in audio_files if f.endswith('.wav') or f.endswith('.flac')])
+        self.data = []
+        for row in df:
+            self.data.append({
+                'name': row['name'],
+                'caption': row['caption'],
+            })
+        self.audio_path = Path(audio_path)
+        self.csv_path = Path(csv_path)
+        log.info(f'Found {len(self.data)} matching audio files in {self.audio_path}')
+    def __getitem__(self, idx: int) -> torch.Tensor:
+        return self.data[idx]
+    def __len__(self):
+        return len(self.data)

meanaudio/data/eval/moviegen.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import json
+import logging
+import os
+from pathlib import Path
+from typing import Union
+import torch
+from torch.utils.data.dataset import Dataset
+from torchvision.transforms import v2
+from torio.io import StreamingMediaDecoder
+from mmaudio.utils.dist_utils import local_rank
+log = logging.getLogger()
+_CLIP_SIZE = 384
+_CLIP_FPS = 8.0
+_SYNC_SIZE = 224
+_SYNC_FPS = 25.0
+class MovieGenData(Dataset):
+    def __init__(
+        self,
+        video_root: Union[str, Path],
+        sync_root: Union[str, Path],
+        jsonl_root: Union[str, Path],
+        *,
+        duration_sec: float = 10.0,
+        read_clip: bool = True,
+    ):
+        self.video_root = Path(video_root)
+        self.sync_root = Path(sync_root)
+        self.jsonl_root = Path(jsonl_root)
+        self.read_clip = read_clip
+        videos = sorted(os.listdir(self.video_root))
+        videos = [v[:-4] for v in videos]  # remove extensions
+        self.captions = {}
+        for v in videos:
+            with open(self.jsonl_root / (v + '.jsonl')) as f:
+                data = json.load(f)
+                self.captions[v] = data['audio_prompt']
+        if local_rank == 0:
+            log.info(f'{len(videos)} videos found in {video_root}')
+        self.duration_sec = duration_sec
+        self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
+        self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
+        self.clip_augment = v2.Compose([
+            v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
+            v2.ToImage(),
+            v2.ToDtype(torch.float32, scale=True),
+        ])
+        self.sync_augment = v2.Compose([
+            v2.Resize((_SYNC_SIZE, _SYNC_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
+            v2.CenterCrop(_SYNC_SIZE),
+            v2.ToImage(),
+            v2.ToDtype(torch.float32, scale=True),
+            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+        ])
+        self.videos = videos
+    def sample(self, idx: int) -> dict[str, torch.Tensor]:
+        video_id = self.videos[idx]
+        caption = self.captions[video_id]
+        reader = StreamingMediaDecoder(self.video_root / (video_id + '.mp4'))
+        reader.add_basic_video_stream(
+            frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
+            frame_rate=_CLIP_FPS,
+            format='rgb24',
+        )
+        reader.add_basic_video_stream(
+            frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
+            frame_rate=_SYNC_FPS,
+            format='rgb24',
+        )
+        reader.fill_buffer()
+        data_chunk = reader.pop_chunks()
+        clip_chunk = data_chunk[0]
+        sync_chunk = data_chunk[1]
+        if clip_chunk is None:
+            raise RuntimeError(f'CLIP video returned None {video_id}')
+        if clip_chunk.shape[0] < self.clip_expected_length:
+            raise RuntimeError(f'CLIP video too short {video_id}')
+        if sync_chunk is None:
+            raise RuntimeError(f'Sync video returned None {video_id}')
+        if sync_chunk.shape[0] < self.sync_expected_length:
+            raise RuntimeError(f'Sync video too short {video_id}')
+        # truncate the video
+        clip_chunk = clip_chunk[:self.clip_expected_length]
+        if clip_chunk.shape[0] != self.clip_expected_length:
+            raise RuntimeError(f'CLIP video wrong length {video_id}, '
+                               f'expected {self.clip_expected_length}, '
+                               f'got {clip_chunk.shape[0]}')
+        clip_chunk = self.clip_augment(clip_chunk)
+        sync_chunk = sync_chunk[:self.sync_expected_length]
+        if sync_chunk.shape[0] != self.sync_expected_length:
+            raise RuntimeError(f'Sync video wrong length {video_id}, '
+                               f'expected {self.sync_expected_length}, '
+                               f'got {sync_chunk.shape[0]}')
+        sync_chunk = self.sync_augment(sync_chunk)
+        data = {
+            'name': video_id,
+            'caption': caption,
+            'clip_video': clip_chunk,
+            'sync_video': sync_chunk,
+        }
+        return data
+    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
+        return self.sample(idx)
+    def __len__(self):
+        return len(self.captions)

meanaudio/data/eval/video_dataset.py ADDED Viewed

	@@ -0,0 +1,197 @@

+import json
+import logging
+import os
+from pathlib import Path
+from typing import Union
+import pandas as pd
+import torch
+from torch.utils.data.dataset import Dataset
+from torchvision.transforms import v2
+from torio.io import StreamingMediaDecoder
+from mmaudio.utils.dist_utils import local_rank
+log = logging.getLogger()
+_CLIP_SIZE = 384
+_CLIP_FPS = 8.0
+_SYNC_SIZE = 224
+_SYNC_FPS = 25.0
+class VideoDataset(Dataset):
+    def __init__(
+        self,
+        video_root: Union[str, Path],
+        *,
+        duration_sec: float = 8.0,
+    ):
+        self.video_root = Path(video_root)
+        self.duration_sec = duration_sec
+        self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
+        self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
+        self.clip_transform = v2.Compose([
+            v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
+            v2.ToImage(),
+            v2.ToDtype(torch.float32, scale=True),
+        ])
+        self.sync_transform = v2.Compose([
+            v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
+            v2.CenterCrop(_SYNC_SIZE),
+            v2.ToImage(),
+            v2.ToDtype(torch.float32, scale=True),
+            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+        ])
+        # to be implemented by subclasses
+        self.captions = {}
+        self.videos = sorted(list(self.captions.keys()))
+    def sample(self, idx: int) -> dict[str, torch.Tensor]:
+        video_id = self.videos[idx]
+        caption = self.captions[video_id]
+        reader = StreamingMediaDecoder(self.video_root / (video_id + '.mp4'))
+        reader.add_basic_video_stream(
+            frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
+            frame_rate=_CLIP_FPS,
+            format='rgb24',
+        )
+        reader.add_basic_video_stream(
+            frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
+            frame_rate=_SYNC_FPS,
+            format='rgb24',
+        )
+        reader.fill_buffer()
+        data_chunk = reader.pop_chunks()
+        clip_chunk = data_chunk[0]
+        sync_chunk = data_chunk[1]
+        if clip_chunk is None:
+            raise RuntimeError(f'CLIP video returned None {video_id}')
+        if clip_chunk.shape[0] < self.clip_expected_length:
+            raise RuntimeError(
+                f'CLIP video too short {video_id}, expected {self.clip_expected_length}, got {clip_chunk.shape[0]}'
+            )
+        if sync_chunk is None:
+            raise RuntimeError(f'Sync video returned None {video_id}')
+        if sync_chunk.shape[0] < self.sync_expected_length:
+            raise RuntimeError(
+                f'Sync video too short {video_id}, expected {self.sync_expected_length}, got {sync_chunk.shape[0]}'
+            )
+        # truncate the video
+        clip_chunk = clip_chunk[:self.clip_expected_length]
+        if clip_chunk.shape[0] != self.clip_expected_length:
+            raise RuntimeError(f'CLIP video wrong length {video_id}, '
+                               f'expected {self.clip_expected_length}, '
+                               f'got {clip_chunk.shape[0]}')
+        clip_chunk = self.clip_transform(clip_chunk)
+        sync_chunk = sync_chunk[:self.sync_expected_length]
+        if sync_chunk.shape[0] != self.sync_expected_length:
+            raise RuntimeError(f'Sync video wrong length {video_id}, '
+                               f'expected {self.sync_expected_length}, '
+                               f'got {sync_chunk.shape[0]}')
+        sync_chunk = self.sync_transform(sync_chunk)
+        data = {
+            'name': video_id,
+            'caption': caption,
+            'clip_video': clip_chunk,
+            'sync_video': sync_chunk,
+        }
+        return data
+    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
+        try:
+            return self.sample(idx)
+        except Exception as e:
+            log.error(f'Error loading video {self.videos[idx]}: {e}')
+            return None
+    def __len__(self):
+        return len(self.captions)
+class VGGSound(VideoDataset):
+    def __init__(
+        self,
+        video_root: Union[str, Path],
+        csv_path: Union[str, Path],
+        *,
+        duration_sec: float = 8.0,
+    ):
+        super().__init__(video_root, duration_sec=duration_sec)
+        self.video_root = Path(video_root)
+        self.csv_path = Path(csv_path)
+        videos = sorted(os.listdir(self.video_root))
+        if local_rank == 0:
+            log.info(f'{len(videos)} videos found in {video_root}')
+        self.captions = {}
+        df = pd.read_csv(csv_path, header=None, names=['id', 'sec', 'caption',
+                                                       'split']).to_dict(orient='records')
+        videos_no_found = []
+        for row in df:
+            if row['split'] == 'test':
+                start_sec = int(row['sec'])
+                video_id = str(row['id'])
+                # this is how our videos are named
+                video_name = f'{video_id}_{start_sec:06d}'
+                if video_name + '.mp4' not in videos:
+                    videos_no_found.append(video_name)
+                    continue
+                self.captions[video_name] = row['caption']
+        if local_rank == 0:
+            log.info(f'{len(videos)} videos found in {video_root}')
+            log.info(f'{len(self.captions)} useable videos found')
+            if videos_no_found:
+                log.info(f'{len(videos_no_found)} found in {csv_path} but not in {video_root}')
+                log.info(
+                    'A small amount is expected, as not all videos are still available on YouTube')
+        self.videos = sorted(list(self.captions.keys()))
+class MovieGen(VideoDataset):
+    def __init__(
+        self,
+        video_root: Union[str, Path],
+        jsonl_root: Union[str, Path],
+        *,
+        duration_sec: float = 10.0,
+    ):
+        super().__init__(video_root, duration_sec=duration_sec)
+        self.video_root = Path(video_root)
+        self.jsonl_root = Path(jsonl_root)
+        videos = sorted(os.listdir(self.video_root))
+        videos = [v[:-4] for v in videos]  # remove extensions
+        self.captions = {}
+        for v in videos:
+            with open(self.jsonl_root / (v + '.jsonl')) as f:
+                data = json.load(f)
+                self.captions[v] = data['audio_prompt']
+        if local_rank == 0:
+            log.info(f'{len(videos)} videos found in {video_root}')
+        self.videos = videos

meanaudio/data/extracted_audio.py ADDED Viewed

	@@ -0,0 +1,175 @@

+import logging
+from pathlib import Path
+from typing import Union, Optional
+import pandas as pd
+import torch
+from tensordict import TensorDict
+from torch.utils.data.dataset import Dataset
+from torch.utils.data import DataLoader
+from meanaudio.utils.dist_utils import local_rank
+import numpy as np
+import glob
+import torch.nn.functional as F
+log = logging.getLogger()
+class ExtractedAudio(Dataset):
+    def __init__(
+        self,
+        tsv_path: Union[str, Path],
+        *,
+        concat_text_fc: bool,
+        npz_dir: Union[str, Path],
+        data_dim: dict[str, int],
+        repa_npz_dir: Optional[Union[str, Path]],   # if passed, repa features (zs) would be returned
+        exclude_cls: Optional[bool],
+        repa_version: Optional[int],
+    ):
+        super().__init__()
+        self.data_dim = data_dim
+        self.df_list = pd.read_csv(tsv_path, sep='\t').to_dict('records') # id, caption
+        self.ids = [str(d['id']) for d in self.df_list]
+        npz_files = glob.glob(f"{npz_dir}/*.npz")
+        self.concat_text_fc = concat_text_fc
+        self.exclude_cls = exclude_cls
+        self.repa_version = repa_version
+        if self.concat_text_fc:
+            log.info(f'We will concat the pooled text_features and text_features_c for text condition')
+        # dimension check
+        sample = np.load(f'{npz_dir}/0.npz')
+        mean_s = [len(npz_files)] + list(sample['mean'].shape)
+        std_s = [len(npz_files)] + list(sample['std'].shape)
+        text_features_s = [len(npz_files)] + list(sample['text_features'].shape)
+        text_features_c_s = [len(npz_files)] + list(sample['text_features_c'].shape)
+        if self.concat_text_fc:
+            text_features_c_s[-1] = text_features_c_s[-1] + text_features_s[-1]
+        log.info(f'Loading {len(npz_files)} npz files from {npz_dir}')
+        log.info(f'Loaded mean: {mean_s}.')
+        log.info(f'Loaded std: {std_s}.')
+        log.info(f'Loaded text features: {text_features_s}.')
+        log.info(f'Loaded text features_c: {text_features_c_s}.')
+        assert len(npz_files) == len(self.df_list), 'Number mismatch between npz files and tsv items'
+        assert mean_s[1] == self.data_dim['latent_seq_len'], \
+            f'{mean_s[1]} != {self.data_dim["latent_seq_len"]}'
+        assert std_s[1] == self.data_dim['latent_seq_len'], \
+            f'{std_s[1]} != {self.data_dim["latent_seq_len"]}'
+        assert text_features_s[1] == self.data_dim['text_seq_len'], \
+            f'{text_features_s[1]} != {self.data_dim["text_seq_len"]}'
+        assert text_features_s[-1] == self.data_dim['text_dim'], \
+            f'{text_features_s[-1]} != {self.data_dim["text_dim"]}'
+        assert text_features_c_s[-1] == self.data_dim['text_c_dim'], \
+            f'{text_features_c_s[-1]} != {self.data_dim["text_c_dim"]}'
+        self.npz_dir = npz_dir
+        if repa_npz_dir != None:
+            self.repa_npz_dir = repa_npz_dir
+            sample = np.load(f'{repa_npz_dir}/0.npz')
+            repa_npz_files = glob.glob(f"{repa_npz_dir}/*.npz")
+            log.info(f'Loading {len(repa_npz_files)} npz representations from {repa_npz_dir}')
+            es_s = [len(repa_npz_files)] + list(sample['es'].shape)
+            if self.repa_version == 2:
+                es_s[1] = 65  # ad-hoc 8x downsampling for EAT
+            elif self.repa_version == 3:
+                es_s[1] = 1   # we only use cls token for alignment
+            else:
+                if self.exclude_cls:
+                    es_s[1] = es_s[1] - 1
+            log.info(f'Loaded es: {es_s}')
+            assert len(repa_npz_files) == len(npz_files), 'Number mismatch between repa npz files and latent npz files'
+            assert es_s[1] == self.data_dim['repa_seq_len'], \
+                f'{es_s[1]} != {self.data_dim["repa_seq_len"]}'
+            assert es_s[-1] == self.data_dim['repa_seq_dim'], \
+                f'{es_s[-1]} != {self.data_dim["repa_seq_dim"]}'
+        else:
+            self.repa_npz_dir = None
+    def compute_latent_stats(self) -> tuple[torch.Tensor, torch.Tensor]:
+        # !TODO here we may consider load pre-computed latent mean & std
+        raise NotImplementedError('Please manually compute latent stats outside. ')
+    def __getitem__(self, idx):
+        npz_path = f'{self.npz_dir}/{idx}.npz'
+        np_data = np.load(npz_path)
+        text_features = torch.from_numpy(np_data['text_features'])
+        text_features_c = torch.from_numpy(np_data['text_features_c'])
+        if self.concat_text_fc:
+            text_features_c = torch.cat([text_features.mean(dim=-2),
+                                         text_features_c], dim=-1)   # [b, d+d_c]
+        out_dict = {
+            'id': str(self.df_list[idx]['id']),
+            'a_mean': torch.from_numpy(np_data['mean']),
+            'a_std': torch.from_numpy(np_data['std']),
+            'text_features': text_features,
+            'text_features_c': text_features_c,
+            'caption': self.df_list[idx]['caption'],
+        }
+        if self.repa_npz_dir != None:
+            repa_npz_path = f'{self.repa_npz_dir}/{idx}.npz'
+            repa_np_data = np.load(repa_npz_path)
+            zs =  torch.from_numpy(repa_np_data['es'])
+            if self.repa_version == 1:
+                if self.exclude_cls:
+                    zs = zs[1:,:]
+            if self.repa_version == 2:
+                z_cls = zs[0]  # (dim)
+                # zs = zs[1:,:].view(64, 8, 768)
+                zs = F.avg_pool2d(zs[1:,:].unsqueeze(0),
+                                  kernel_size=(8, 1),
+                                  stride=(8, 1)).squeeze()  # (64, 768)
+                zs = torch.cat((z_cls.unsqueeze(0), zs), dim=0)
+            elif self.repa_version == 3:  # cls token
+                zs = zs[0].unsqueeze(0)
+            out_dict['zs'] = zs  #!TODO Here field is WRONG for eat features (should be zs)
+        return out_dict
+    def __len__(self):
+        return len(self.ids)
+if __name__ == '__main__':
+    from meanaudio.utils.dist_utils import info_if_rank_zero, local_rank, world_size
+    import torch.distributed as distributed
+    from datetime import timedelta
+    from torch.utils.data.distributed import DistributedSampler
+    def distributed_setup():
+        distributed.init_process_group(backend="nccl", timeout=timedelta(hours=2))
+        log.info(f'Initialized: local_rank={local_rank}, world_size={world_size}')
+        return local_rank, world_size
+    distributed_setup()
+    tsv_path = '/hpc_stor03/sjtu_home/xiquan.li/TTA/MMAudio/training/audiocaps/train-memmap-t5-clap.tsv'
+    data_dim = {'latent_seq_len': 312,
+                'text_seq_len': 77,
+                'text_dim': 1024,
+                'text_c_dim': 512}
+    dataset = ExtractedAudio(tsv_path=tsv_path,
+                                    npz_dir=npz_dir,
+                                    data_dim=data_dim)
+    loader = DataLoader(dataset,
+                        16,
+                        num_workers=8,
+                        persistent_workers=8,
+                        pin_memory=False)
+    train_sampler = DistributedSampler(dataset, rank=local_rank, shuffle=True)
+    for b in loader:
+        print(b['a_mean'].shape)
+        break

meanaudio/data/extraction/__init__.py ADDED Viewed

File without changes

meanaudio/data/extraction/vgg_sound.py ADDED Viewed

	@@ -0,0 +1,195 @@

+import logging
+import os
+from pathlib import Path
+from typing import Optional, Union
+import pandas as pd
+import torch
+import torchaudio
+from torch.utils.data.dataset import Dataset
+from torchvision.transforms import v2
+from torio.io import StreamingMediaDecoder
+from mmaudio.utils.dist_utils import local_rank
+log = logging.getLogger()
+_CLIP_SIZE = 384
+_CLIP_FPS = 8.0
+_SYNC_SIZE = 224
+_SYNC_FPS = 25.0
+class VGGSound(Dataset):
+    def __init__(
+        self,
+        root: Union[str, Path],
+        *,
+        tsv_path: Union[str, Path] = 'sets/vgg3-train.tsv',
+        sample_rate: int = 16_000,
+        duration_sec: float = 8.0,
+        audio_samples: Optional[int] = None,
+        normalize_audio: bool = False,
+    ):
+        self.root = Path(root)
+        self.normalize_audio = normalize_audio
+        if audio_samples is None:
+            self.audio_samples = int(sample_rate * duration_sec)
+        else:
+            self.audio_samples = audio_samples
+            effective_duration = audio_samples / sample_rate
+            # make sure the duration is close enough, within 15ms
+            assert abs(effective_duration - duration_sec) < 0.015, \
+                f'audio_samples {audio_samples} does not match duration_sec {duration_sec}'
+        print("Loading videos started")
+        videos = sorted(os.listdir(self.root))
+        videos = set([Path(v).stem for v in videos])  # remove extensions
+        print("Loading videos ended")
+        self.labels = {}
+        self.videos = []
+        missing_videos = []
+        # read the tsv for subset information
+        df_list = pd.read_csv(tsv_path, sep='\t', dtype={'id': str}).to_dict('records')
+        for record in df_list:
+            id = record['id']
+            label = record['label']
+            if id in videos:
+                self.labels[id] = label
+                self.videos.append(id)
+            else:
+                missing_videos.append(id)
+        if local_rank == 0:
+            log.info(f'{len(videos)} videos found in {root}')
+            log.info(f'{len(self.videos)} videos found in {tsv_path}')
+            log.info(f'{len(missing_videos)} videos missing in {root}')
+        self.sample_rate = sample_rate
+        self.duration_sec = duration_sec
+        self.expected_audio_length = audio_samples
+        self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
+        self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
+        self.clip_transform = v2.Compose([
+            v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
+            v2.ToImage(),
+            v2.ToDtype(torch.float32, scale=True),
+        ])
+        self.sync_transform = v2.Compose([
+            v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
+            v2.CenterCrop(_SYNC_SIZE),
+            v2.ToImage(),
+            v2.ToDtype(torch.float32, scale=True),
+            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+        ])
+        self.resampler = {}
+    def sample(self, idx: int) -> dict[str, torch.Tensor]:
+        video_id = self.videos[idx]
+        label = self.labels[video_id]
+        reader = StreamingMediaDecoder(self.root / (video_id + '.mp4'))
+        reader.add_basic_video_stream(
+            frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
+            frame_rate=_CLIP_FPS,
+            format='rgb24',
+        )
+        reader.add_basic_video_stream(
+            frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
+            frame_rate=_SYNC_FPS,
+            format='rgb24',
+        )
+        reader.add_basic_audio_stream(frames_per_chunk=2**30, )
+        reader.fill_buffer()
+        data_chunk = reader.pop_chunks()
+        clip_chunk = data_chunk[0]
+        sync_chunk = data_chunk[1]
+        audio_chunk = data_chunk[2]
+        if clip_chunk is None:
+            raise RuntimeError(f'CLIP video returned None {video_id}')
+        if clip_chunk.shape[0] < self.clip_expected_length:
+            raise RuntimeError(
+                f'CLIP video too short {video_id}, expected {self.clip_expected_length}, got {clip_chunk.shape[0]}'
+            )
+        if sync_chunk is None:
+            raise RuntimeError(f'Sync video returned None {video_id}')
+        if sync_chunk.shape[0] < self.sync_expected_length:
+            raise RuntimeError(
+                f'Sync video too short {video_id}, expected {self.sync_expected_length}, got {sync_chunk.shape[0]}'
+            )
+        # process audio
+        sample_rate = int(reader.get_out_stream_info(2).sample_rate)
+        audio_chunk = audio_chunk.transpose(0, 1)
+        audio_chunk = audio_chunk.mean(dim=0)  # mono
+        if self.normalize_audio:
+            abs_max = audio_chunk.abs().max()
+            audio_chunk = audio_chunk / abs_max * 0.95
+            if abs_max <= 1e-6:
+                raise RuntimeError(f'Audio is silent {video_id}')
+        # resample
+        if sample_rate == self.sample_rate:
+            audio_chunk = audio_chunk
+        else:
+            if sample_rate not in self.resampler:
+                # https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html#kaiser-best
+                self.resampler[sample_rate] = torchaudio.transforms.Resample(
+                    sample_rate,
+                    self.sample_rate,
+                    lowpass_filter_width=64,
+                    rolloff=0.9475937167399596,
+                    resampling_method='sinc_interp_kaiser',
+                    beta=14.769656459379492,
+                )
+            audio_chunk = self.resampler[sample_rate](audio_chunk)
+        if audio_chunk.shape[0] < self.expected_audio_length:
+            raise RuntimeError(f'Audio too short {video_id}')
+        audio_chunk = audio_chunk[:self.expected_audio_length]
+        # truncate the video
+        clip_chunk = clip_chunk[:self.clip_expected_length]
+        if clip_chunk.shape[0] != self.clip_expected_length:
+            raise RuntimeError(f'CLIP video wrong length {video_id}, '
+                               f'expected {self.clip_expected_length}, '
+                               f'got {clip_chunk.shape[0]}')
+        clip_chunk = self.clip_transform(clip_chunk)
+        sync_chunk = sync_chunk[:self.sync_expected_length]
+        if sync_chunk.shape[0] != self.sync_expected_length:
+            raise RuntimeError(f'Sync video wrong length {video_id}, '
+                               f'expected {self.sync_expected_length}, '
+                               f'got {sync_chunk.shape[0]}')
+        sync_chunk = self.sync_transform(sync_chunk)
+        data = {
+            'id': video_id,
+            'caption': label,
+            'audio': audio_chunk,
+            'clip_video': clip_chunk,
+            'sync_video': sync_chunk,
+        }
+        return data
+    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
+        try:
+            return self.sample(idx)
+        except Exception as e:
+            log.error(f'Error loading video {self.videos[idx]}: {e}')
+            return None
+    def __len__(self):
+        return len(self.labels)

meanaudio/data/extraction/wav_dataset.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import logging
+import os
+from pathlib import Path
+from typing import Union
+import open_clip
+import pandas as pd
+import torch
+import torchaudio
+from torch.utils.data.dataset import Dataset
+import torch.nn.functional as F
+log = logging.getLogger()
+class WavTextClipsDataset(Dataset):
+    def __init__(
+        self,
+        root: Union[str, Path],
+        *,
+        captions_tsv: Union[str, Path],
+        clips_tsv: Union[str, Path],
+        sample_rate: int,
+        num_samples: int,
+        duration: int = 10,
+        normalize_audio: bool = False,
+        reject_silent: bool = False,
+        tokenizer_id: str = 'ViT-H-14-378-quickgelu',
+        multi_caption: bool = False
+    ):
+        self.root = Path(root)
+        self.sample_rate = sample_rate
+        self.num_samples = num_samples
+        self.normalize_audio = normalize_audio
+        self.reject_silent = reject_silent
+        self.duration = duration
+        self.tokenizer = open_clip.get_tokenizer(tokenizer_id)  # only for clip, for t5 and clap we will get caption embeddings outside
+        audios = sorted(os.listdir(self.root))
+        audios = set([
+            Path(audio).stem for audio in audios  # file name w/o extension
+            if audio.endswith('.wav') or audio.endswith('.flac')
+        ])
+        self.captions = {}
+        # read the caption tsv
+        df_list = pd.read_csv(captions_tsv, sep='\t', dtype={'id': str}).to_dict('records')
+        for record in df_list:
+            id = record['id']  # file name
+            caption = record['caption']
+            if not multi_caption:
+                self.captions[id] = caption  # captions: {name(no partition index): caption} !Only ONE caption will be selected for an audio clip
+            else:
+                if id not in self.captions.keys():
+                    self.captions[id] = [caption]
+                else:
+                    self.captions[id].append(caption)
+        # read the clip tsv
+        df_list = pd.read_csv(clips_tsv, sep='\t', dtype={
+            'id': str,
+            'name': str
+        }).to_dict('records')
+        self.clips = []
+        for record in df_list:  # partition
+            name = record['name']
+            if name not in self.captions:
+                log.warning(f'Audio {name} not found in {captions_tsv}')
+                continue
+            if not multi_caption:
+                record['caption'] = self.captions[name]
+                self.clips.append(record)  # add caption to partition csv
+            else:
+                for caption in self.captions[name]:
+                    r = record.copy()
+                    r['caption'] = caption
+                    self.clips.append(r)  # add caption to partition csv
+        log.info(f'Found {len(self.clips)} audio files in {self.root}')
+        self.resampler = {}
+    def __getitem__(self, idx: int) -> torch.Tensor:
+        try:
+            clip = self.clips[idx]
+            audio_name = clip['name']
+            audio_id = clip['id']
+            caption = clip['caption']
+            start_sample = clip['start_sample']
+            end_sample = clip['end_sample']
+            audio_path = self.root / f'{audio_name}.flac'
+            if not audio_path.exists():
+                audio_path = self.root / f'{audio_name}.wav'
+                assert audio_path.exists()
+            audio_chunk, sample_rate = torchaudio.load(audio_path)
+            audio_chunk = audio_chunk.mean(dim=0)  # mono
+            abs_max = audio_chunk.abs().max()
+            if self.normalize_audio:
+                audio_chunk = audio_chunk / abs_max * 0.95
+            if self.reject_silent and abs_max < 1e-6:
+                log.warning(f'Rejecting silent audio')
+                return None
+            if audio_chunk.size(0) < end_sample:
+                audio_chunk = F.pad(
+                    audio_chunk,
+                    (0, end_sample - audio_chunk.size(0)),
+                    mode='constant',
+                    value=0
+                )
+            else:
+                audio_chunk = audio_chunk[start_sample:end_sample]
+            # resample
+            if sample_rate == self.sample_rate:
+                audio_chunk = audio_chunk
+            else:
+                if sample_rate not in self.resampler:
+                    # https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html#kaiser-best
+                    self.resampler[sample_rate] = torchaudio.transforms.Resample(
+                        sample_rate,
+                        self.sample_rate,
+                        lowpass_filter_width=64,
+                        rolloff=0.9475937167399596,
+                        resampling_method='sinc_interp_kaiser',
+                        beta=14.769656459379492,
+                    )
+                audio_chunk = self.resampler[sample_rate](audio_chunk)
+            if audio_chunk.shape[0] < self.num_samples:
+                raise ValueError('Audio is too short')
+            audio_chunk = audio_chunk[:self.num_samples]
+            tokens = self.tokenizer([caption])[0]
+            output = {
+                'waveform': audio_chunk,
+                'id': audio_id,
+                'caption': caption,
+                'tokens': tokens,
+            }
+            return output
+        except Exception as e:
+            log.error(f'Error reading {audio_path}: {e}')
+            return None
+    def __len__(self):
+        return len(self.clips)

meanaudio/data/mm_dataset.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import bisect
+import torch
+from torch.utils.data.dataset import Dataset
+# modified from https://pytorch.org/docs/stable/_modules/torch/utils/data/dataset.html#ConcatDataset
+class MultiModalDataset(Dataset):
+    datasets: list[Dataset]
+    cumulative_sizes: list[int]
+    @staticmethod
+    def cumsum(sequence):
+        r, s = [], 0
+        for e in sequence:
+            l = len(e)
+            r.append(l + s)
+            s += l
+        return r
+    def __init__(self, video_datasets: list[Dataset], audio_datasets: list[Dataset]):
+        super().__init__()
+        self.video_datasets = list(video_datasets) if video_datasets else []
+        self.audio_datasets = list(audio_datasets) if audio_datasets else []
+        self.datasets = self.video_datasets + self.audio_datasets
+        self.cumulative_sizes = self.cumsum(self.datasets)
+    def __len__(self):
+        return self.cumulative_sizes[-1]
+    def __getitem__(self, idx):
+        if idx < 0:
+            if -idx > len(self):
+                raise ValueError("absolute value of index should not exceed dataset length")
+            idx = len(self) + idx
+        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)  # which dataset idx falls into
+        if dataset_idx == 0:
+            sample_idx = idx
+        else:
+            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
+        return self.datasets[dataset_idx][sample_idx]
+    def compute_latent_stats(self) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.video_datasets == []:
+            raise NotImplementedError(f'This function should not be called for audio-text dataset',
+                                       'Please load latents stats manually instead')
+            return self.audio_datasets[0].compute_latent_stats()  # audio-text training
+        else:
+            return self.video_datasets[0].compute_latent_stats()  # video-text training

meanaudio/data/utils.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import logging
+import os
+import random
+import tempfile
+from pathlib import Path
+from typing import Any, Optional, Union
+import torch
+import torch.distributed as dist
+from tensordict import MemoryMappedTensor
+from torch.utils.data import DataLoader
+from torch.utils.data.dataset import Dataset
+from tqdm import tqdm
+from meanaudio.utils.dist_utils import local_rank, world_size
+scratch_path = Path(os.environ['SLURM_SCRATCH'] if 'SLURM_SCRATCH' in os.environ else '/dev/shm')
+shm_path = Path('/dev/shm')
+log = logging.getLogger()
+def reseed(seed):
+    random.seed(seed)
+    torch.manual_seed(seed)
+def local_scatter_torch(obj: Optional[Any]):
+    if world_size == 1:
+        # Just one worker. Do nothing.
+        return obj
+    array = [obj] * world_size
+    target_array = [None]
+    if local_rank == 0:
+        dist.scatter_object_list(target_array, scatter_object_input_list=array, src=0)
+    else:
+        dist.scatter_object_list(target_array, scatter_object_input_list=None, src=0)
+    return target_array[0]
+class ShardDataset(Dataset):
+    def __init__(self, root):
+        self.root = root
+        self.shards = sorted(os.listdir(root))
+    def __len__(self):
+        return len(self.shards)
+    def __getitem__(self, idx):
+        return torch.load(os.path.join(self.root, self.shards[idx]), weights_only=True)
+def get_tmp_dir(in_memory: bool) -> Path:
+    return shm_path if in_memory else scratch_path
+def load_shards_and_share(data_path: Union[str, Path], ids: list[int],
+                          in_memory: bool) -> MemoryMappedTensor:
+    """Load the shards once on local rank 0 and share the result with the other local ranks.
+    Local rank 0 creates a temporary file under ``get_tmp_dir(in_memory)``, fills it through
+    ``load_shards`` and publishes it with ``share_tensor_to_all``; every other rank calls
+    ``share_tensor_to_all(None)`` to receive it. All ranks then meet at a barrier.
+    NOTE(review): the barrier is called unconditionally, so this presumably requires an
+    initialized process group even when only one worker runs -- confirm against callers.
+    """
+    if local_rank == 0:
+        with tempfile.NamedTemporaryFile(prefix='shared-tensor-', dir=get_tmp_dir(in_memory)) as f:
+            log.info(f'Loading shards from {data_path} into {f.name}...')
+            data = load_shards(data_path, ids=ids, tmp_file_path=f.name)
+            data = share_tensor_to_all(data)
+            # The barrier sits inside the ``with`` block, i.e. before the temporary file is
+            # cleaned up on exit; presumably so other ranks can open it first -- TODO confirm.
+            torch.distributed.barrier()
+            f.close()  # why does the context manager not close the file for me?
+    else:
+        log.info('Waiting for the data to be shared with me...')
+        data = share_tensor_to_all(None)
+        torch.distributed.barrier()
+    return data
+def load_shards(
+    data_path: Union[str, Path],
+    ids: list[int],
+    *,
+    tmp_file_path: str,
+) -> Union[torch.Tensor, dict[str, torch.Tensor]]:
+    id_set = set(ids)
+    shards = sorted(os.listdir(data_path))
+    log.info(f'Found {len(shards)} shards in {data_path}.')
+    first_shard = torch.load(os.path.join(data_path, shards[0]), weights_only=True)
+    log.info(f'Rank {local_rank} created file {tmp_file_path}')
+    first_item = next(iter(first_shard.values()))
+    log.info(f'First item shape: {first_item.shape}')
+    mm_tensor = MemoryMappedTensor.empty(shape=(len(ids), *first_item.shape),
+                                         dtype=torch.float32,
+                                         filename=tmp_file_path,
+                                         existsok=True)
+    total_count = 0
+    used_index = set()
+    id_indexing = {i: idx for idx, i in enumerate(ids)}
+    # faster with no workers; otherwise we need to set_sharing_strategy('file_system')
+    loader = DataLoader(ShardDataset(data_path), batch_size=1, num_workers=0)
+    for data in tqdm(loader, desc='Loading shards'):
+        for i, v in data.items():
+            if i not in id_set:
+                continue
+            # tensor_index = ids.index(i)
+            tensor_index = id_indexing[i]
+            if tensor_index in used_index:
+                raise ValueError(f'Duplicate id {i} found in {data_path}.')
+            used_index.add(tensor_index)
+            mm_tensor[tensor_index] = v
+            total_count += 1
+    assert total_count == len(ids), f'Expected {len(ids)} tensors, got {total_count}.'
+    log.info(f'Loaded {total_count} tensors from {data_path}.')
+    return mm_tensor
+def share_tensor_to_all(x: Optional[MemoryMappedTensor]) -> MemoryMappedTensor:
+    """
+    x: the tensor to be shared; None if local_rank != 0
+    return: the shared tensor
+    """
+    # there is no need to share your stuff with anyone if you are alone; must be in memory
+    if world_size == 1:
+        return x
+    if local_rank == 0:
+        assert x is not None, 'x must not be None if local_rank == 0'
+    else:
+        assert x is None, 'x must be None if local_rank != 0'
+    if local_rank == 0:
+        filename = x.filename
+        meta_information = (filename, x.shape, x.dtype)
+    else:
+        meta_information = None
+    filename, data_shape, data_type = local_scatter_torch(meta_information)
+    if local_rank == 0:
+        data = x
+    else:
+        data = MemoryMappedTensor.from_filename(filename=filename,
+                                                dtype=data_type,
+                                                shape=data_shape)
+    return data

meanaudio/eval_utils.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import dataclasses
+import logging
+from pathlib import Path
+from typing import Optional
+import numpy as np
+import torch
+from colorlog import ColoredFormatter
+from PIL import Image
+from torchvision.transforms import v2
+from meanaudio.data.av_utils import ImageInfo, VideoInfo, read_frames, reencode_with_audio
+from meanaudio.model.flow_matching import FlowMatching
+from meanaudio.model.mean_flow import MeanFlow
+from meanaudio.model.networks import MeanAudio, FluxAudio
+from meanaudio.model.sequence_config import CONFIG_16K, CONFIG_44K, SequenceConfig
+from meanaudio.model.utils.features_utils import FeaturesUtils
+from meanaudio.utils.download_utils import download_model_if_needed
+log = logging.getLogger()
+@dataclasses.dataclass
+class ModelConfig:
+    model_name: str
+    model_path: Path
+    vae_path: Path
+    bigvgan_16k_path: Optional[Path]
+    mode: str
+    @property
+    def seq_cfg(self) -> SequenceConfig:
+        if self.mode == '16k':
+            return CONFIG_16K  # get sequence config when calling cfg.seq_cfgs
+        elif self.mode == '44k':
+            return CONFIG_44K
+    def download_if_needed(self):
+        raise NotImplementedError("Downloading models is not supported")
+        download_model_if_needed(self.model_path)
+        download_model_if_needed(self.vae_path)
+        if self.bigvgan_16k_path is not None:
+            download_model_if_needed(self.bigvgan_16k_path)
+# Predefined model configurations. All checkpoint paths are relative ('./weights/...').
+fluxaudio_fm = ModelConfig(model_name='fluxaudio_fm',
+                           model_path=Path('./weights/fluxaudio_fm.pth'),
+                           vae_path=Path('./weights/v1-16.pth'),
+                           bigvgan_16k_path=Path('./weights/best_netG.pt'),
+                           mode='16k')
+meanaudio_mf = ModelConfig(model_name='meanaudio_mf',
+                           model_path=Path('./weights/meanaudio_mf.pth'),
+                           vae_path=Path('./weights/v1-16.pth'),
+                           bigvgan_16k_path=Path('./weights/best_netG.pt'),
+                           mode='16k')
+# Lookup table from model name to its configuration.
+all_model_cfg: dict[str, ModelConfig] = {
+    'fluxaudio_fm': fluxaudio_fm,
+    'meanaudio_mf': meanaudio_mf,
+}
+def generate_fm(
+    text: Optional[list[str]],
+    *,
+    negative_text: Optional[list[str]] = None,
+    feature_utils: FeaturesUtils,
+    net: FluxAudio,
+    fm: FlowMatching,
+    rng: torch.Generator,
+    cfg_strength: float,
+) -> torch.Tensor:
+    # generate audio with vanilla flow matching
+    device = feature_utils.device
+    dtype = feature_utils.dtype
+    bs = len(text)
+    if text is not None:
+        text_features, text_features_c = feature_utils.encode_text(text)
+    else:
+        text_features, text_features_c = net.get_empty_string_sequence(bs)
+    if negative_text is not None:
+        assert len(negative_text) == bs
+        negative_text_features = feature_utils.encode_text(negative_text)
+    else:
+        negative_text_features = net.get_empty_string_sequence(bs)
+    x0 = torch.randn(bs,
+                     net.latent_seq_len,
+                     net.latent_dim,
+                     device=device,
+                     dtype=dtype,
+                     generator=rng)
+    preprocessed_conditions = net.preprocess_conditions(text_features, text_features_c)
+    empty_conditions = net.get_empty_conditions(
+        bs, negative_text_features=negative_text_features if negative_text is not None else None)
+    cfg_ode_wrapper = lambda t, x: net.ode_wrapper(t, x, preprocessed_conditions, empty_conditions,
+                                                   cfg_strength)
+    x1 = fm.to_data(cfg_ode_wrapper, x0)
+    x1 = net.unnormalize(x1)
+    spec = feature_utils.decode(x1)
+    audio = feature_utils.vocode(spec)
+    return audio
+def generate_mf(
+    text: Optional[list[str]],
+    *,
+    negative_text: Optional[list[str]] = None,
+    feature_utils: FeaturesUtils,
+    net: MeanAudio,
+    mf: MeanFlow,
+    rng: torch.Generator,
+    cfg_strength: float,
+) -> torch.Tensor:
+    # generate audio with mean flow
+    device = feature_utils.device
+    dtype = feature_utils.dtype
+    bs = len(text)
+    if text is not None:
+        text_features, text_features_c = feature_utils.encode_text(text)
+    else:
+        text_features, text_features_c = net.get_empty_string_sequence(bs)
+    if negative_text is not None:
+        assert len(negative_text) == bs
+        negative_text_features = feature_utils.encode_text(negative_text)
+    else:
+        negative_text_features = net.get_empty_string_sequence(bs)
+    x0 = torch.randn(bs,
+                     net.latent_seq_len,
+                     net.latent_dim,
+                     device=device,
+                     dtype=dtype,
+                     generator=rng)
+    preprocessed_conditions = net.preprocess_conditions(text_features, text_features_c)
+    empty_conditions = net.get_empty_conditions(
+        bs, negative_text_features=negative_text_features if negative_text is not None else None)
+    cfg_ode_wrapper = lambda t, r, x: net.ode_wrapper(t, r, x, preprocessed_conditions, empty_conditions,
+                                                      cfg_strength)
+    x1 = mf.to_data(cfg_ode_wrapper, x0)
+    x1 = net.unnormalize(x1)
+    spec = feature_utils.decode(x1)
+    audio = feature_utils.vocode(spec)
+    return audio
+LOGFORMAT = "[%(log_color)s%(levelname)-8s%(reset)s]: %(log_color)s%(message)s%(reset)s"
+def setup_eval_logging(log_level: int = logging.INFO):
+    logging.root.setLevel(log_level) # set up root logger <=> logging.getLogger().setLevel(log_level)
+    formatter = ColoredFormatter(LOGFORMAT)
+    stream = logging.StreamHandler()  # to Console
+    stream.setLevel(log_level)
+    stream.setFormatter(formatter)
+    log = logging.getLogger()
+    log.setLevel(log_level)
+    log.addHandler(stream)

meanaudio/ext/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

meanaudio/ext/autoencoder/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .autoencoder import AutoEncoderModule

meanaudio/ext/autoencoder/autoencoder.py ADDED Viewed

	@@ -0,0 +1,52 @@

+from typing import Literal, Optional
+import torch
+import torch.nn as nn
+from meanaudio.ext.autoencoder.vae import VAE, get_my_vae
+from meanaudio.ext.bigvgan import BigVGAN
+from meanaudio.ext.bigvgan_v2.bigvgan import BigVGAN as BigVGANv2
+from meanaudio.model.utils.distributions import DiagonalGaussianDistribution
+class AutoEncoderModule(nn.Module):
+    def __init__(self,
+                 *,
+                 vae_ckpt_path,
+                 vocoder_ckpt_path: Optional[str] = None,
+                 mode: Literal['16k', '44k'],
+                 need_vae_encoder: bool = True):
+        super().__init__()
+        self.vae: VAE = get_my_vae(mode).eval()
+        vae_state_dict = torch.load(vae_ckpt_path, weights_only=True, map_location='cpu')
+        self.vae.load_state_dict(vae_state_dict)
+        self.vae.remove_weight_norm()
+        if mode == '16k':
+            assert vocoder_ckpt_path is not None
+            self.vocoder = BigVGAN(vocoder_ckpt_path).eval()
+        elif mode == '44k':
+            self.vocoder = BigVGANv2.from_pretrained('nvidia/bigvgan_v2_44khz_128band_512x',
+                                                     use_cuda_kernel=False)
+            self.vocoder.remove_weight_norm()
+        else:
+            raise ValueError(f'Unknown mode: {mode}')
+        for param in self.parameters():
+            param.requires_grad = False
+        if not need_vae_encoder:
+            del self.vae.encoder
+    @torch.inference_mode()
+    def encode(self, x: torch.Tensor) -> DiagonalGaussianDistribution:
+        return self.vae.encode(x)
+    @torch.inference_mode()
+    def decode(self, z: torch.Tensor) -> torch.Tensor:
+        return self.vae.decode(z)
+    @torch.inference_mode()
+    def vocode(self, spec: torch.Tensor) -> torch.Tensor:
+        return self.vocoder(spec)

meanaudio/ext/autoencoder/edm2_utils.py ADDED Viewed

	@@ -0,0 +1,168 @@

+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# This work is licensed under a Creative Commons
+# Attribution-NonCommercial-ShareAlike 4.0 International License.
+# You should have received a copy of the license along with this
+# work. If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/
+"""Improved diffusion model architecture proposed in the paper
+"Analyzing and Improving the Training Dynamics of Diffusion Models"."""
+import numpy as np
+import torch
+#----------------------------------------------------------------------------
+# Variant of constant() that inherits dtype and device from the given
+# reference tensor by default.
+_constant_cache = dict()
+def constant(value, shape=None, dtype=None, device=None, memory_format=None):
+    value = np.asarray(value)
+    if shape is not None:
+        shape = tuple(shape)
+    if dtype is None:
+        dtype = torch.get_default_dtype()
+    if device is None:
+        device = torch.device('cpu')
+    if memory_format is None:
+        memory_format = torch.contiguous_format
+    key = (value.shape, value.dtype, value.tobytes(), shape, dtype, device, memory_format)
+    tensor = _constant_cache.get(key, None)
+    if tensor is None:
+        tensor = torch.as_tensor(value.copy(), dtype=dtype, device=device)
+        if shape is not None:
+            tensor, _ = torch.broadcast_tensors(tensor, torch.empty(shape))
+        tensor = tensor.contiguous(memory_format=memory_format)
+        _constant_cache[key] = tensor
+    return tensor
+def const_like(ref, value, shape=None, dtype=None, device=None, memory_format=None):
+    if dtype is None:
+        dtype = ref.dtype
+    if device is None:
+        device = ref.device
+    return constant(value, shape=shape, dtype=dtype, device=device, memory_format=memory_format)
+#----------------------------------------------------------------------------
+# Normalize given tensor to unit magnitude with respect to the given
+# dimensions. Default = all dimensions except the first.
+def normalize(x, dim=None, eps=1e-4):
+    if dim is None:
+        dim = list(range(1, x.ndim))
+    norm = torch.linalg.vector_norm(x, dim=dim, keepdim=True, dtype=torch.float32)
+    norm = torch.add(eps, norm, alpha=np.sqrt(norm.numel() / x.numel()))
+    return x / norm.to(x.dtype)
+class Normalize(torch.nn.Module):
+    def __init__(self, dim=None, eps=1e-4):
+        super().__init__()
+        self.dim = dim
+        self.eps = eps
+    def forward(self, x):
+        return normalize(x, dim=self.dim, eps=self.eps)
+#----------------------------------------------------------------------------
+# Upsample or downsample the given tensor with the given filter,
+# or keep it as is.
+def resample(x, f=[1, 1], mode='keep'):
+    if mode == 'keep':
+        return x
+    f = np.float32(f)
+    assert f.ndim == 1 and len(f) % 2 == 0
+    pad = (len(f) - 1) // 2
+    f = f / f.sum()
+    f = np.outer(f, f)[np.newaxis, np.newaxis, :, :]
+    f = const_like(x, f)
+    c = x.shape[1]
+    if mode == 'down':
+        return torch.nn.functional.conv2d(x,
+                                          f.tile([c, 1, 1, 1]),
+                                          groups=c,
+                                          stride=2,
+                                          padding=(pad, ))
+    assert mode == 'up'
+    return torch.nn.functional.conv_transpose2d(x, (f * 4).tile([c, 1, 1, 1]),
+                                                groups=c,
+                                                stride=2,
+                                                padding=(pad, ))
+#----------------------------------------------------------------------------
+# Magnitude-preserving SiLU (Equation 81).
+def mp_silu(x):
+    return torch.nn.functional.silu(x) / 0.596
+class MPSiLU(torch.nn.Module):
+    def forward(self, x):
+        return mp_silu(x)
+#----------------------------------------------------------------------------
+# Magnitude-preserving sum (Equation 88).
+def mp_sum(a, b, t=0.5):
+    return a.lerp(b, t) / np.sqrt((1 - t)**2 + t**2)
+#----------------------------------------------------------------------------
+# Magnitude-preserving concatenation (Equation 103).
+def mp_cat(a, b, dim=1, t=0.5):
+    Na = a.shape[dim]
+    Nb = b.shape[dim]
+    C = np.sqrt((Na + Nb) / ((1 - t)**2 + t**2))
+    wa = C / np.sqrt(Na) * (1 - t)
+    wb = C / np.sqrt(Nb) * t
+    return torch.cat([wa * a, wb * b], dim=dim)
+#----------------------------------------------------------------------------
+# Magnitude-preserving convolution or fully-connected layer (Equation 47)
+# with force weight normalization (Equation 66).
+class MPConv1D(torch.nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size):
+        super().__init__()
+        self.out_channels = out_channels
+        self.weight = torch.nn.Parameter(torch.randn(out_channels, in_channels, kernel_size))
+        self.weight_norm_removed = False
+    def forward(self, x, gain=1):
+        assert self.weight_norm_removed, 'call remove_weight_norm() before inference'
+        w = self.weight * gain
+        if w.ndim == 2:
+            return x @ w.t()
+        assert w.ndim == 3
+        return torch.nn.functional.conv1d(x, w, padding=(w.shape[-1] // 2, ))
+    def remove_weight_norm(self):
+        w = self.weight.to(torch.float32)
+        w = normalize(w)  # traditional weight normalization
+        w = w / np.sqrt(w[0].numel())
+        w = w.to(self.weight.dtype)
+        self.weight.data.copy_(w)
+        self.weight_norm_removed = True
+        return self

meanaudio/ext/autoencoder/vae.py ADDED Viewed

	@@ -0,0 +1,369 @@

+import logging
+from typing import Optional
+import torch
+import torch.nn as nn
+from meanaudio.ext.autoencoder.edm2_utils import MPConv1D
+from meanaudio.ext.autoencoder.vae_modules import (AttnBlock1D, Downsample1D, ResnetBlock1D,
+                                                 Upsample1D, nonlinearity)
+from meanaudio.model.utils.distributions import DiagonalGaussianDistribution
+log = logging.getLogger()
+DATA_MEAN_80D = [
+    -1.6058, -1.3676, -1.2520, -1.2453, -1.2078, -1.2224, -1.2419, -1.2439, -1.2922, -1.2927,
+    -1.3170, -1.3543, -1.3401, -1.3836, -1.3907, -1.3912, -1.4313, -1.4152, -1.4527, -1.4728,
+    -1.4568, -1.5101, -1.5051, -1.5172, -1.5623, -1.5373, -1.5746, -1.5687, -1.6032, -1.6131,
+    -1.6081, -1.6331, -1.6489, -1.6489, -1.6700, -1.6738, -1.6953, -1.6969, -1.7048, -1.7280,
+    -1.7361, -1.7495, -1.7658, -1.7814, -1.7889, -1.8064, -1.8221, -1.8377, -1.8417, -1.8643,
+    -1.8857, -1.8929, -1.9173, -1.9379, -1.9531, -1.9673, -1.9824, -2.0042, -2.0215, -2.0436,
+    -2.0766, -2.1064, -2.1418, -2.1855, -2.2319, -2.2767, -2.3161, -2.3572, -2.3954, -2.4282,
+    -2.4659, -2.5072, -2.5552, -2.6074, -2.6584, -2.7107, -2.7634, -2.8266, -2.8981, -2.9673
+]
+DATA_STD_80D = [
+    1.0291, 1.0411, 1.0043, 0.9820, 0.9677, 0.9543, 0.9450, 0.9392, 0.9343, 0.9297, 0.9276, 0.9263,
+    0.9242, 0.9254, 0.9232, 0.9281, 0.9263, 0.9315, 0.9274, 0.9247, 0.9277, 0.9199, 0.9188, 0.9194,
+    0.9160, 0.9161, 0.9146, 0.9161, 0.9100, 0.9095, 0.9145, 0.9076, 0.9066, 0.9095, 0.9032, 0.9043,
+    0.9038, 0.9011, 0.9019, 0.9010, 0.8984, 0.8983, 0.8986, 0.8961, 0.8962, 0.8978, 0.8962, 0.8973,
+    0.8993, 0.8976, 0.8995, 0.9016, 0.8982, 0.8972, 0.8974, 0.8949, 0.8940, 0.8947, 0.8936, 0.8939,
+    0.8951, 0.8956, 0.9017, 0.9167, 0.9436, 0.9690, 1.0003, 1.0225, 1.0381, 1.0491, 1.0545, 1.0604,
+    1.0761, 1.0929, 1.1089, 1.1196, 1.1176, 1.1156, 1.1117, 1.1070
+]
+DATA_MEAN_128D = [
+    -3.3462, -2.6723, -2.4893, -2.3143, -2.2664, -2.3317, -2.1802, -2.4006, -2.2357, -2.4597,
+    -2.3717, -2.4690, -2.5142, -2.4919, -2.6610, -2.5047, -2.7483, -2.5926, -2.7462, -2.7033,
+    -2.7386, -2.8112, -2.7502, -2.9594, -2.7473, -3.0035, -2.8891, -2.9922, -2.9856, -3.0157,
+    -3.1191, -2.9893, -3.1718, -3.0745, -3.1879, -3.2310, -3.1424, -3.2296, -3.2791, -3.2782,
+    -3.2756, -3.3134, -3.3509, -3.3750, -3.3951, -3.3698, -3.4505, -3.4509, -3.5089, -3.4647,
+    -3.5536, -3.5788, -3.5867, -3.6036, -3.6400, -3.6747, -3.7072, -3.7279, -3.7283, -3.7795,
+    -3.8259, -3.8447, -3.8663, -3.9182, -3.9605, -3.9861, -4.0105, -4.0373, -4.0762, -4.1121,
+    -4.1488, -4.1874, -4.2461, -4.3170, -4.3639, -4.4452, -4.5282, -4.6297, -4.7019, -4.7960,
+    -4.8700, -4.9507, -5.0303, -5.0866, -5.1634, -5.2342, -5.3242, -5.4053, -5.4927, -5.5712,
+    -5.6464, -5.7052, -5.7619, -5.8410, -5.9188, -6.0103, -6.0955, -6.1673, -6.2362, -6.3120,
+    -6.3926, -6.4797, -6.5565, -6.6511, -6.8130, -6.9961, -7.1275, -7.2457, -7.3576, -7.4663,
+    -7.6136, -7.7469, -7.8815, -8.0132, -8.1515, -8.3071, -8.4722, -8.7418, -9.3975, -9.6628,
+    -9.7671, -9.8863, -9.9992, -10.0860, -10.1709, -10.5418, -11.2795, -11.3861
+]
+DATA_STD_128D = [
+    2.3804, 2.4368, 2.3772, 2.3145, 2.2803, 2.2510, 2.2316, 2.2083, 2.1996, 2.1835, 2.1769, 2.1659,
+    2.1631, 2.1618, 2.1540, 2.1606, 2.1571, 2.1567, 2.1612, 2.1579, 2.1679, 2.1683, 2.1634, 2.1557,
+    2.1668, 2.1518, 2.1415, 2.1449, 2.1406, 2.1350, 2.1313, 2.1415, 2.1281, 2.1352, 2.1219, 2.1182,
+    2.1327, 2.1195, 2.1137, 2.1080, 2.1179, 2.1036, 2.1087, 2.1036, 2.1015, 2.1068, 2.0975, 2.0991,
+    2.0902, 2.1015, 2.0857, 2.0920, 2.0893, 2.0897, 2.0910, 2.0881, 2.0925, 2.0873, 2.0960, 2.0900,
+    2.0957, 2.0958, 2.0978, 2.0936, 2.0886, 2.0905, 2.0845, 2.0855, 2.0796, 2.0840, 2.0813, 2.0817,
+    2.0838, 2.0840, 2.0917, 2.1061, 2.1431, 2.1976, 2.2482, 2.3055, 2.3700, 2.4088, 2.4372, 2.4609,
+    2.4731, 2.4847, 2.5072, 2.5451, 2.5772, 2.6147, 2.6529, 2.6596, 2.6645, 2.6726, 2.6803, 2.6812,
+    2.6899, 2.6916, 2.6931, 2.6998, 2.7062, 2.7262, 2.7222, 2.7158, 2.7041, 2.7485, 2.7491, 2.7451,
+    2.7485, 2.7233, 2.7297, 2.7233, 2.7145, 2.6958, 2.6788, 2.6439, 2.6007, 2.4786, 2.2469, 2.1877,
+    2.1392, 2.0717, 2.0107, 1.9676, 1.9140, 1.7102, 0.9101, 0.7164
+]
class VAE(nn.Module):
    """Variational autoencoder built from an ``Encoder1D`` / ``Decoder1D`` pair.

    Inputs are optionally normalized with fixed per-channel statistics
    before encoding, and decoder outputs are optionally mapped back with
    the same statistics. The statistics are stored as buffers.
    """

    def __init__(
        self,
        *,
        data_dim: int,
        embed_dim: int,
        hidden_dim: int,
    ):
        """Build the encoder/decoder and register the normalization buffers.

        Args:
            data_dim: Number of input channels. Normalization statistics
                exist only for 80 and 128.
            embed_dim: Number of latent channels.
            hidden_dim: Base channel width passed to encoder and decoder.

        Raises:
            ValueError: If ``data_dim`` is neither 80 nor 128.
        """
        super().__init__()

        # Fail early with a clear message: without statistics for this
        # data_dim there is nothing to register as data_mean / data_std.
        if data_dim == 80:
            mean, std = DATA_MEAN_80D, DATA_STD_80D
        elif data_dim == 128:
            mean, std = DATA_MEAN_128D, DATA_STD_128D
        else:
            raise ValueError(f'Unsupported data_dim: {data_dim} (expected 80 or 128)')

        # Stored as (1, C, 1) so the statistics broadcast along dim 1 of the input.
        self.data_mean = nn.Buffer(torch.tensor(mean, dtype=torch.float32).view(1, -1, 1))
        self.data_std = nn.Buffer(torch.tensor(std, dtype=torch.float32).view(1, -1, 1))

        self.encoder = Encoder1D(
            dim=hidden_dim,
            ch_mult=(1, 2, 4),
            num_res_blocks=2,
            attn_layers=[3],
            down_layers=[0],
            in_dim=data_dim,
            embed_dim=embed_dim,
        )
        self.decoder = Decoder1D(
            dim=hidden_dim,
            ch_mult=(1, 2, 4),
            num_res_blocks=2,
            attn_layers=[3],
            down_layers=[0],
            in_dim=data_dim,
            out_dim=data_dim,
            embed_dim=embed_dim,
        )
        self.embed_dim = embed_dim
        self.initialize_weights()

    def initialize_weights(self):
        """Hook called at the end of ``__init__``; currently does nothing."""
        pass

    def encode(self, x: torch.Tensor, normalize: bool = True) -> DiagonalGaussianDistribution:
        """Encode ``x`` into a posterior distribution.

        Args:
            x: Input tensor.
            normalize: If True, apply ``self.normalize`` before encoding.

        Returns:
            A ``DiagonalGaussianDistribution`` built from the encoder output.
        """
        if normalize:
            x = self.normalize(x)
        moments = self.encoder(x)
        posterior = DiagonalGaussianDistribution(moments)
        return posterior

    def decode(self, z: torch.Tensor, unnormalize: bool = True) -> torch.Tensor:
        """Decode latent ``z``; optionally undo the input normalization."""
        dec = self.decoder(z)
        if unnormalize:
            dec = self.unnormalize(dec)
        return dec

    def normalize(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``(x - data_mean) / data_std``."""
        return (x - self.data_mean) / self.data_std

    def unnormalize(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x * data_std + data_mean`` (inverse of ``normalize``)."""
        return x * self.data_std + self.data_mean

    def forward(
        self,
        x: torch.Tensor,
        sample_posterior: bool = True,
        rng: Optional[torch.Generator] = None,
        normalize: bool = True,
        unnormalize: bool = True,
    ) -> tuple[torch.Tensor, DiagonalGaussianDistribution]:
        """Encode ``x``, pick a latent, and decode it.

        Args:
            x: Input tensor.
            sample_posterior: If True, sample from the posterior (using
                ``rng``); otherwise use the posterior mode.
            rng: Optional generator forwarded to ``posterior.sample``.
            normalize: Forwarded to ``encode``.
            unnormalize: Forwarded to ``decode``.

        Returns:
            ``(reconstruction, posterior)``.
        """
        posterior = self.encode(x, normalize=normalize)
        if sample_posterior:
            z = posterior.sample(rng)
        else:
            z = posterior.mode()
        dec = self.decode(z, unnormalize=unnormalize)
        return dec, posterior

    def load_weights(self, src_dict) -> None:
        """Load ``src_dict`` as this module's state dict (strict key matching)."""
        self.load_state_dict(src_dict, strict=True)

    @property
    def device(self) -> torch.device:
        """Device of the first parameter of this module."""
        return next(self.parameters()).device

    def get_last_layer(self):
        """Return the weight of the decoder's output convolution."""
        return self.decoder.conv_out.weight

    def remove_weight_norm(self):
        """Call ``remove_weight_norm`` on every ``MPConv1D`` submodule; returns ``self``."""
        for name, m in self.named_modules():
            if isinstance(m, MPConv1D):
                m.remove_weight_norm()
                log.debug(f"Removed weight norm from {name}")
        return self
class Encoder1D(nn.Module):
    """Convolutional encoder: input conv, per-level residual stacks with
    optional attention and downsampling, a middle section, and an output conv.
    """

    def __init__(self,
                 *,
                 dim: int,
                 ch_mult: tuple[int, ...] = (1, 2, 4, 8),
                 num_res_blocks: int,
                 attn_layers: Optional[list[int]] = None,
                 down_layers: Optional[list[int]] = None,
                 resamp_with_conv: bool = True,
                 in_dim: int,
                 embed_dim: int,
                 double_z: bool = True,
                 kernel_size: int = 3,
                 clip_act: float = 256.0):
        """Build the encoder.

        Args:
            dim: Base channel width; level ``i`` uses ``dim * ch_mult[i]``.
            ch_mult: Channel multiplier per level.
            num_res_blocks: Residual blocks per level.
            attn_layers: Level indices that get an attention block after
                each residual block. ``None`` means no level.
            down_layers: Level indices followed by a ``Downsample1D``.
                ``None`` means no level.
            resamp_with_conv: Forwarded to ``Downsample1D``.
            in_dim: Number of input channels.
            embed_dim: Latent width; the output has ``2 * embed_dim``
                channels when ``double_z`` is True, else ``embed_dim``.
            double_z: See ``embed_dim``.
            kernel_size: Kernel size of the convolutions built here.
            clip_act: Activations are clamped to ``[-clip_act, clip_act]``.
        """
        super().__init__()
        # None instead of a shared mutable [] default.
        attn_layers = [] if attn_layers is None else attn_layers
        down_layers = [] if down_layers is None else down_layers

        self.dim = dim
        self.num_layers = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.in_channels = in_dim
        self.clip_act = clip_act
        self.down_layers = down_layers
        self.attn_layers = attn_layers
        self.conv_in = MPConv1D(in_dim, self.dim, kernel_size=kernel_size)
        in_ch_mult = (1, ) + tuple(ch_mult)
        self.in_ch_mult = in_ch_mult

        # downsampling
        self.down = nn.ModuleList()
        for i_level in range(self.num_layers):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = dim * in_ch_mult[i_level]
            block_out = dim * ch_mult[i_level]
            for _ in range(self.num_res_blocks):
                block.append(
                    ResnetBlock1D(in_dim=block_in,
                                  out_dim=block_out,
                                  kernel_size=kernel_size,
                                  use_norm=True))
                block_in = block_out
                if i_level in attn_layers:
                    attn.append(AttnBlock1D(block_in))
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level in down_layers:
                down.downsample = Downsample1D(block_in, resamp_with_conv)
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock1D(in_dim=block_in,
                                         out_dim=block_in,
                                         kernel_size=kernel_size,
                                         use_norm=True)
        self.mid.attn_1 = AttnBlock1D(block_in)
        self.mid.block_2 = ResnetBlock1D(in_dim=block_in,
                                         out_dim=block_in,
                                         kernel_size=kernel_size,
                                         use_norm=True)

        # end
        self.conv_out = MPConv1D(block_in,
                                 2 * embed_dim if double_z else embed_dim,
                                 kernel_size=kernel_size)
        # Output gain is (learnable_gain + 1), so it starts at 1.
        self.learnable_gain = nn.Parameter(torch.zeros([]))

    def forward(self, x):
        """Run the encoder on ``x`` and return the output of ``conv_out``."""
        # Only the latest activation is ever consumed, so a single running
        # tensor replaces the list of all intermediate activations.
        h = self.conv_in(x)

        # downsampling
        for i_level in range(self.num_layers):
            level = self.down[i_level]
            has_attn = len(level.attn) > 0
            for i_block in range(self.num_res_blocks):
                h = level.block[i_block](h)
                if has_attn:
                    h = level.attn[i_block](h)
                h = h.clamp(-self.clip_act, self.clip_act)
            if i_level in self.down_layers:
                h = level.downsample(h)

        # middle
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)
        h = h.clamp(-self.clip_act, self.clip_act)

        # end
        h = nonlinearity(h)
        h = self.conv_out(h, gain=(self.learnable_gain + 1))
        return h
class Decoder1D(nn.Module):
    """Convolutional decoder: input conv, a middle section, per-level residual
    stacks with optional attention and upsampling, and an output conv.
    """

    def __init__(self,
                 *,
                 dim: int,
                 out_dim: int,
                 ch_mult: tuple[int, ...] = (1, 2, 4, 8),
                 num_res_blocks: int,
                 attn_layers: Optional[list[int]] = None,
                 down_layers: Optional[list[int]] = None,
                 kernel_size: int = 3,
                 resamp_with_conv: bool = True,
                 in_dim: int,
                 embed_dim: int,
                 clip_act: float = 256.0):
        """Build the decoder.

        Args:
            dim: Base channel width; level ``i`` uses ``dim * ch_mult[i]``.
            out_dim: Number of output channels.
            ch_mult: Channel multiplier per level.
            num_res_blocks: Each level gets ``num_res_blocks + 1`` residual blocks.
            attn_layers: Level indices that get an attention block after
                each residual block. ``None`` means no level.
            down_layers: Level indices as given to the encoder. Each index
                is shifted by one here to pick the level that upsamples.
                ``None`` means no level.
            kernel_size: Kernel size of ``conv_in`` and ``conv_out``.
            resamp_with_conv: Forwarded to ``Upsample1D``.
            in_dim: Stored as ``self.in_channels``; not otherwise used here.
            embed_dim: Number of latent (input) channels.
            clip_act: Activations are clamped to ``[-clip_act, clip_act]``.
        """
        super().__init__()
        # None instead of a shared mutable [] default.
        attn_layers = [] if attn_layers is None else attn_layers
        down_layers = [] if down_layers is None else down_layers

        self.ch = dim
        self.num_layers = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.in_channels = in_dim
        self.clip_act = clip_act
        self.down_layers = [i + 1 for i in down_layers]  # each downlayer add one

        # channel width at the deepest level
        block_in = dim * ch_mult[self.num_layers - 1]

        # z to block_in
        self.conv_in = MPConv1D(embed_dim, block_in, kernel_size=kernel_size)

        # middle
        # NOTE(review): the ResnetBlock1D layers in this class use their default
        # kernel size rather than `kernel_size`, unlike Encoder1D. Confirm intent.
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock1D(in_dim=block_in, out_dim=block_in, use_norm=True)
        self.mid.attn_1 = AttnBlock1D(block_in)
        self.mid.block_2 = ResnetBlock1D(in_dim=block_in, out_dim=block_in, use_norm=True)

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_layers)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = dim * ch_mult[i_level]
            for _ in range(self.num_res_blocks + 1):
                block.append(ResnetBlock1D(in_dim=block_in, out_dim=block_out, use_norm=True))
                block_in = block_out
                if i_level in attn_layers:
                    attn.append(AttnBlock1D(block_in))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level in self.down_layers:
                up.upsample = Upsample1D(block_in, resamp_with_conv)
            self.up.insert(0, up)  # prepend to get consistent order

        # end
        self.conv_out = MPConv1D(block_in, out_dim, kernel_size=kernel_size)
        # Output gain is (learnable_gain + 1), so it starts at 1.
        self.learnable_gain = nn.Parameter(torch.zeros([]))

    def forward(self, z):
        """Run the decoder on latent ``z`` and return the output of ``conv_out``."""
        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)
        h = h.clamp(-self.clip_act, self.clip_act)

        # upsampling, from the deepest level back to level 0
        for i_level in reversed(range(self.num_layers)):
            level = self.up[i_level]
            has_attn = len(level.attn) > 0
            for i_block in range(self.num_res_blocks + 1):
                h = level.block[i_block](h)
                if has_attn:
                    h = level.attn[i_block](h)
                h = h.clamp(-self.clip_act, self.clip_act)
            if i_level in self.down_layers:
                h = level.upsample(h)

        h = nonlinearity(h)
        h = self.conv_out(h, gain=(self.learnable_gain + 1))
        return h
def VAE_16k(**kwargs) -> VAE:
    """Build a ``VAE`` with data_dim=80, embed_dim=20 and hidden_dim=384.

    Extra keyword arguments are passed through to ``VAE``.
    """
    return VAE(
        data_dim=80,
        embed_dim=20,
        hidden_dim=384,
        **kwargs,
    )
def VAE_44k(**kwargs) -> VAE:
    """Build a ``VAE`` with data_dim=128, embed_dim=40 and hidden_dim=512.

    Extra keyword arguments are passed through to ``VAE``.
    """
    return VAE(
        data_dim=128,
        embed_dim=40,
        hidden_dim=512,
        **kwargs,
    )
def get_my_vae(name: str, **kwargs) -> VAE:
    """Build the VAE variant selected by ``name``.

    Args:
        name: ``'16k'`` or ``'44k'``.
        **kwargs: Passed through to the selected factory.

    Raises:
        ValueError: If ``name`` is not a known variant.
    """
    if name == '16k':
        factory = VAE_16k
    elif name == '44k':
        factory = VAE_44k
    else:
        raise ValueError(f'Unknown model: {name}')
    return factory(**kwargs)
if __name__ == '__main__':
    # get_my_vae only accepts '16k' and '44k'; the previous 'standard' always
    # raised ValueError, so this smoke check could never run.
    network = get_my_vae('16k')
    # print the number of parameters in terms of millions
    num_params = sum(p.numel() for p in network.parameters()) / 1e6
    print(f'Number of parameters: {num_params:.2f}M')

meanaudio/ext/autoencoder/vae_modules.py ADDED Viewed

	@@ -0,0 +1,117 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from meanaudio.ext.autoencoder.edm2_utils import (MPConv1D, mp_silu, mp_sum, normalize)
def nonlinearity(x):
    """Apply the activation used by the blocks in this module (delegates to ``mp_silu``)."""
    activated = mp_silu(x)
    return activated
class ResnetBlock1D(nn.Module):
    """Residual block: optional normalization, two activation+conv stages,
    and a shortcut that is projected when the channel count changes.
    """

    def __init__(self, *, in_dim, out_dim=None, conv_shortcut=False, kernel_size=3, use_norm=True):
        """
        Args:
            in_dim: Input channels.
            out_dim: Output channels; defaults to ``in_dim``.
            conv_shortcut: When channels change, project the shortcut with a
                ``kernel_size`` conv instead of a kernel-size-1 conv.
            kernel_size: Kernel size of the two main convolutions.
            use_norm: Normalize the input along dim 1 before the block.
        """
        super().__init__()
        if out_dim is None:
            out_dim = in_dim
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.use_conv_shortcut = conv_shortcut
        self.use_norm = use_norm

        self.conv1 = MPConv1D(in_dim, out_dim, kernel_size=kernel_size)
        self.conv2 = MPConv1D(out_dim, out_dim, kernel_size=kernel_size)

        # A shortcut projection exists only when the channel count changes.
        if in_dim != out_dim:
            if conv_shortcut:
                self.conv_shortcut = MPConv1D(in_dim, out_dim, kernel_size=kernel_size)
            else:
                self.nin_shortcut = MPConv1D(in_dim, out_dim, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``mp_sum(shortcut(x), branch(x), t=0.3)``."""
        if self.use_norm:
            x = normalize(x, dim=1)

        # main branch: act -> conv1 -> act -> conv2
        branch = self.conv1(nonlinearity(x))
        branch = self.conv2(nonlinearity(branch))

        if self.in_dim != self.out_dim:
            project = self.conv_shortcut if self.use_conv_shortcut else self.nin_shortcut
            x = project(x)

        return mp_sum(x, branch, t=0.3)
class AttnBlock1D(nn.Module):
    """Self-attention over the last axis, combined with the input via ``mp_sum``."""

    def __init__(self, in_channels, num_heads=1):
        """
        Args:
            in_channels: Channels of the input (and of the output).
            num_heads: Number of attention heads the channels are split into.
        """
        super().__init__()
        self.in_channels = in_channels
        self.num_heads = num_heads
        # A single kernel-size-1 conv produces q, k and v together.
        self.qkv = MPConv1D(in_channels, in_channels * 3, kernel_size=1)
        self.proj_out = MPConv1D(in_channels, in_channels, kernel_size=1)

    def forward(self, x):
        """Return ``mp_sum(x, proj_out(attention(x)), t=0.3)``."""
        qkv = self.qkv(x)
        batch, length = qkv.shape[0], qkv.shape[-1]
        # (B, 3C, L) -> (B, heads, C / heads, 3, L)
        qkv = qkv.reshape(batch, self.num_heads, -1, 3, length)
        # normalize along the per-head channel axis, then split q / k / v and
        # move the length axis in front of the channel axis for attention
        q, k, v = (rearrange(part, 'b h c l -> b h l c')
                   for part in normalize(qkv, dim=2).unbind(3))
        attended = F.scaled_dot_product_attention(q, k, v)
        attended = rearrange(attended, 'b h l c -> b (h c) l')
        return mp_sum(x, self.proj_out(attended), t=0.3)
class Upsample1D(nn.Module):
    """Double the length of the last axis, optionally followed by a conv."""

    def __init__(self, in_channels, with_conv):
        """
        Args:
            in_channels: Channels of the input (kept by the optional conv).
            with_conv: If True, apply a kernel-size-3 conv after upsampling.
        """
        super().__init__()
        self.with_conv = with_conv
        if with_conv:
            self.conv = MPConv1D(in_channels, in_channels, kernel_size=3)

    def forward(self, x):
        """Upsample ``x`` by a factor of 2 with 'nearest-exact' interpolation."""
        # works on 3D tensors (B, C, T)
        upsampled = F.interpolate(x, scale_factor=2.0, mode='nearest-exact')
        if not self.with_conv:
            return upsampled
        return self.conv(upsampled)
class Downsample1D(nn.Module):
    """Halve the length of the last axis with average pooling, optionally
    wrapped by kernel-size-1 convs before and after the pooling.
    """

    def __init__(self, in_channels, with_conv):
        """
        Args:
            in_channels: Channels of the input (kept by the optional convs).
            with_conv: If True, apply a kernel-size-1 conv on each side of the pooling.
        """
        super().__init__()
        self.with_conv = with_conv
        if with_conv:
            # pointwise convs around the pooling; the pooling does the downsampling
            self.conv1 = MPConv1D(in_channels, in_channels, kernel_size=1)
            self.conv2 = MPConv1D(in_channels, in_channels, kernel_size=1)

    def forward(self, x):
        """Return ``x`` pooled by a factor of 2 (window 2, stride 2)."""
        if not self.with_conv:
            return F.avg_pool1d(x, kernel_size=2, stride=2)
        pooled = F.avg_pool1d(self.conv1(x), kernel_size=2, stride=2)
        return self.conv2(pooled)

meanaudio/ext/bigvgan/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2022 NVIDIA CORPORATION.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

meanaudio/ext/bigvgan/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .bigvgan import BigVGAN

meanaudio/ext/bigvgan/activations.py ADDED Viewed

	@@ -0,0 +1,120 @@

+# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
+#   LICENSE is in incl_licenses directory.
+import torch
+from torch import nn, sin, pow
+from torch.nn import Parameter
class Snake(nn.Module):
    '''
    Sine-based periodic activation: x + (1 / alpha) * sin^2(alpha * x).
    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input
    Parameters:
        - alpha - one value per feature, trainable unless alpha_trainable is False
    References:
        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
        https://arxiv.org/abs/2006.08195
    Examples:
        >>> a1 = Snake(256)
        >>> x = torch.randn(1, 256, 100)
        >>> x = a1(x)
    '''
    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
        '''
        Initialization.
        INPUT:
            - in_features: size of the alpha parameter (one entry per feature)
            - alpha: scale applied to the initial alpha values
            - alpha_trainable: whether alpha receives gradients
            - alpha_logscale: if True alpha starts at zeros and is exponentiated
              in forward; otherwise alpha starts at ones * alpha
        '''
        super().__init__()
        self.in_features = in_features
        self.alpha_logscale = alpha_logscale
        # log scale: zeros (exp(0) == 1 in forward); linear scale: ones
        base = torch.zeros if alpha_logscale else torch.ones
        self.alpha = Parameter(base(in_features) * alpha)
        self.alpha.requires_grad = alpha_trainable
        # keeps the 1 / alpha term finite when alpha is zero
        self.no_div_by_zero = 1e-9

    def forward(self, x):
        '''
        Forward pass of the function.
        Applies the function to the input elementwise.
        Snake := x + 1/a * sin^2 (xa)
        '''
        # add leading and trailing axes so alpha lines up with x as [B, C, T]
        scale = self.alpha[None, ..., None]
        if self.alpha_logscale:
            scale = torch.exp(scale)
        periodic = pow(sin(x * scale), 2)
        return x + (1.0 / (scale + self.no_div_by_zero)) * periodic
class SnakeBeta(nn.Module):
    '''
    Snake variant with separate frequency and magnitude parameters:
    x + (1 / beta) * sin^2(alpha * x).
    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input
    Parameters:
        - alpha - parameter that controls frequency
        - beta - parameter that controls magnitude
    References:
        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
        https://arxiv.org/abs/2006.08195
    Examples:
        >>> a1 = SnakeBeta(256)
        >>> x = torch.randn(1, 256, 100)
        >>> x = a1(x)
    '''
    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
        '''
        Initialization.
        INPUT:
            - in_features: size of the alpha and beta parameters
            - alpha: scale applied to the initial values of BOTH alpha and beta
            - alpha_trainable: whether alpha and beta receive gradients
            - alpha_logscale: if True both start at zeros and are exponentiated
              in forward; otherwise both start at ones * alpha
        '''
        super().__init__()
        self.in_features = in_features
        self.alpha_logscale = alpha_logscale
        # log scale: zeros (exp(0) == 1 in forward); linear scale: ones
        base = torch.zeros if alpha_logscale else torch.ones
        self.alpha = Parameter(base(in_features) * alpha)
        self.beta = Parameter(base(in_features) * alpha)
        for param in (self.alpha, self.beta):
            param.requires_grad = alpha_trainable
        # keeps the 1 / beta term finite when beta is zero
        self.no_div_by_zero = 1e-9

    def forward(self, x):
        '''
        Forward pass of the function.
        Applies the function to the input elementwise.
        SnakeBeta := x + 1/b * sin^2 (xa)
        '''
        # add leading and trailing axes so the parameters line up with x as [B, C, T]
        freq = self.alpha[None, ..., None]
        magnitude = self.beta[None, ..., None]
        if self.alpha_logscale:
            freq = torch.exp(freq)
            magnitude = torch.exp(magnitude)
        periodic = pow(sin(x * freq), 2)
        return x + (1.0 / (magnitude + self.no_div_by_zero)) * periodic

meanaudio/ext/bigvgan/alias_free_torch/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+from .filter import *
+from .resample import *
+from .act import *

meanaudio/ext/bigvgan/alias_free_torch/act.py ADDED Viewed

	@@ -0,0 +1,28 @@

+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+import torch.nn as nn
+from .resample import UpSample1d, DownSample1d
class Activation1d(nn.Module):
    """Apply an activation between an upsampling and a downsampling stage."""

    def __init__(self,
                 activation,
                 up_ratio: int = 2,
                 down_ratio: int = 2,
                 up_kernel_size: int = 12,
                 down_kernel_size: int = 12):
        """
        Args:
            activation: Module or callable applied to the upsampled signal.
            up_ratio: Ratio given to ``UpSample1d``.
            down_ratio: Ratio given to ``DownSample1d``.
            up_kernel_size: Kernel size given to ``UpSample1d``.
            down_kernel_size: Kernel size given to ``DownSample1d``.
        """
        super().__init__()
        self.up_ratio, self.down_ratio = up_ratio, down_ratio
        self.act = activation
        self.upsample = UpSample1d(up_ratio, up_kernel_size)
        self.downsample = DownSample1d(down_ratio, down_kernel_size)

    # x: [B,C,T]
    def forward(self, x):
        """Return ``downsample(act(upsample(x)))``."""
        return self.downsample(self.act(self.upsample(x)))

meanaudio/ext/bigvgan/alias_free_torch/filter.py ADDED Viewed

	@@ -0,0 +1,95 @@

+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
try:
    # Use the built-in when this torch version provides it.
    sinc = torch.sinc
except AttributeError:
    # This code is adapted from adefossez's julius.core.sinc under the MIT License
    # https://adefossez.github.io/julius/julius/core.html
    #   LICENSE is in incl_licenses directory.
    def sinc(x: torch.Tensor):
        """
        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
        """
        one = torch.tensor(1., device=x.device, dtype=x.dtype)
        ratio = torch.sin(math.pi * x) / math.pi / x
        # the ratio is undefined at x == 0, where sinc is defined as 1
        return torch.where(x == 0, one, ratio)
+# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
+# https://adefossez.github.io/julius/julius/lowpass.html
+#   LICENSE is in incl_licenses directory.
def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size]
    """Build a Kaiser-windowed sinc low-pass kernel.

    Args:
        cutoff: Cutoff used in ``2 * cutoff * sinc(2 * cutoff * t)``.
            ``0`` yields an all-zero kernel.
        half_width: Transition half-width; sets the Kaiser ``beta``.
        kernel_size: Number of taps.

    Returns:
        Tensor of shape ``(1, 1, kernel_size)``. For ``cutoff != 0`` the
        taps are normalized to sum to 1.
    """
    even = (kernel_size % 2 == 0)
    half_size = kernel_size // 2

    # For kaiser window
    delta_f = 4 * half_width
    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
    if A > 50.:
        beta = 0.1102 * (A - 8.7)
    elif A >= 21.:
        beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.)
    else:
        beta = 0.
    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)

    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
    if even:
        time = (torch.arange(-half_size, half_size) + 0.5)
    else:
        time = torch.arange(kernel_size) - half_size

    if cutoff == 0:
        # `time` is an integer tensor for odd kernel sizes; use the window's
        # dtype so the zero kernel is floating point like the non-zero one.
        filter_ = torch.zeros_like(time, dtype=window.dtype)
    else:
        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
        # Normalize filter to have sum = 1, otherwise we will have a small leakage
        # of the constant component in the input signal.
        filter_ /= filter_.sum()

    # Reshape on both branches: previously this happened only in the `else`
    # branch, so cutoff == 0 raised UnboundLocalError at the return.
    return filter_.view(1, 1, kernel_size)
class LowPassFilter1d(nn.Module):
    """Depthwise 1D low-pass filter using a Kaiser-windowed sinc kernel."""

    def __init__(self,
                 cutoff=0.5,
                 half_width=0.6,
                 stride: int = 1,
                 padding: bool = True,
                 padding_mode: str = 'replicate',
                 kernel_size: int = 12):
        """
        Args:
            cutoff: Filter cutoff; must lie in ``[0, 0.5]``.
            half_width: Transition half-width given to ``kaiser_sinc_filter1d``.
            stride: Stride of the convolution in ``forward``.
            padding: Whether ``forward`` pads the input before filtering.
            padding_mode: Mode given to ``F.pad``.
            kernel_size: Number of filter taps.

        Raises:
            ValueError: If ``cutoff`` is negative or greater than 0.5.
        """
        # kernel_size should be even number for stylegan3 setup,
        # in this implementation, odd number is also possible.
        super().__init__()
        if cutoff < -0.:
            raise ValueError("Minimum cutoff must be larger than zero.")
        if cutoff > 0.5:
            raise ValueError("A cutoff above 0.5 does not make sense.")

        is_even = (kernel_size % 2 == 0)
        self.kernel_size = kernel_size
        self.even = is_even
        # an even kernel pads one sample less on the left than on the right
        self.pad_left = kernel_size // 2 - int(is_even)
        self.pad_right = kernel_size // 2
        self.stride = stride
        self.padding = padding
        self.padding_mode = padding_mode
        self.register_buffer("filter", kaiser_sinc_filter1d(cutoff, half_width, kernel_size))

    #input [B, C, T]
    def forward(self, x):
        """Filter every channel of ``x`` with the same kernel."""
        _batch, channels, _length = x.shape
        if self.padding:
            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
        # one copy of the kernel per channel, applied as a grouped conv
        kernel = self.filter.expand(channels, -1, -1)
        return F.conv1d(x, kernel, stride=self.stride, groups=channels)

meanaudio/ext/bigvgan/alias_free_torch/resample.py ADDED Viewed

	@@ -0,0 +1,49 @@

+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+import torch.nn as nn
+from torch.nn import functional as F
+from .filter import LowPassFilter1d
+from .filter import kaiser_sinc_filter1d
class UpSample1d(nn.Module):
    """Upsample by ``ratio`` with a transposed conv using a Kaiser-sinc kernel."""

    def __init__(self, ratio=2, kernel_size=None):
        """
        Args:
            ratio: Upsampling factor (also the transposed-conv stride).
            kernel_size: Filter length; defaults to ``int(6 * ratio // 2) * 2``.
        """
        super().__init__()
        if kernel_size is None:
            kernel_size = int(6 * ratio // 2) * 2
        self.ratio = ratio
        self.kernel_size = kernel_size
        self.stride = ratio
        # input padding, and how much of the transposed-conv output to trim per side
        self.pad = kernel_size // ratio - 1
        overhang = kernel_size - self.stride
        self.pad_left = self.pad * self.stride + overhang // 2
        self.pad_right = self.pad * self.stride + (overhang + 1) // 2
        self.register_buffer(
            "filter",
            kaiser_sinc_filter1d(cutoff=0.5 / ratio,
                                 half_width=0.6 / ratio,
                                 kernel_size=kernel_size))

    # x: [B, C, T]
    def forward(self, x):
        """Return ``x`` upsampled along the last axis."""
        _batch, channels, _length = x.shape
        padded = F.pad(x, (self.pad, self.pad), mode='replicate')
        # one copy of the kernel per channel, applied as a grouped transposed conv
        kernel = self.filter.expand(channels, -1, -1)
        upsampled = self.ratio * F.conv_transpose1d(
            padded, kernel, stride=self.stride, groups=channels)
        return upsampled[..., self.pad_left:-self.pad_right]
class DownSample1d(nn.Module):
    """Downsample by ``ratio`` using a strided ``LowPassFilter1d``."""

    def __init__(self, ratio=2, kernel_size=None):
        """
        Args:
            ratio: Downsampling factor (stride of the low-pass filter).
            kernel_size: Filter length; defaults to ``int(6 * ratio // 2) * 2``.
        """
        super().__init__()
        if kernel_size is None:
            kernel_size = int(6 * ratio // 2) * 2
        self.ratio = ratio
        self.kernel_size = kernel_size
        self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio,
                                       half_width=0.6 / ratio,
                                       stride=ratio,
                                       kernel_size=kernel_size)

    def forward(self, x):
        """Return the low-pass filtered, strided version of ``x``."""
        return self.lowpass(x)

meanaudio/ext/bigvgan/bigvgan.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from pathlib import Path
+import torch
+import torch.nn as nn
+from omegaconf import OmegaConf
+from meanaudio.ext.bigvgan.models import BigVGANVocoder
# Default vocoder config: bigvgan_vocoder.yml next to this module.
_bigvgan_vocoder_path = Path(__file__).with_name('bigvgan_vocoder.yml')


class BigVGAN(nn.Module):
    """Wrapper that builds a ``BigVGANVocoder`` from a config file, loads its
    generator weights from a checkpoint, and removes weight norm.
    """

    def __init__(self, ckpt_path, config_path=_bigvgan_vocoder_path):
        """
        Args:
            ckpt_path: Checkpoint file; its ``'generator'`` entry is loaded
                as the vocoder state dict.
            config_path: Config file loaded with ``OmegaConf.load``.
        """
        super().__init__()
        self.vocoder = BigVGANVocoder(OmegaConf.load(config_path)).eval()
        checkpoint = torch.load(ckpt_path, map_location='cpu', weights_only=True)
        self.vocoder.load_state_dict(checkpoint['generator'])

        self.weight_norm_removed = False
        self.remove_weight_norm()

    @torch.inference_mode()
    def forward(self, x):
        """Run the vocoder on ``x`` under inference mode."""
        assert self.weight_norm_removed, 'call remove_weight_norm() before inference'
        return self.vocoder(x)

    def remove_weight_norm(self):
        """Remove weight norm from the vocoder and record that it was done; returns ``self``."""
        self.vocoder.remove_weight_norm()
        self.weight_norm_removed = True
        return self

meanaudio/ext/bigvgan/bigvgan_vocoder.yml ADDED Viewed

	@@ -0,0 +1,63 @@

+resblock: '1'
+num_gpus: 0
+batch_size: 64
+num_mels: 80
+learning_rate: 0.0001
+adam_b1: 0.8
+adam_b2: 0.99
+lr_decay: 0.999
+seed: 1234
+upsample_rates:
+- 4
+- 4
+- 2
+- 2
+- 2
+- 2
+upsample_kernel_sizes:
+- 8
+- 8
+- 4
+- 4
+- 4
+- 4
+upsample_initial_channel: 1536
+resblock_kernel_sizes:
+- 3
+- 7
+- 11
+resblock_dilation_sizes:
+- - 1
+  - 3
+  - 5
+- - 1
+  - 3
+  - 5
+- - 1
+  - 3
+  - 5
+activation: snakebeta
+snake_logscale: true
+resolutions:
+- - 1024
+  - 120
+  - 600
+- - 2048
+  - 240
+  - 1200
+- - 512
+  - 50
+  - 240
+mpd_reshapes:
+- 2
+- 3
+- 5
+- 7
+- 11
+use_spectral_norm: false
+discriminator_channel_mult: 1
+num_workers: 4
+dist_config:
+  dist_backend: nccl
+  dist_url: tcp://localhost:54341
+  world_size: 1

meanaudio/ext/bigvgan/env.py ADDED Viewed

	@@ -0,0 +1,18 @@

+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+#   LICENSE is in incl_licenses directory.
+import os
+import shutil
class AttrDict(dict):
    """A dict whose items are also readable and writable as attributes."""

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as ``dict``."""
        super().__init__(*args, **kwargs)
        # Use the dict itself as the attribute namespace, so `d.key` and
        # `d['key']` always refer to the same entry.
        self.__dict__ = self
def build_env(config, config_name, path):
    """Copy the config file ``config`` to ``path/config_name``.

    Creates ``path`` if needed. Does nothing when ``config`` already equals
    the target path.
    """
    target = os.path.join(path, config_name)
    if config == target:
        return
    os.makedirs(path, exist_ok=True)
    shutil.copyfile(config, target)

meanaudio/ext/bigvgan/incl_licenses/LICENSE_1 ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2020 Jungil Kong
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

meanaudio/ext/bigvgan/incl_licenses/LICENSE_2 ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2020 Edward Dixon
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

meanaudio/ext/bigvgan/incl_licenses/LICENSE_3 ADDED Viewed

	@@ -0,0 +1,201 @@

+                                Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

meanaudio/ext/bigvgan/incl_licenses/LICENSE_4 ADDED Viewed

	@@ -0,0 +1,29 @@

+BSD 3-Clause License
+Copyright (c) 2019, Seungwon Park 박승원
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.