LPDoctor commited on
Commit
18db7f4
·
1 Parent(s): f94464b

Implement core functionality for ThinkSound audio generation app, including video processing, audio synthesis, and Gradio interface setup. Update README with new title and emoji.

Browse files
README.md CHANGED
@@ -1,13 +1,11 @@
1
  ---
2
- title: ThinkSound Audio App
3
- emoji: 👀
4
- colorFrom: pink
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.36.2
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: ThinkSound
3
+ emoji: 🔊
4
+ colorFrom: blue
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 5.35.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ ---
 
 
ThinkSound ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 9eeb443af2ab75afc046e27494c522dfd87fa2c1
app.py CHANGED
@@ -1,7 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
1
+ from prefigure.prefigure import get_all_args, push_wandb_config
2
+ import spaces
3
+ import json
4
+ import os
5
+ os.environ["GRADIO_TEMP_DIR"] = "./.gradio_tmp"
6
+ import re
7
+ import torch
8
+ import torchaudio
9
+ # import pytorch_lightning as pl
10
+ import lightning as L
11
+ from lightning.pytorch.callbacks import Timer, ModelCheckpoint, BasePredictionWriter
12
+ from lightning.pytorch.callbacks import Callback
13
+ from lightning.pytorch.tuner import Tuner
14
+ from lightning.pytorch import seed_everything
15
+ import random
16
+ from datetime import datetime
17
+ from ThinkSound.data.datamodule import DataModule
18
+ from ThinkSound.models import create_model_from_config
19
+ from ThinkSound.models.utils import load_ckpt_state_dict, remove_weight_norm_from_model
20
+ from ThinkSound.training import create_training_wrapper_from_config, create_demo_callback_from_config
21
+ from ThinkSound.training.utils import copy_state_dict
22
+ from ThinkSound.inference.sampling import get_alphas_sigmas, sample, sample_discrete_euler
23
+ from data_utils.v2a_utils.feature_utils_224 import FeaturesUtils
24
+ from torch.utils.data import Dataset
25
+ from typing import Optional, Union
26
+ from torchvision.transforms import v2
27
+ from torio.io import StreamingMediaDecoder
28
+ from torchvision.utils import save_image
29
+ from transformers import AutoProcessor
30
+ import torch.nn.functional as F
31
  import gradio as gr
32
+ import tempfile
33
+ import subprocess
34
+ from huggingface_hub import hf_hub_download
35
+ from moviepy.editor import VideoFileClip
36
+ # os.system("conda install -c conda-forge 'ffmpeg<7'")
37
+
38
+ _CLIP_SIZE = 224
39
+ _CLIP_FPS = 8.0
40
+
41
+ _SYNC_SIZE = 224
42
+ _SYNC_FPS = 25.0
43
+
44
+ def pad_to_square(video_tensor):
45
+ if len(video_tensor.shape) != 4:
46
+ raise ValueError("Input tensor must have shape (l, c, h, w)")
47
+
48
+ l, c, h, w = video_tensor.shape
49
+ max_side = max(h, w)
50
+
51
+ pad_h = max_side - h
52
+ pad_w = max_side - w
53
+
54
+ padding = (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)
55
+
56
+ video_padded = F.pad(video_tensor, pad=padding, mode='constant', value=0)
57
+
58
+ return video_padded
59
+
60
+
61
+ class VGGSound(Dataset):
62
+
63
+ def __init__(
64
+ self,
65
+ sample_rate: int = 44_100,
66
+ duration_sec: float = 9.0,
67
+ audio_samples: int = None,
68
+ normalize_audio: bool = False,
69
+ ):
70
+ if audio_samples is None:
71
+ self.audio_samples = int(sample_rate * duration_sec)
72
+ else:
73
+ self.audio_samples = audio_samples
74
+ effective_duration = audio_samples / sample_rate
75
+ # make sure the duration is close enough, within 15ms
76
+ assert abs(effective_duration - duration_sec) < 0.015, \
77
+ f'audio_samples {audio_samples} does not match duration_sec {duration_sec}'
78
+
79
+ self.sample_rate = sample_rate
80
+ self.duration_sec = duration_sec
81
+
82
+ self.expected_audio_length = self.audio_samples
83
+ self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
84
+ self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
85
+
86
+ self.clip_transform = v2.Compose([
87
+ v2.Lambda(pad_to_square), # 先填充为正方形
88
+ v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
89
+ v2.ToImage(),
90
+ v2.ToDtype(torch.float32, scale=True),
91
+ ])
92
+ self.clip_processor = AutoProcessor.from_pretrained("facebook/metaclip-h14-fullcc2.5b")
93
+ self.sync_transform = v2.Compose([
94
+ v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
95
+ v2.CenterCrop(_SYNC_SIZE),
96
+ v2.ToImage(),
97
+ v2.ToDtype(torch.float32, scale=True),
98
+ v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
99
+ ])
100
+
101
+ self.resampler = {}
102
+
103
+ def sample(self, video_path,label,cot):
104
+ video_id = video_path
105
+
106
+ reader = StreamingMediaDecoder(video_path)
107
+ reader.add_basic_video_stream(
108
+ frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
109
+ frame_rate=_CLIP_FPS,
110
+ format='rgb24',
111
+ )
112
+ reader.add_basic_video_stream(
113
+ frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
114
+ frame_rate=_SYNC_FPS,
115
+ format='rgb24',
116
+ )
117
+
118
+ reader.fill_buffer()
119
+ data_chunk = reader.pop_chunks()
120
+
121
+ clip_chunk = data_chunk[0]
122
+ sync_chunk = data_chunk[1]
123
+
124
+ if sync_chunk is None:
125
+ raise RuntimeError(f'Sync video returned None {video_id}')
126
+
127
+ clip_chunk = clip_chunk[:self.clip_expected_length]
128
+ # import ipdb
129
+ # ipdb.set_trace()
130
+ if clip_chunk.shape[0] != self.clip_expected_length:
131
+ current_length = clip_chunk.shape[0]
132
+ padding_needed = self.clip_expected_length - current_length
133
+
134
+ # Check that padding needed is no more than 2
135
+ assert padding_needed < 4, f'Padding no more than 2 frames allowed, but {padding_needed} needed'
136
+
137
+ # If assertion passes, proceed with padding
138
+ if padding_needed > 0:
139
+ last_frame = clip_chunk[-1]
140
+ log.info(last_frame.shape)
141
+ # Repeat the last frame to reach the expected length
142
+ padding = last_frame.repeat(padding_needed, 1, 1, 1)
143
+ clip_chunk = torch.cat((clip_chunk, padding), dim=0)
144
+ # raise RuntimeError(f'CLIP video wrong length {video_id}, '
145
+ # f'expected {self.clip_expected_length}, '
146
+ # f'got {clip_chunk.shape[0]}')
147
+
148
+ # save_image(clip_chunk[0] / 255.0,'ori.png')
149
+ clip_chunk = pad_to_square(clip_chunk)
150
+
151
+ clip_chunk = self.clip_processor(images=clip_chunk, return_tensors="pt")["pixel_values"]
152
+
153
+ sync_chunk = sync_chunk[:self.sync_expected_length]
154
+ if sync_chunk.shape[0] != self.sync_expected_length:
155
+ # padding using the last frame, but no more than 2
156
+ current_length = sync_chunk.shape[0]
157
+ last_frame = sync_chunk[-1]
158
+
159
+ padding = last_frame.repeat(self.sync_expected_length - current_length, 1, 1, 1)
160
+ assert self.sync_expected_length - current_length < 12, f'sync can pad no more than 2 while {self.sync_expected_length - current_length}'
161
+ sync_chunk = torch.cat((sync_chunk, padding), dim=0)
162
+ # raise RuntimeError(f'Sync video wrong length {video_id}, '
163
+ # f'expected {self.sync_expected_length}, '
164
+ # f'got {sync_chunk.shape[0]}')
165
+
166
+ sync_chunk = self.sync_transform(sync_chunk)
167
+ # assert audio_chunk.shape[1] == self.expected_audio_length and clip_chunk.shape[0] == self.clip_expected_length \
168
+ # and sync_chunk.shape[0] == self.sync_expected_length, 'error processed data shape'
169
+ data = {
170
+ 'id': video_id,
171
+ 'caption': label,
172
+ 'caption_cot': cot,
173
+ # 'audio': audio_chunk,
174
+ 'clip_video': clip_chunk,
175
+ 'sync_video': sync_chunk,
176
+ }
177
+
178
+ return data
179
+
180
+ # 检查设备
181
+ if torch.cuda.is_available():
182
+ device = 'cuda'
183
+ extra_device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'
184
+ else:
185
+ device = 'cpu'
186
+ extra_device = 'cpu'
187
+
188
+ print(f"load in device {device}")
189
+
190
+ vae_ckpt = hf_hub_download(repo_id="FunAudioLLM/ThinkSound", filename="vae.ckpt",repo_type="model")
191
+ synchformer_ckpt = hf_hub_download(repo_id="FunAudioLLM/ThinkSound", filename="synchformer_state_dict.pth",repo_type="model")
192
+
193
+ feature_extractor = FeaturesUtils(
194
+ vae_ckpt=None,
195
+ vae_config='ThinkSound/configs/model_configs/stable_audio_2_0_vae.json',
196
+ enable_conditions=True,
197
+ synchformer_ckpt=synchformer_ckpt
198
+ ).eval().to(extra_device)
199
+
200
+ args = get_all_args()
201
+
202
+ seed = 10086
203
+
204
+ seed_everything(seed, workers=True)
205
+
206
+
207
+ #Get JSON config from args.model_config
208
+ with open("ThinkSound/configs/model_configs/thinksound.json") as f:
209
+ model_config = json.load(f)
210
+
211
+ diffusion_model = create_model_from_config(model_config)
212
+ ckpt_path = hf_hub_download(repo_id="FunAudioLLM/ThinkSound", filename="thinksound_light.ckpt",repo_type="model")
213
+ diffusion_model.load_state_dict(torch.load(ckpt_path))
214
+ diffusion_model.to(device)
215
+
216
+ ## speed by torch.compile
217
+ if args.compile:
218
+ diffusion_model = torch.compile(diffusion_model)
219
+
220
+
221
+ load_vae_state = load_ckpt_state_dict(vae_ckpt, prefix='autoencoder.')
222
+ # new_state_dict = {k.replace("autoencoder.", ""): v for k, v in load_vae_state.items() if k.startswith("autoencoder.")}
223
+ diffusion_model.pretransform.load_state_dict(load_vae_state)
224
+
225
+ def get_video_duration(video_path):
226
+ video = VideoFileClip(video_path)
227
+ return video.duration
228
+
229
+ @spaces.GPU(duration=60)
230
+ @torch.inference_mode()
231
+ @torch.no_grad()
232
+ def synthesize_video_with_audio(video_file, caption, cot):
233
+ yield "⏳ Extracting Features…", None
234
+ video_path = video_file
235
+ if caption is None:
236
+ caption = ''
237
+ if cot is None:
238
+ cot = caption
239
+ timer = Timer(duration="00:15:00:00")
240
+ #get video duration
241
+ duration_sec = get_video_duration(video_path)
242
+ print(duration_sec)
243
+ preprocesser = VGGSound(duration_sec=duration_sec)
244
+ data = preprocesser.sample(video_path, caption, cot)
245
+
246
+
247
+ preprocessed_data = {}
248
+ metaclip_global_text_features, metaclip_text_features = feature_extractor.encode_text(data['caption'])
249
+ preprocessed_data['metaclip_global_text_features'] = metaclip_global_text_features.detach().cpu().squeeze(0)
250
+ preprocessed_data['metaclip_text_features'] = metaclip_text_features.detach().cpu().squeeze(0)
251
+
252
+ t5_features = feature_extractor.encode_t5_text(data['caption_cot'])
253
+ preprocessed_data['t5_features'] = t5_features.detach().cpu().squeeze(0)
254
+
255
+ clip_features = feature_extractor.encode_video_with_clip(data['clip_video'].unsqueeze(0).to(extra_device))
256
+ preprocessed_data['metaclip_features'] = clip_features.detach().cpu().squeeze(0)
257
+
258
+ sync_features = feature_extractor.encode_video_with_sync(data['sync_video'].unsqueeze(0).to(extra_device))
259
+ preprocessed_data['sync_features'] = sync_features.detach().cpu().squeeze(0)
260
+ preprocessed_data['video_exist'] = torch.tensor(True)
261
+ print("clip_shape", preprocessed_data['metaclip_features'].shape)
262
+ print("sync_shape", preprocessed_data['sync_features'].shape)
263
+ sync_seq_len = preprocessed_data['sync_features'].shape[0]
264
+ clip_seq_len = preprocessed_data['metaclip_features'].shape[0]
265
+ latent_seq_len = (int)(194/9*duration_sec)
266
+ diffusion_model.model.model.update_seq_lengths(latent_seq_len, clip_seq_len, sync_seq_len)
267
+
268
+ metadata = [preprocessed_data]
269
+
270
+ batch_size = 1
271
+ length = latent_seq_len
272
+ with torch.amp.autocast(device):
273
+ conditioning = diffusion_model.conditioner(metadata, device)
274
+
275
+ video_exist = torch.stack([item['video_exist'] for item in metadata],dim=0)
276
+ conditioning['metaclip_features'][~video_exist] = diffusion_model.model.model.empty_clip_feat
277
+ conditioning['sync_features'][~video_exist] = diffusion_model.model.model.empty_sync_feat
278
+
279
+ yield "⏳ Inferring…", None
280
+
281
+ cond_inputs = diffusion_model.get_conditioning_inputs(conditioning)
282
+ noise = torch.randn([batch_size, diffusion_model.io_channels, length]).to(device)
283
+ with torch.amp.autocast(device):
284
+ if diffusion_model.diffusion_objective == "v":
285
+ fakes = sample(diffusion_model.model, noise, 24, 0, **cond_inputs, cfg_scale=5, batch_cfg=True)
286
+ elif diffusion_model.diffusion_objective == "rectified_flow":
287
+ import time
288
+ start_time = time.time()
289
+ fakes = sample_discrete_euler(diffusion_model.model, noise, 24, **cond_inputs, cfg_scale=5, batch_cfg=True)
290
+ end_time = time.time()
291
+ execution_time = end_time - start_time
292
+ print(f"execution_time: {execution_time:.2f} 秒")
293
+
294
+ if diffusion_model.pretransform is not None:
295
+ fakes = diffusion_model.pretransform.decode(fakes)
296
+
297
+ audios = fakes.to(torch.float32).div(torch.max(torch.abs(fakes))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
298
+ with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_audio:
299
+ torchaudio.save(tmp_audio.name, audios[0], 44100)
300
+ audio_path = tmp_audio.name
301
+
302
+ with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_video:
303
+ output_video_path = tmp_video.name
304
+
305
+ cmd = [
306
+ 'ffmpeg', '-y', '-i', video_file, '-i', audio_path,
307
+ '-c:v', 'copy', '-map', '0:v:0', '-map', '1:a:0',
308
+ '-shortest', output_video_path
309
+ ]
310
+ subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
311
+
312
+ # return output_video_path
313
+ yield "✅ Generation completed!", output_video_path
314
+
315
+ demo = gr.Interface(
316
+ fn=synthesize_video_with_audio,
317
+ inputs=[
318
+ gr.Video(label="Upload Video"),
319
+ gr.Textbox(label="Caption (optional)", placeholder="can be empty",),
320
+ gr.Textbox(label="CoT Description (optional)", lines=6, placeholder="can be empty",),
321
+ ],
322
+ outputs=[
323
+ gr.Text(label="Status"),
324
+ gr.Video(label="Result"),
325
+ ],
326
+ title="ThinkSound Demo",
327
+ description="Upload a video, caption, or CoT to generate audio. For an enhanced experience, we automatically merge the generated audio with your original silent video. (Note: Flexible audio generation lengths are supported.:)",
328
+ examples=[
329
+ ["examples/3_mute.mp4", "Gentle Sucking Sounds From the Pacifier", "Begin by creating a soft, steady background of light pacifier suckling. Add subtle, breathy rhythms to mimic a newborn's gentle mouth movements. Keep the sound smooth, natural, and soothing."],
330
+ ["examples/2_mute.mp4", "Printer Printing", "Generate a continuous printer printing sound with periodic beeps and paper movement, plus a cat pawing at the machine. Add subtle ambient room noise for authenticity, keeping the focus on printing, beeps, and the cat's interaction."],
331
+ ["examples/5_mute.mp4", "Lighting Firecrackers", "Generate the sound of firecrackers lighting and exploding repeatedly on the ground, followed by fireworks bursting in the sky. Incorporate occasional subtle echoes to mimic an outdoor night ambiance, with no human voices present."],
332
+ ["examples/4_mute.mp4", "Plastic Debris Handling", "Begin with the sound of hands scooping up loose plastic debris, followed by the subtle cascading noise as the pieces fall and scatter back down. Include soft crinkling and rustling to emphasize the texture of the plastic. Add ambient factory background noise with distant machinery to create an industrial atmosphere."]
333
+ ],
334
+ cache_examples=True
335
+ )
336
+
337
+ if __name__ == "__main__":
338
+ demo.queue().launch(share=True)
339
+
340
+ demo.launch(share=True)
341
+
342
 
 
 
343
 
 
 
cot_vgg_demo_caption.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ demo.npz
data_utils/ext/synchformer/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Vladimir Iashin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data_utils/ext/synchformer/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from data_utils.ext.synchformer.synchformer import Synchformer
data_utils/ext/synchformer/divided_224_16x4.yaml ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ TRAIN:
2
+ ENABLE: True
3
+ DATASET: Ssv2
4
+ BATCH_SIZE: 32
5
+ EVAL_PERIOD: 5
6
+ CHECKPOINT_PERIOD: 5
7
+ AUTO_RESUME: True
8
+ CHECKPOINT_EPOCH_RESET: True
9
+ CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth
10
+ DATA:
11
+ NUM_FRAMES: 16
12
+ SAMPLING_RATE: 4
13
+ TRAIN_JITTER_SCALES: [256, 320]
14
+ TRAIN_CROP_SIZE: 224
15
+ TEST_CROP_SIZE: 224
16
+ INPUT_CHANNEL_NUM: [3]
17
+ MEAN: [0.5, 0.5, 0.5]
18
+ STD: [0.5, 0.5, 0.5]
19
+ PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2
20
+ PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames
21
+ INV_UNIFORM_SAMPLE: True
22
+ RANDOM_FLIP: False
23
+ REVERSE_INPUT_CHANNEL: True
24
+ USE_RAND_AUGMENT: True
25
+ RE_PROB: 0.0
26
+ USE_REPEATED_AUG: False
27
+ USE_RANDOM_RESIZE_CROPS: False
28
+ COLORJITTER: False
29
+ GRAYSCALE: False
30
+ GAUSSIAN: False
31
+ SOLVER:
32
+ BASE_LR: 1e-4
33
+ LR_POLICY: steps_with_relative_lrs
34
+ LRS: [1, 0.1, 0.01]
35
+ STEPS: [0, 20, 30]
36
+ MAX_EPOCH: 35
37
+ MOMENTUM: 0.9
38
+ WEIGHT_DECAY: 5e-2
39
+ WARMUP_EPOCHS: 0.0
40
+ OPTIMIZING_METHOD: adamw
41
+ USE_MIXED_PRECISION: True
42
+ SMOOTHING: 0.2
43
+ SLOWFAST:
44
+ ALPHA: 8
45
+ VIT:
46
+ PATCH_SIZE: 16
47
+ PATCH_SIZE_TEMP: 2
48
+ CHANNELS: 3
49
+ EMBED_DIM: 768
50
+ DEPTH: 12
51
+ NUM_HEADS: 12
52
+ MLP_RATIO: 4
53
+ QKV_BIAS: True
54
+ VIDEO_INPUT: True
55
+ TEMPORAL_RESOLUTION: 8
56
+ USE_MLP: True
57
+ DROP: 0.0
58
+ POS_DROPOUT: 0.0
59
+ DROP_PATH: 0.2
60
+ IM_PRETRAINED: True
61
+ HEAD_DROPOUT: 0.0
62
+ HEAD_ACT: tanh
63
+ PRETRAINED_WEIGHTS: vit_1k
64
+ ATTN_LAYER: divided
65
+ MODEL:
66
+ NUM_CLASSES: 174
67
+ ARCH: slow
68
+ MODEL_NAME: VisionTransformer
69
+ LOSS_FUNC: cross_entropy
70
+ TEST:
71
+ ENABLE: True
72
+ DATASET: Ssv2
73
+ BATCH_SIZE: 64
74
+ NUM_ENSEMBLE_VIEWS: 1
75
+ NUM_SPATIAL_CROPS: 3
76
+ DATA_LOADER:
77
+ NUM_WORKERS: 4
78
+ PIN_MEMORY: True
79
+ NUM_GPUS: 8
80
+ NUM_SHARDS: 4
81
+ RNG_SEED: 0
82
+ OUTPUT_DIR: .
83
+ TENSORBOARD:
84
+ ENABLE: True
data_utils/ext/synchformer/motionformer.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ import einops
5
+ import torch
6
+ from omegaconf import OmegaConf
7
+ from timm.layers import trunc_normal_
8
+ from torch import nn
9
+
10
+ from data_utils.ext.synchformer.utils import check_if_file_exists_else_download
11
+ from data_utils.ext.synchformer.video_model_builder import VisionTransformer
12
+
13
+ FILE2URL = {
14
+ # cfg
15
+ 'motionformer_224_16x4.yaml':
16
+ 'https://raw.githubusercontent.com/facebookresearch/Motionformer/bf43d50/configs/SSV2/motionformer_224_16x4.yaml',
17
+ 'joint_224_16x4.yaml':
18
+ 'https://raw.githubusercontent.com/facebookresearch/Motionformer/bf43d50/configs/SSV2/joint_224_16x4.yaml',
19
+ 'divided_224_16x4.yaml':
20
+ 'https://raw.githubusercontent.com/facebookresearch/Motionformer/bf43d50/configs/SSV2/divided_224_16x4.yaml',
21
+ # ckpt
22
+ 'ssv2_motionformer_224_16x4.pyth':
23
+ 'https://dl.fbaipublicfiles.com/motionformer/ssv2_motionformer_224_16x4.pyth',
24
+ 'ssv2_joint_224_16x4.pyth':
25
+ 'https://dl.fbaipublicfiles.com/motionformer/ssv2_joint_224_16x4.pyth',
26
+ 'ssv2_divided_224_16x4.pyth':
27
+ 'https://dl.fbaipublicfiles.com/motionformer/ssv2_divided_224_16x4.pyth',
28
+ }
29
+
30
+
31
+ class MotionFormer(VisionTransformer):
32
+ ''' This class serves three puposes:
33
+ 1. Renames the class to MotionFormer.
34
+ 2. Downloads the cfg from the original repo and patches it if needed.
35
+ 3. Takes care of feature extraction by redefining .forward()
36
+ - if `extract_features=True` and `factorize_space_time=False`,
37
+ the output is of shape (B, T, D) where T = 1 + (224 // 16) * (224 // 16) * 8
38
+ - if `extract_features=True` and `factorize_space_time=True`, the output is of shape (B*S, D)
39
+ and spatial and temporal transformer encoder layers are used.
40
+ - if `extract_features=True` and `factorize_space_time=True` as well as `add_global_repr=True`
41
+ the output is of shape (B, D) and spatial and temporal transformer encoder layers
42
+ are used as well as the global representation is extracted from segments (extra pos emb
43
+ is added).
44
+ '''
45
+
46
+ def __init__(
47
+ self,
48
+ extract_features: bool = False,
49
+ ckpt_path: str = None,
50
+ factorize_space_time: bool = None,
51
+ agg_space_module: str = None,
52
+ agg_time_module: str = None,
53
+ add_global_repr: bool = True,
54
+ agg_segments_module: str = None,
55
+ max_segments: int = None,
56
+ ):
57
+ self.extract_features = extract_features
58
+ self.ckpt_path = ckpt_path
59
+ self.factorize_space_time = factorize_space_time
60
+
61
+ if self.ckpt_path is not None:
62
+ check_if_file_exists_else_download(self.ckpt_path, FILE2URL)
63
+ ckpt = torch.load(self.ckpt_path, map_location='cpu')
64
+ mformer_ckpt2cfg = {
65
+ 'ssv2_motionformer_224_16x4.pyth': 'motionformer_224_16x4.yaml',
66
+ 'ssv2_joint_224_16x4.pyth': 'joint_224_16x4.yaml',
67
+ 'ssv2_divided_224_16x4.pyth': 'divided_224_16x4.yaml',
68
+ }
69
+ # init from motionformer ckpt or from our Stage I ckpt
70
+ # depending on whether the feat extractor was pre-trained on AVCLIPMoCo or not, we need to
71
+ # load the state dict differently
72
+ was_pt_on_avclip = self.ckpt_path.endswith(
73
+ '.pt') # checks if it is a stage I ckpt (FIXME: a bit generic)
74
+ if self.ckpt_path.endswith(tuple(mformer_ckpt2cfg.keys())):
75
+ cfg_fname = mformer_ckpt2cfg[Path(self.ckpt_path).name]
76
+ elif was_pt_on_avclip:
77
+ # TODO: this is a hack, we should be able to get the cfg from the ckpt (earlier ckpt didn't have it)
78
+ s1_cfg = ckpt.get('args', None) # Stage I cfg
79
+ if s1_cfg is not None:
80
+ s1_vfeat_extractor_ckpt_path = s1_cfg.model.params.vfeat_extractor.params.ckpt_path
81
+ # if the stage I ckpt was initialized from a motionformer ckpt or train from scratch
82
+ if s1_vfeat_extractor_ckpt_path is not None:
83
+ cfg_fname = mformer_ckpt2cfg[Path(s1_vfeat_extractor_ckpt_path).name]
84
+ else:
85
+ cfg_fname = 'divided_224_16x4.yaml'
86
+ else:
87
+ cfg_fname = 'divided_224_16x4.yaml'
88
+ else:
89
+ raise ValueError(f'ckpt_path {self.ckpt_path} is not supported.')
90
+ else:
91
+ was_pt_on_avclip = False
92
+ cfg_fname = 'divided_224_16x4.yaml'
93
+ # logging.info(f'No ckpt_path provided, using {cfg_fname} config.')
94
+
95
+ if cfg_fname in ['motionformer_224_16x4.yaml', 'divided_224_16x4.yaml']:
96
+ pos_emb_type = 'separate'
97
+ elif cfg_fname == 'joint_224_16x4.yaml':
98
+ pos_emb_type = 'joint'
99
+
100
+ self.mformer_cfg_path = Path(__file__).absolute().parent / cfg_fname
101
+
102
+ check_if_file_exists_else_download(self.mformer_cfg_path, FILE2URL)
103
+ mformer_cfg = OmegaConf.load(self.mformer_cfg_path)
104
+ logging.info(f'Loading MotionFormer config from {self.mformer_cfg_path.absolute()}')
105
+
106
+ # patch the cfg (from the default cfg defined in the repo `Motionformer/slowfast/config/defaults.py`)
107
+ mformer_cfg.VIT.ATTN_DROPOUT = 0.0
108
+ mformer_cfg.VIT.POS_EMBED = pos_emb_type
109
+ mformer_cfg.VIT.USE_ORIGINAL_TRAJ_ATTN_CODE = True
110
+ mformer_cfg.VIT.APPROX_ATTN_TYPE = 'none' # guessing
111
+ mformer_cfg.VIT.APPROX_ATTN_DIM = 64 # from ckpt['cfg']
112
+
113
+ # finally init VisionTransformer with the cfg
114
+ super().__init__(mformer_cfg)
115
+
116
+ # load the ckpt now if ckpt is provided and not from AVCLIPMoCo-pretrained ckpt
117
+ if (self.ckpt_path is not None) and (not was_pt_on_avclip):
118
+ _ckpt_load_status = self.load_state_dict(ckpt['model_state'], strict=False)
119
+ if len(_ckpt_load_status.missing_keys) > 0 or len(
120
+ _ckpt_load_status.unexpected_keys) > 0:
121
+ logging.warning(f'Loading exact vfeat_extractor ckpt from {self.ckpt_path} failed.' \
122
+ f'Missing keys: {_ckpt_load_status.missing_keys}, ' \
123
+ f'Unexpected keys: {_ckpt_load_status.unexpected_keys}')
124
+ else:
125
+ logging.info(f'Loading vfeat_extractor ckpt from {self.ckpt_path} succeeded.')
126
+
127
+ if self.extract_features:
128
+ assert isinstance(self.norm,
129
+ nn.LayerNorm), 'early x[:, 1:, :] may not be safe for per-tr weights'
130
+ # pre-logits are Sequential(nn.Linear(emb, emd), act) and `act` is tanh but see the logger
131
+ self.pre_logits = nn.Identity()
132
+ # we don't need the classification head (saving memory)
133
+ self.head = nn.Identity()
134
+ self.head_drop = nn.Identity()
135
+ # avoiding code duplication (used only if agg_*_module is TransformerEncoderLayer)
136
+ transf_enc_layer_kwargs = dict(
137
+ d_model=self.embed_dim,
138
+ nhead=self.num_heads,
139
+ activation=nn.GELU(),
140
+ batch_first=True,
141
+ dim_feedforward=self.mlp_ratio * self.embed_dim,
142
+ dropout=self.drop_rate,
143
+ layer_norm_eps=1e-6,
144
+ norm_first=True,
145
+ )
146
+ # define adapters if needed
147
+ if self.factorize_space_time:
148
+ if agg_space_module == 'TransformerEncoderLayer':
149
+ self.spatial_attn_agg = SpatialTransformerEncoderLayer(
150
+ **transf_enc_layer_kwargs)
151
+ elif agg_space_module == 'AveragePooling':
152
+ self.spatial_attn_agg = AveragePooling(avg_pattern='BS D t h w -> BS D t',
153
+ then_permute_pattern='BS D t -> BS t D')
154
+ if agg_time_module == 'TransformerEncoderLayer':
155
+ self.temp_attn_agg = TemporalTransformerEncoderLayer(**transf_enc_layer_kwargs)
156
+ elif agg_time_module == 'AveragePooling':
157
+ self.temp_attn_agg = AveragePooling(avg_pattern='BS t D -> BS D')
158
+ elif 'Identity' in agg_time_module:
159
+ self.temp_attn_agg = nn.Identity()
160
+ # define a global aggregation layer (aggregarate over segments)
161
+ self.add_global_repr = add_global_repr
162
+ if add_global_repr:
163
+ if agg_segments_module == 'TransformerEncoderLayer':
164
+ # we can reuse the same layer as for temporal factorization (B, dim_to_agg, D) -> (B, D)
165
+ # we need to add pos emb (PE) because previously we added the same PE for each segment
166
+ pos_max_len = max_segments if max_segments is not None else 16 # 16 = 10sec//0.64sec + 1
167
+ self.global_attn_agg = TemporalTransformerEncoderLayer(
168
+ add_pos_emb=True,
169
+ pos_emb_drop=mformer_cfg.VIT.POS_DROPOUT,
170
+ pos_max_len=pos_max_len,
171
+ **transf_enc_layer_kwargs)
172
+ elif agg_segments_module == 'AveragePooling':
173
+ self.global_attn_agg = AveragePooling(avg_pattern='B S D -> B D')
174
+
175
+ if was_pt_on_avclip:
176
+ # we need to filter out the state_dict of the AVCLIP model (has both A and V extractors)
177
+ # and keep only the state_dict of the feat extractor
178
+ ckpt_weights = dict()
179
+ for k, v in ckpt['state_dict'].items():
180
+ if k.startswith(('module.v_encoder.', 'v_encoder.')):
181
+ k = k.replace('module.', '').replace('v_encoder.', '')
182
+ ckpt_weights[k] = v
183
+ _load_status = self.load_state_dict(ckpt_weights, strict=False)
184
+ if len(_load_status.missing_keys) > 0 or len(_load_status.unexpected_keys) > 0:
185
+ logging.warning(f'Loading exact vfeat_extractor ckpt from {self.ckpt_path} failed. \n' \
186
+ f'Missing keys ({len(_load_status.missing_keys)}): ' \
187
+ f'{_load_status.missing_keys}, \n' \
188
+ f'Unexpected keys ({len(_load_status.unexpected_keys)}): ' \
189
+ f'{_load_status.unexpected_keys} \n' \
190
+ f'temp_attn_agg are expected to be missing if ckpt was pt contrastively.')
191
+ else:
192
+ logging.info(f'Loading vfeat_extractor ckpt from {self.ckpt_path} succeeded.')
193
+
194
+ # patch_embed is not used in MotionFormer, only patch_embed_3d, because cfg.VIT.PATCH_SIZE_TEMP > 1
195
+ # but it used to calculate the number of patches, so we need to set keep it
196
+ self.patch_embed.requires_grad_(False)
197
+
198
+ def forward(self, x):
199
+ '''
200
+ x is of shape (B, S, C, T, H, W) where S is the number of segments.
201
+ '''
202
+ # Batch, Segments, Channels, T=frames, Height, Width
203
+ B, S, C, T, H, W = x.shape
204
+ # Motionformer expects a tensor of shape (1, B, C, T, H, W).
205
+ # The first dimension (1) is a dummy dimension to make the input tensor and won't be used:
206
+ # see `video_model_builder.video_input`.
207
+ # x = x.unsqueeze(0) # (1, B, S, C, T, H, W)
208
+
209
+ orig_shape = (B, S, C, T, H, W)
210
+ x = x.view(B * S, C, T, H, W) # flatten batch and segments
211
+ x = self.forward_segments(x, orig_shape=orig_shape)
212
+ # unpack the segments (using rest dimensions to support different shapes e.g. (BS, D) or (BS, t, D))
213
+ x = x.view(B, S, *x.shape[1:])
214
+ # x is now of shape (B*S, D) or (B*S, t, D) if `self.temp_attn_agg` is `Identity`
215
+
216
+ return x # x is (B, S, ...)
217
+
218
+ def forward_segments(self, x, orig_shape: tuple) -> torch.Tensor:
219
+ '''x is of shape (1, BS, C, T, H, W) where S is the number of segments.'''
220
+ x, x_mask = self.forward_features(x)
221
+
222
+ assert self.extract_features
223
+
224
+ # (BS, T, D) where T = 1 + (224 // 16) * (224 // 16) * 8
225
+ x = x[:,
226
+ 1:, :] # without the CLS token for efficiency (should be safe for LayerNorm and FC)
227
+ x = self.norm(x)
228
+ x = self.pre_logits(x)
229
+ if self.factorize_space_time:
230
+ x = self.restore_spatio_temp_dims(x, orig_shape) # (B*S, D, t, h, w) <- (B*S, t*h*w, D)
231
+
232
+ x = self.spatial_attn_agg(x, x_mask) # (B*S, t, D)
233
+ x = self.temp_attn_agg(
234
+ x) # (B*S, D) or (BS, t, D) if `self.temp_attn_agg` is `Identity`
235
+
236
+ return x
237
+
238
+ def restore_spatio_temp_dims(self, feats: torch.Tensor, orig_shape: tuple) -> torch.Tensor:
239
+ '''
240
+ feats are of shape (B*S, T, D) where T = 1 + (224 // 16) * (224 // 16) * 8
241
+ Our goal is to make them of shape (B*S, t, h, w, D) where h, w are the spatial dimensions.
242
+ From `self.patch_embed_3d`, it follows that we could reshape feats with:
243
+ `feats.transpose(1, 2).view(B*S, D, t, h, w)`
244
+ '''
245
+ B, S, C, T, H, W = orig_shape
246
+ D = self.embed_dim
247
+
248
+ # num patches in each dimension
249
+ t = T // self.patch_embed_3d.z_block_size
250
+ h = self.patch_embed_3d.height
251
+ w = self.patch_embed_3d.width
252
+
253
+ feats = feats.permute(0, 2, 1) # (B*S, D, T)
254
+ feats = feats.view(B * S, D, t, h, w) # (B*S, D, t, h, w)
255
+
256
+ return feats
257
+
258
+
259
+ class BaseEncoderLayer(nn.TransformerEncoderLayer):
260
+ '''
261
+ This is a wrapper around nn.TransformerEncoderLayer that adds a CLS token
262
+ to the sequence and outputs the CLS token's representation.
263
+ This base class parents both SpatialEncoderLayer and TemporalEncoderLayer for the RGB stream
264
+ and the FrequencyEncoderLayer and TemporalEncoderLayer for the audio stream stream.
265
+ We also, optionally, add a positional embedding to the input sequence which
266
+ allows to reuse it for global aggregation (of segments) for both streams.
267
+ '''
268
+
269
+ def __init__(self,
270
+ add_pos_emb: bool = False,
271
+ pos_emb_drop: float = None,
272
+ pos_max_len: int = None,
273
+ *args_transformer_enc,
274
+ **kwargs_transformer_enc):
275
+ super().__init__(*args_transformer_enc, **kwargs_transformer_enc)
276
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, self.self_attn.embed_dim))
277
+ trunc_normal_(self.cls_token, std=.02)
278
+
279
+ # add positional embedding
280
+ self.add_pos_emb = add_pos_emb
281
+ if add_pos_emb:
282
+ self.pos_max_len = 1 + pos_max_len # +1 (for CLS)
283
+ self.pos_emb = nn.Parameter(torch.zeros(1, self.pos_max_len, self.self_attn.embed_dim))
284
+ self.pos_drop = nn.Dropout(pos_emb_drop)
285
+ trunc_normal_(self.pos_emb, std=.02)
286
+
287
+ self.apply(self._init_weights)
288
+
289
+ def forward(self, x: torch.Tensor, x_mask: torch.Tensor = None):
290
+ ''' x is of shape (B, N, D); if provided x_mask is of shape (B, N)'''
291
+ batch_dim = x.shape[0]
292
+
293
+ # add CLS token
294
+ cls_tokens = self.cls_token.expand(batch_dim, -1, -1) # expanding to match batch dimension
295
+ x = torch.cat((cls_tokens, x), dim=-2) # (batch_dim, 1+seq_len, D)
296
+ if x_mask is not None:
297
+ cls_mask = torch.ones((batch_dim, 1), dtype=torch.bool,
298
+ device=x_mask.device) # 1=keep; 0=mask
299
+ x_mask_w_cls = torch.cat((cls_mask, x_mask), dim=-1) # (batch_dim, 1+seq_len)
300
+ B, N = x_mask_w_cls.shape
301
+ # torch expects (N, N) or (B*num_heads, N, N) mask (sadness ahead); torch masks
302
+ x_mask_w_cls = x_mask_w_cls.reshape(B, 1, 1, N)\
303
+ .expand(-1, self.self_attn.num_heads, N, -1)\
304
+ .reshape(B * self.self_attn.num_heads, N, N)
305
+ assert x_mask_w_cls.dtype == x_mask_w_cls.bool().dtype, 'x_mask_w_cls.dtype != bool'
306
+ x_mask_w_cls = ~x_mask_w_cls # invert mask (1=mask)
307
+ else:
308
+ x_mask_w_cls = None
309
+
310
+ # add positional embedding
311
+ if self.add_pos_emb:
312
+ seq_len = x.shape[
313
+ 1] # (don't even think about moving it before the CLS token concatenation)
314
+ assert seq_len <= self.pos_max_len, f'Seq len ({seq_len}) > pos_max_len ({self.pos_max_len})'
315
+ x = x + self.pos_emb[:, :seq_len, :]
316
+ x = self.pos_drop(x)
317
+
318
+ # apply encoder layer (calls nn.TransformerEncoderLayer.forward);
319
+ x = super().forward(src=x, src_mask=x_mask_w_cls) # (batch_dim, 1+seq_len, D)
320
+
321
+ # CLS token is expected to hold spatial information for each frame
322
+ x = x[:, 0, :] # (batch_dim, D)
323
+
324
+ return x
325
+
326
+ def _init_weights(self, m):
327
+ if isinstance(m, nn.Linear):
328
+ trunc_normal_(m.weight, std=.02)
329
+ if isinstance(m, nn.Linear) and m.bias is not None:
330
+ nn.init.constant_(m.bias, 0)
331
+ elif isinstance(m, nn.LayerNorm):
332
+ nn.init.constant_(m.bias, 0)
333
+ nn.init.constant_(m.weight, 1.0)
334
+
335
+ @torch.jit.ignore
336
+ def no_weight_decay(self):
337
+ return {'cls_token', 'pos_emb'}
338
+
339
+
340
+ class SpatialTransformerEncoderLayer(BaseEncoderLayer):
341
+ ''' Aggregates spatial dimensions by applying attention individually to each frame. '''
342
+
343
+ def __init__(self, *args, **kwargs):
344
+ super().__init__(*args, **kwargs)
345
+
346
+ def forward(self, x: torch.Tensor, x_mask: torch.Tensor = None) -> torch.Tensor:
347
+ ''' x is of shape (B*S, D, t, h, w) where S is the number of segments.
348
+ if specified x_mask (B*S, t, h, w), 0=masked, 1=kept
349
+ Returns a tensor of shape (B*S, t, D) pooling spatial information for each frame. '''
350
+ BS, D, t, h, w = x.shape
351
+
352
+ # time as a batch dimension and flatten spatial dimensions as sequence
353
+ x = einops.rearrange(x, 'BS D t h w -> (BS t) (h w) D')
354
+ # similar to mask
355
+ if x_mask is not None:
356
+ x_mask = einops.rearrange(x_mask, 'BS t h w -> (BS t) (h w)')
357
+
358
+ # apply encoder layer (BaseEncoderLayer.forward) - it will add CLS token and output its representation
359
+ x = super().forward(x=x, x_mask=x_mask) # (B*S*t, D)
360
+
361
+ # reshape back to (B*S, t, D)
362
+ x = einops.rearrange(x, '(BS t) D -> BS t D', BS=BS, t=t)
363
+
364
+ # (B*S, t, D)
365
+ return x
366
+
367
+
368
+ class TemporalTransformerEncoderLayer(BaseEncoderLayer):
369
+ ''' Aggregates temporal dimension with attention. Also used with pos emb as global aggregation
370
+ in both streams. '''
371
+
372
+ def __init__(self, *args, **kwargs):
373
+ super().__init__(*args, **kwargs)
374
+
375
+ def forward(self, x):
376
+ ''' x is of shape (B*S, t, D) where S is the number of segments.
377
+ Returns a tensor of shape (B*S, D) pooling temporal information. '''
378
+ BS, t, D = x.shape
379
+
380
+ # apply encoder layer (BaseEncoderLayer.forward) - it will add CLS token and output its representation
381
+ x = super().forward(x) # (B*S, D)
382
+
383
+ return x # (B*S, D)
384
+
385
+
386
+ class AveragePooling(nn.Module):
387
+
388
+ def __init__(self, avg_pattern: str, then_permute_pattern: str = None) -> None:
389
+ ''' patterns are e.g. "bs t d -> bs d" '''
390
+ super().__init__()
391
+ # TODO: need to register them as buffers (but fails because these are strings)
392
+ self.reduce_fn = 'mean'
393
+ self.avg_pattern = avg_pattern
394
+ self.then_permute_pattern = then_permute_pattern
395
+
396
+ def forward(self, x: torch.Tensor, x_mask: torch.Tensor = None) -> torch.Tensor:
397
+ x = einops.reduce(x, self.avg_pattern, self.reduce_fn)
398
+ if self.then_permute_pattern is not None:
399
+ x = einops.rearrange(x, self.then_permute_pattern)
400
+ return x
data_utils/ext/synchformer/synchformer.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Any, Mapping
3
+
4
+ import torch
5
+ from torch import nn
6
+
7
+ from data_utils.ext.synchformer.motionformer import MotionFormer
8
+
9
+
10
+ class Synchformer(nn.Module):
11
+
12
+ def __init__(self):
13
+ super().__init__()
14
+
15
+ self.vfeat_extractor = MotionFormer(extract_features=True,
16
+ factorize_space_time=True,
17
+ agg_space_module='TransformerEncoderLayer',
18
+ agg_time_module='torch.nn.Identity',
19
+ add_global_repr=False)
20
+
21
+ # self.vfeat_extractor = instantiate_from_config(vfeat_extractor)
22
+ # self.afeat_extractor = instantiate_from_config(afeat_extractor)
23
+ # # bridging the s3d latent dim (1024) into what is specified in the config
24
+ # # to match e.g. the transformer dim
25
+ # self.vproj = instantiate_from_config(vproj)
26
+ # self.aproj = instantiate_from_config(aproj)
27
+ # self.transformer = instantiate_from_config(transformer)
28
+
29
+ def forward(self, vis):
30
+ B, S, Tv, C, H, W = vis.shape
31
+ vis = vis.permute(0, 1, 3, 2, 4, 5) # (B, S, C, Tv, H, W)
32
+ # feat extractors return a tuple of segment-level and global features (ignored for sync)
33
+ # (B, S, tv, D), e.g. (B, 7, 8, 768)
34
+ vis = self.vfeat_extractor(vis)
35
+ return vis
36
+
37
+ def load_state_dict(self, sd: Mapping[str, Any], strict: bool = True):
38
+ # discard all entries except vfeat_extractor
39
+ sd = {k: v for k, v in sd.items() if k.startswith('vfeat_extractor')}
40
+
41
+ return super().load_state_dict(sd, strict)
42
+
43
+
44
+ if __name__ == "__main__":
45
+ model = Synchformer().cuda().eval()
46
+ sd = torch.load('./ext_weights/synchformer_state_dict.pth', weights_only=True)
47
+ model.load_state_dict(sd)
48
+
49
+ vid = torch.randn(2, 7, 16, 3, 224, 224).cuda()
50
+ features = model.extract_vfeats(vid, for_loop=False).detach().cpu()
51
+ print(features.shape)
52
+
53
+ # extract and save the state dict only
54
+ # sd = torch.load('./ext_weights/sync_model_audioset.pt')['model']
55
+ # torch.save(sd, './ext_weights/synchformer_state_dict.pth')
data_utils/ext/synchformer/utils.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from hashlib import md5
2
+ from pathlib import Path
3
+
4
+ import requests
5
+ from tqdm import tqdm
6
+
7
+ PARENT_LINK = 'https://a3s.fi/swift/v1/AUTH_a235c0f452d648828f745589cde1219a'
8
+ FNAME2LINK = {
9
+ # S3: Synchability: AudioSet (run 2)
10
+ '24-01-22T20-34-52.pt':
11
+ f'{PARENT_LINK}/sync/sync_models/24-01-22T20-34-52/24-01-22T20-34-52.pt',
12
+ 'cfg-24-01-22T20-34-52.yaml':
13
+ f'{PARENT_LINK}/sync/sync_models/24-01-22T20-34-52/cfg-24-01-22T20-34-52.yaml',
14
+ # S2: Synchformer: AudioSet (run 2)
15
+ '24-01-04T16-39-21.pt':
16
+ f'{PARENT_LINK}/sync/sync_models/24-01-04T16-39-21/24-01-04T16-39-21.pt',
17
+ 'cfg-24-01-04T16-39-21.yaml':
18
+ f'{PARENT_LINK}/sync/sync_models/24-01-04T16-39-21/cfg-24-01-04T16-39-21.yaml',
19
+ # S2: Synchformer: AudioSet (run 1)
20
+ '23-08-28T11-23-23.pt':
21
+ f'{PARENT_LINK}/sync/sync_models/23-08-28T11-23-23/23-08-28T11-23-23.pt',
22
+ 'cfg-23-08-28T11-23-23.yaml':
23
+ f'{PARENT_LINK}/sync/sync_models/23-08-28T11-23-23/cfg-23-08-28T11-23-23.yaml',
24
+ # S2: Synchformer: LRS3 (run 2)
25
+ '23-12-23T18-33-57.pt':
26
+ f'{PARENT_LINK}/sync/sync_models/23-12-23T18-33-57/23-12-23T18-33-57.pt',
27
+ 'cfg-23-12-23T18-33-57.yaml':
28
+ f'{PARENT_LINK}/sync/sync_models/23-12-23T18-33-57/cfg-23-12-23T18-33-57.yaml',
29
+ # S2: Synchformer: VGS (run 2)
30
+ '24-01-02T10-00-53.pt':
31
+ f'{PARENT_LINK}/sync/sync_models/24-01-02T10-00-53/24-01-02T10-00-53.pt',
32
+ 'cfg-24-01-02T10-00-53.yaml':
33
+ f'{PARENT_LINK}/sync/sync_models/24-01-02T10-00-53/cfg-24-01-02T10-00-53.yaml',
34
+ # SparseSync: ft VGGSound-Full
35
+ '22-09-21T21-00-52.pt':
36
+ f'{PARENT_LINK}/sync/sync_models/22-09-21T21-00-52/22-09-21T21-00-52.pt',
37
+ 'cfg-22-09-21T21-00-52.yaml':
38
+ f'{PARENT_LINK}/sync/sync_models/22-09-21T21-00-52/cfg-22-09-21T21-00-52.yaml',
39
+ # SparseSync: ft VGGSound-Sparse
40
+ '22-07-28T15-49-45.pt':
41
+ f'{PARENT_LINK}/sync/sync_models/22-07-28T15-49-45/22-07-28T15-49-45.pt',
42
+ 'cfg-22-07-28T15-49-45.yaml':
43
+ f'{PARENT_LINK}/sync/sync_models/22-07-28T15-49-45/cfg-22-07-28T15-49-45.yaml',
44
+ # SparseSync: only pt on LRS3
45
+ '22-07-13T22-25-49.pt':
46
+ f'{PARENT_LINK}/sync/sync_models/22-07-13T22-25-49/22-07-13T22-25-49.pt',
47
+ 'cfg-22-07-13T22-25-49.yaml':
48
+ f'{PARENT_LINK}/sync/sync_models/22-07-13T22-25-49/cfg-22-07-13T22-25-49.yaml',
49
+ # SparseSync: feature extractors
50
+ 'ResNetAudio-22-08-04T09-51-04.pt':
51
+ f'{PARENT_LINK}/sync/ResNetAudio-22-08-04T09-51-04.pt', # 2s
52
+ 'ResNetAudio-22-08-03T23-14-49.pt':
53
+ f'{PARENT_LINK}/sync/ResNetAudio-22-08-03T23-14-49.pt', # 3s
54
+ 'ResNetAudio-22-08-03T23-14-28.pt':
55
+ f'{PARENT_LINK}/sync/ResNetAudio-22-08-03T23-14-28.pt', # 4s
56
+ 'ResNetAudio-22-06-24T08-10-33.pt':
57
+ f'{PARENT_LINK}/sync/ResNetAudio-22-06-24T08-10-33.pt', # 5s
58
+ 'ResNetAudio-22-06-24T17-31-07.pt':
59
+ f'{PARENT_LINK}/sync/ResNetAudio-22-06-24T17-31-07.pt', # 6s
60
+ 'ResNetAudio-22-06-24T23-57-11.pt':
61
+ f'{PARENT_LINK}/sync/ResNetAudio-22-06-24T23-57-11.pt', # 7s
62
+ 'ResNetAudio-22-06-25T04-35-42.pt':
63
+ f'{PARENT_LINK}/sync/ResNetAudio-22-06-25T04-35-42.pt', # 8s
64
+ }
65
+
66
+
67
+ def check_if_file_exists_else_download(path, fname2link=FNAME2LINK, chunk_size=1024):
68
+ '''Checks if file exists, if not downloads it from the link to the path'''
69
+ path = Path(path)
70
+ if not path.exists():
71
+ path.parent.mkdir(exist_ok=True, parents=True)
72
+ link = fname2link.get(path.name, None)
73
+ if link is None:
74
+ raise ValueError(f'Cant find the checkpoint file: {path}.',
75
+ f'Please download it manually and ensure the path exists.')
76
+ with requests.get(fname2link[path.name], stream=True) as r:
77
+ total_size = int(r.headers.get('content-length', 0))
78
+ with tqdm(total=total_size, unit='B', unit_scale=True) as pbar:
79
+ with open(path, 'wb') as f:
80
+ for data in r.iter_content(chunk_size=chunk_size):
81
+ if data:
82
+ f.write(data)
83
+ pbar.update(chunk_size)
84
+
85
+
86
+ def get_md5sum(path):
87
+ hash_md5 = md5()
88
+ with open(path, 'rb') as f:
89
+ for chunk in iter(lambda: f.read(4096 * 8), b''):
90
+ hash_md5.update(chunk)
91
+ md5sum = hash_md5.hexdigest()
92
+ return md5sum
data_utils/ext/synchformer/video_model_builder.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3
+ # Copyright 2020 Ross Wightman
4
+ # Modified Model definition
5
+
6
+ from collections import OrderedDict
7
+ from functools import partial
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from timm.layers import trunc_normal_
12
+
13
+ from data_utils.ext.synchformer import vit_helper
14
+
15
+
16
+ class VisionTransformer(nn.Module):
17
+ """ Vision Transformer with support for patch or hybrid CNN input stage """
18
+
19
+ def __init__(self, cfg):
20
+ super().__init__()
21
+ self.img_size = cfg.DATA.TRAIN_CROP_SIZE
22
+ self.patch_size = cfg.VIT.PATCH_SIZE
23
+ self.in_chans = cfg.VIT.CHANNELS
24
+ if cfg.TRAIN.DATASET == "Epickitchens":
25
+ self.num_classes = [97, 300]
26
+ else:
27
+ self.num_classes = cfg.MODEL.NUM_CLASSES
28
+ self.embed_dim = cfg.VIT.EMBED_DIM
29
+ self.depth = cfg.VIT.DEPTH
30
+ self.num_heads = cfg.VIT.NUM_HEADS
31
+ self.mlp_ratio = cfg.VIT.MLP_RATIO
32
+ self.qkv_bias = cfg.VIT.QKV_BIAS
33
+ self.drop_rate = cfg.VIT.DROP
34
+ self.drop_path_rate = cfg.VIT.DROP_PATH
35
+ self.head_dropout = cfg.VIT.HEAD_DROPOUT
36
+ self.video_input = cfg.VIT.VIDEO_INPUT
37
+ self.temporal_resolution = cfg.VIT.TEMPORAL_RESOLUTION
38
+ self.use_mlp = cfg.VIT.USE_MLP
39
+ self.num_features = self.embed_dim
40
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
41
+ self.attn_drop_rate = cfg.VIT.ATTN_DROPOUT
42
+ self.head_act = cfg.VIT.HEAD_ACT
43
+ self.cfg = cfg
44
+
45
+ # Patch Embedding
46
+ self.patch_embed = vit_helper.PatchEmbed(img_size=224,
47
+ patch_size=self.patch_size,
48
+ in_chans=self.in_chans,
49
+ embed_dim=self.embed_dim)
50
+
51
+ # 3D Patch Embedding
52
+ self.patch_embed_3d = vit_helper.PatchEmbed3D(img_size=self.img_size,
53
+ temporal_resolution=self.temporal_resolution,
54
+ patch_size=self.patch_size,
55
+ in_chans=self.in_chans,
56
+ embed_dim=self.embed_dim,
57
+ z_block_size=self.cfg.VIT.PATCH_SIZE_TEMP)
58
+ self.patch_embed_3d.proj.weight.data = torch.zeros_like(
59
+ self.patch_embed_3d.proj.weight.data)
60
+
61
+ # Number of patches
62
+ if self.video_input:
63
+ num_patches = self.patch_embed.num_patches * self.temporal_resolution
64
+ else:
65
+ num_patches = self.patch_embed.num_patches
66
+ self.num_patches = num_patches
67
+
68
+ # CLS token
69
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
70
+ trunc_normal_(self.cls_token, std=.02)
71
+
72
+ # Positional embedding
73
+ self.pos_embed = nn.Parameter(
74
+ torch.zeros(1, self.patch_embed.num_patches + 1, self.embed_dim))
75
+ self.pos_drop = nn.Dropout(p=cfg.VIT.POS_DROPOUT)
76
+ trunc_normal_(self.pos_embed, std=.02)
77
+
78
+ if self.cfg.VIT.POS_EMBED == "joint":
79
+ self.st_embed = nn.Parameter(torch.zeros(1, num_patches + 1, self.embed_dim))
80
+ trunc_normal_(self.st_embed, std=.02)
81
+ elif self.cfg.VIT.POS_EMBED == "separate":
82
+ self.temp_embed = nn.Parameter(torch.zeros(1, self.temporal_resolution, self.embed_dim))
83
+
84
+ # Layer Blocks
85
+ dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, self.depth)]
86
+ if self.cfg.VIT.ATTN_LAYER == "divided":
87
+ self.blocks = nn.ModuleList([
88
+ vit_helper.DividedSpaceTimeBlock(
89
+ attn_type=cfg.VIT.ATTN_LAYER,
90
+ dim=self.embed_dim,
91
+ num_heads=self.num_heads,
92
+ mlp_ratio=self.mlp_ratio,
93
+ qkv_bias=self.qkv_bias,
94
+ drop=self.drop_rate,
95
+ attn_drop=self.attn_drop_rate,
96
+ drop_path=dpr[i],
97
+ norm_layer=norm_layer,
98
+ ) for i in range(self.depth)
99
+ ])
100
+ else:
101
+ self.blocks = nn.ModuleList([
102
+ vit_helper.Block(attn_type=cfg.VIT.ATTN_LAYER,
103
+ dim=self.embed_dim,
104
+ num_heads=self.num_heads,
105
+ mlp_ratio=self.mlp_ratio,
106
+ qkv_bias=self.qkv_bias,
107
+ drop=self.drop_rate,
108
+ attn_drop=self.attn_drop_rate,
109
+ drop_path=dpr[i],
110
+ norm_layer=norm_layer,
111
+ use_original_code=self.cfg.VIT.USE_ORIGINAL_TRAJ_ATTN_CODE)
112
+ for i in range(self.depth)
113
+ ])
114
+ self.norm = norm_layer(self.embed_dim)
115
+
116
+ # MLP head
117
+ if self.use_mlp:
118
+ hidden_dim = self.embed_dim
119
+ if self.head_act == 'tanh':
120
+ # logging.info("Using TanH activation in MLP")
121
+ act = nn.Tanh()
122
+ elif self.head_act == 'gelu':
123
+ # logging.info("Using GELU activation in MLP")
124
+ act = nn.GELU()
125
+ else:
126
+ # logging.info("Using ReLU activation in MLP")
127
+ act = nn.ReLU()
128
+ self.pre_logits = nn.Sequential(
129
+ OrderedDict([
130
+ ('fc', nn.Linear(self.embed_dim, hidden_dim)),
131
+ ('act', act),
132
+ ]))
133
+ else:
134
+ self.pre_logits = nn.Identity()
135
+
136
+ # Classifier Head
137
+ self.head_drop = nn.Dropout(p=self.head_dropout)
138
+ if isinstance(self.num_classes, (list, )) and len(self.num_classes) > 1:
139
+ for a, i in enumerate(range(len(self.num_classes))):
140
+ setattr(self, "head%d" % a, nn.Linear(self.embed_dim, self.num_classes[i]))
141
+ else:
142
+ self.head = nn.Linear(self.embed_dim,
143
+ self.num_classes) if self.num_classes > 0 else nn.Identity()
144
+
145
+ # Initialize weights
146
+ self.apply(self._init_weights)
147
+
148
+ def _init_weights(self, m):
149
+ if isinstance(m, nn.Linear):
150
+ trunc_normal_(m.weight, std=.02)
151
+ if isinstance(m, nn.Linear) and m.bias is not None:
152
+ nn.init.constant_(m.bias, 0)
153
+ elif isinstance(m, nn.LayerNorm):
154
+ nn.init.constant_(m.bias, 0)
155
+ nn.init.constant_(m.weight, 1.0)
156
+
157
+ @torch.jit.ignore
158
+ def no_weight_decay(self):
159
+ if self.cfg.VIT.POS_EMBED == "joint":
160
+ return {'pos_embed', 'cls_token', 'st_embed'}
161
+ else:
162
+ return {'pos_embed', 'cls_token', 'temp_embed'}
163
+
164
+ def get_classifier(self):
165
+ return self.head
166
+
167
+ def reset_classifier(self, num_classes, global_pool=''):
168
+ self.num_classes = num_classes
169
+ self.head = (nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity())
170
+
171
+ def forward_features(self, x):
172
+ # if self.video_input:
173
+ # x = x[0]
174
+ B = x.shape[0]
175
+
176
+ # Tokenize input
177
+ # if self.cfg.VIT.PATCH_SIZE_TEMP > 1:
178
+ # for simplicity of mapping between content dimensions (input x) and token dims (after patching)
179
+ # we use the same trick as for AST (see modeling_ast.ASTModel.forward for the details):
180
+
181
+ # apply patching on input
182
+ x = self.patch_embed_3d(x)
183
+ tok_mask = None
184
+
185
+ # else:
186
+ # tok_mask = None
187
+ # # 2D tokenization
188
+ # if self.video_input:
189
+ # x = x.permute(0, 2, 1, 3, 4)
190
+ # (B, T, C, H, W) = x.shape
191
+ # x = x.reshape(B * T, C, H, W)
192
+
193
+ # x = self.patch_embed(x)
194
+
195
+ # if self.video_input:
196
+ # (B2, T2, D2) = x.shape
197
+ # x = x.reshape(B, T * T2, D2)
198
+
199
+ # Append CLS token
200
+ cls_tokens = self.cls_token.expand(B, -1, -1)
201
+ x = torch.cat((cls_tokens, x), dim=1)
202
+ # if tok_mask is not None:
203
+ # # prepend 1(=keep) to the mask to account for the CLS token as well
204
+ # tok_mask = torch.cat((torch.ones_like(tok_mask[:, [0]]), tok_mask), dim=1)
205
+
206
+ # Interpolate positinoal embeddings
207
+ # if self.cfg.DATA.TRAIN_CROP_SIZE != 224:
208
+ # pos_embed = self.pos_embed
209
+ # N = pos_embed.shape[1] - 1
210
+ # npatch = int((x.size(1) - 1) / self.temporal_resolution)
211
+ # class_emb = pos_embed[:, 0]
212
+ # pos_embed = pos_embed[:, 1:]
213
+ # dim = x.shape[-1]
214
+ # pos_embed = torch.nn.functional.interpolate(
215
+ # pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
216
+ # scale_factor=math.sqrt(npatch / N),
217
+ # mode='bicubic',
218
+ # )
219
+ # pos_embed = pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
220
+ # new_pos_embed = torch.cat((class_emb.unsqueeze(0), pos_embed), dim=1)
221
+ # else:
222
+ new_pos_embed = self.pos_embed
223
+ npatch = self.patch_embed.num_patches
224
+
225
+ # Add positional embeddings to input
226
+ if self.video_input:
227
+ if self.cfg.VIT.POS_EMBED == "separate":
228
+ cls_embed = self.pos_embed[:, 0, :].unsqueeze(1)
229
+ tile_pos_embed = new_pos_embed[:, 1:, :].repeat(1, self.temporal_resolution, 1)
230
+ tile_temporal_embed = self.temp_embed.repeat_interleave(npatch, 1)
231
+ total_pos_embed = tile_pos_embed + tile_temporal_embed
232
+ total_pos_embed = torch.cat([cls_embed, total_pos_embed], dim=1)
233
+ x = x + total_pos_embed
234
+ elif self.cfg.VIT.POS_EMBED == "joint":
235
+ x = x + self.st_embed
236
+ else:
237
+ # image input
238
+ x = x + new_pos_embed
239
+
240
+ # Apply positional dropout
241
+ x = self.pos_drop(x)
242
+
243
+ # Encoding using transformer layers
244
+ for i, blk in enumerate(self.blocks):
245
+ x = blk(x,
246
+ seq_len=npatch,
247
+ num_frames=self.temporal_resolution,
248
+ approx=self.cfg.VIT.APPROX_ATTN_TYPE,
249
+ num_landmarks=self.cfg.VIT.APPROX_ATTN_DIM,
250
+ tok_mask=tok_mask)
251
+
252
+ ### v-iashin: I moved it to the forward pass
253
+ # x = self.norm(x)[:, 0]
254
+ # x = self.pre_logits(x)
255
+ ###
256
+ return x, tok_mask
257
+
258
+ # def forward(self, x):
259
+ # x = self.forward_features(x)
260
+ # ### v-iashin: here. This should leave the same forward output as before
261
+ # x = self.norm(x)[:, 0]
262
+ # x = self.pre_logits(x)
263
+ # ###
264
+ # x = self.head_drop(x)
265
+ # if isinstance(self.num_classes, (list, )) and len(self.num_classes) > 1:
266
+ # output = []
267
+ # for head in range(len(self.num_classes)):
268
+ # x_out = getattr(self, "head%d" % head)(x)
269
+ # if not self.training:
270
+ # x_out = torch.nn.functional.softmax(x_out, dim=-1)
271
+ # output.append(x_out)
272
+ # return output
273
+ # else:
274
+ # x = self.head(x)
275
+ # if not self.training:
276
+ # x = torch.nn.functional.softmax(x, dim=-1)
277
+ # return x
data_utils/ext/synchformer/vit_helper.py ADDED
@@ -0,0 +1,399 @@
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
3
+ # Copyright 2020 Ross Wightman
4
+ # Modified Model definition
5
+ """Video models."""
6
+
7
+ import math
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from einops import rearrange, repeat
12
+ from timm.layers import to_2tuple
13
+ from torch import einsum
14
+ from torch.nn import functional as F
15
+
16
+ default_cfgs = {
17
+ 'vit_1k':
18
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth',
19
+ 'vit_1k_large':
20
+ 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_224-4ee7a4dc.pth',
21
+ }
22
+
23
+
24
+ def qkv_attn(q, k, v, tok_mask: torch.Tensor = None):
25
+ sim = einsum('b i d, b j d -> b i j', q, k)
26
+ # apply masking if provided, tok_mask is (B*S*H, N): 1s - keep; sim is (B*S*H, H, N, N)
27
+ if tok_mask is not None:
28
+ BSH, N = tok_mask.shape
29
+ sim = sim.masked_fill(tok_mask.view(BSH, 1, N) == 0,
30
+ float('-inf')) # 1 - broadcasts across N
31
+ attn = sim.softmax(dim=-1)
32
+ out = einsum('b i j, b j d -> b i d', attn, v)
33
+ return out
34
+
35
+
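+ # Minimal usage sketch for qkv_attn (shapes below are illustrative assumptions, not
+ # taken from the training pipeline): q/k/v are (B*, N, d) and tok_mask is a (B*, N)
+ # keep-mask where 0 marks keys that should receive no attention.
+ # q = k = v = torch.randn(2, 5, 8)
+ # keep = torch.ones(2, 5); keep[:, -1] = 0   # drop the last token as a key
+ # out = qkv_attn(q, k, v, tok_mask=keep)     # -> (2, 5, 8); masked keys get zero weight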
36
+ class DividedAttention(nn.Module):
37
+
38
+ def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
39
+ super().__init__()
40
+ self.num_heads = num_heads
41
+ head_dim = dim // num_heads
42
+ self.scale = head_dim**-0.5
43
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
44
+ self.proj = nn.Linear(dim, dim)
45
+
46
+ # init qkv to zeros; proj starts with weight 1 and bias 0
47
+ self.qkv.weight.data.fill_(0)
48
+ self.qkv.bias.data.fill_(0)
49
+ self.proj.weight.data.fill_(1)
50
+ self.proj.bias.data.fill_(0)
51
+
52
+ self.attn_drop = nn.Dropout(attn_drop)
53
+ self.proj_drop = nn.Dropout(proj_drop)
54
+
55
+ def forward(self, x, einops_from, einops_to, tok_mask: torch.Tensor = None, **einops_dims):
56
+ # num of heads variable
57
+ h = self.num_heads
58
+
59
+ # project x to q, k, v values
60
+ q, k, v = self.qkv(x).chunk(3, dim=-1)
61
+ q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
62
+ if tok_mask is not None:
63
+ # replicate token mask across heads (b, n) -> (b, h, n) -> (b*h, n) -- same as qkv but w/o d
64
+ assert len(tok_mask.shape) == 2
65
+ tok_mask = tok_mask.unsqueeze(1).expand(-1, h, -1).reshape(-1, tok_mask.shape[1])
66
+
67
+ # Scale q
68
+ q *= self.scale
69
+
70
+ # Take out cls_q, cls_k, cls_v
71
+ (cls_q, q_), (cls_k, k_), (cls_v, v_) = map(lambda t: (t[:, 0:1], t[:, 1:]), (q, k, v))
72
+ # the same for masking
73
+ if tok_mask is not None:
74
+ cls_mask, mask_ = tok_mask[:, 0:1], tok_mask[:, 1:]
75
+ else:
76
+ cls_mask, mask_ = None, None
77
+
78
+ # let CLS token attend to key / values of all patches across time and space
79
+ cls_out = qkv_attn(cls_q, k, v, tok_mask=tok_mask)
80
+
81
+ # rearrange across time or space
82
+ q_, k_, v_ = map(lambda t: rearrange(t, f'{einops_from} -> {einops_to}', **einops_dims),
83
+ (q_, k_, v_))
84
+
85
+ # expand CLS token keys and values across time or space and concat
86
+ r = q_.shape[0] // cls_k.shape[0]
87
+ cls_k, cls_v = map(lambda t: repeat(t, 'b () d -> (b r) () d', r=r), (cls_k, cls_v))
88
+
89
+ k_ = torch.cat((cls_k, k_), dim=1)
90
+ v_ = torch.cat((cls_v, v_), dim=1)
91
+
92
+ # the same for masking (if provided)
93
+ if tok_mask is not None:
94
+ # since mask does not have the latent dim (d), we need to remove it from einops dims
95
+ mask_ = rearrange(mask_, f'{einops_from} -> {einops_to}'.replace(' d', ''),
96
+ **einops_dims)
97
+ cls_mask = repeat(cls_mask, 'b () -> (b r) ()',
98
+ r=r) # expand cls_mask across time or space
99
+ mask_ = torch.cat((cls_mask, mask_), dim=1)
100
+
101
+ # attention
102
+ out = qkv_attn(q_, k_, v_, tok_mask=mask_)
103
+
104
+ # merge back time or space
105
+ out = rearrange(out, f'{einops_to} -> {einops_from}', **einops_dims)
106
+
107
+ # concat back the cls token
108
+ out = torch.cat((cls_out, out), dim=1)
109
+
110
+ # merge back the heads
111
+ out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
112
+
113
+ ## to out
114
+ x = self.proj(out)
115
+ x = self.proj_drop(x)
116
+ return x
117
+
118
+
119
+ class DividedSpaceTimeBlock(nn.Module):
120
+
121
+ def __init__(self,
122
+ dim=768,
123
+ num_heads=12,
124
+ attn_type='divided',
125
+ mlp_ratio=4.,
126
+ qkv_bias=False,
127
+ drop=0.,
128
+ attn_drop=0.,
129
+ drop_path=0.,
130
+ act_layer=nn.GELU,
131
+ norm_layer=nn.LayerNorm):
132
+ super().__init__()
133
+
134
+ self.einops_from_space = 'b (f n) d'
135
+ self.einops_to_space = '(b f) n d'
136
+ self.einops_from_time = 'b (f n) d'
137
+ self.einops_to_time = '(b n) f d'
138
+
139
+ self.norm1 = norm_layer(dim)
140
+
141
+ self.attn = DividedAttention(dim,
142
+ num_heads=num_heads,
143
+ qkv_bias=qkv_bias,
144
+ attn_drop=attn_drop,
145
+ proj_drop=drop)
146
+
147
+ self.timeattn = DividedAttention(dim,
148
+ num_heads=num_heads,
149
+ qkv_bias=qkv_bias,
150
+ attn_drop=attn_drop,
151
+ proj_drop=drop)
152
+
153
+ # self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
154
+ self.drop_path = nn.Identity()
155
+ self.norm2 = norm_layer(dim)
156
+ mlp_hidden_dim = int(dim * mlp_ratio)
157
+ self.mlp = Mlp(in_features=dim,
158
+ hidden_features=mlp_hidden_dim,
159
+ act_layer=act_layer,
160
+ drop=drop)
161
+ self.norm3 = norm_layer(dim)
162
+
163
+ def forward(self,
164
+ x,
165
+ seq_len=196,
166
+ num_frames=8,
167
+ approx='none',
168
+ num_landmarks=128,
169
+ tok_mask: torch.Tensor = None):
170
+ time_output = self.timeattn(self.norm3(x),
171
+ self.einops_from_time,
172
+ self.einops_to_time,
173
+ n=seq_len,
174
+ tok_mask=tok_mask)
175
+ time_residual = x + time_output
176
+
177
+ space_output = self.attn(self.norm1(time_residual),
178
+ self.einops_from_space,
179
+ self.einops_to_space,
180
+ f=num_frames,
181
+ tok_mask=tok_mask)
182
+ space_residual = time_residual + self.drop_path(space_output)
183
+
184
+ x = space_residual
185
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
186
+ return x
187
+
188
+
189
+ class Mlp(nn.Module):
190
+
191
+ def __init__(self,
192
+ in_features,
193
+ hidden_features=None,
194
+ out_features=None,
195
+ act_layer=nn.GELU,
196
+ drop=0.):
197
+ super().__init__()
198
+ out_features = out_features or in_features
199
+ hidden_features = hidden_features or in_features
200
+ self.fc1 = nn.Linear(in_features, hidden_features)
201
+ self.act = act_layer()
202
+ self.fc2 = nn.Linear(hidden_features, out_features)
203
+ self.drop = nn.Dropout(drop)
204
+
205
+ def forward(self, x):
206
+ x = self.fc1(x)
207
+ x = self.act(x)
208
+ x = self.drop(x)
209
+ x = self.fc2(x)
210
+ x = self.drop(x)
211
+ return x
212
+
213
+
214
+ class PatchEmbed(nn.Module):
215
+ """ Image to Patch Embedding
216
+ """
217
+
218
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
219
+ super().__init__()
220
+ img_size = img_size if type(img_size) is tuple else to_2tuple(img_size)
221
+ patch_size = patch_size if type(patch_size) is tuple else to_2tuple(patch_size)
222
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
223
+ self.img_size = img_size
224
+ self.patch_size = patch_size
225
+ self.num_patches = num_patches
226
+
227
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
228
+
229
+ def forward(self, x):
230
+ B, C, H, W = x.shape
231
+ x = self.proj(x).flatten(2).transpose(1, 2)
232
+ return x
233
+
234
+
235
+ class PatchEmbed3D(nn.Module):
236
+ """ Image to Patch Embedding """
237
+
238
+ def __init__(self,
239
+ img_size=224,
240
+ temporal_resolution=4,
241
+ in_chans=3,
242
+ patch_size=16,
243
+ z_block_size=2,
244
+ embed_dim=768,
245
+ flatten=True):
246
+ super().__init__()
247
+ self.height = (img_size // patch_size)
248
+ self.width = (img_size // patch_size)
249
+ ### v-iashin: these two are incorrect
250
+ # self.frames = (temporal_resolution // z_block_size)
251
+ # self.num_patches = self.height * self.width * self.frames
252
+ self.z_block_size = z_block_size
253
+ ###
254
+ self.proj = nn.Conv3d(in_chans,
255
+ embed_dim,
256
+ kernel_size=(z_block_size, patch_size, patch_size),
257
+ stride=(z_block_size, patch_size, patch_size))
258
+ self.flatten = flatten
259
+
260
+ def forward(self, x):
261
+ B, C, T, H, W = x.shape
262
+ x = self.proj(x)
263
+ if self.flatten:
264
+ x = x.flatten(2).transpose(1, 2)
265
+ return x
266
+
267
+
268
+ class HeadMLP(nn.Module):
269
+
270
+ def __init__(self, n_input, n_classes, n_hidden=512, p=0.1):
271
+ super(HeadMLP, self).__init__()
272
+ self.n_input = n_input
273
+ self.n_classes = n_classes
274
+ self.n_hidden = n_hidden
275
+ if n_hidden is None:
276
+ # use linear classifier
277
+ self.block_forward = nn.Sequential(nn.Dropout(p=p),
278
+ nn.Linear(n_input, n_classes, bias=True))
279
+ else:
280
+ # use simple MLP classifier
281
+ self.block_forward = nn.Sequential(nn.Dropout(p=p),
282
+ nn.Linear(n_input, n_hidden, bias=True),
283
+ nn.BatchNorm1d(n_hidden), nn.ReLU(inplace=True),
284
+ nn.Dropout(p=p),
285
+ nn.Linear(n_hidden, n_classes, bias=True))
286
+ print(f"Dropout-NLP: {p}")
287
+
288
+ def forward(self, x):
289
+ return self.block_forward(x)
290
+
291
+
292
+ def _conv_filter(state_dict, patch_size=16):
293
+ """ convert patch embedding weight from manual patchify + linear proj to conv"""
294
+ out_dict = {}
295
+ for k, v in state_dict.items():
296
+ if 'patch_embed.proj.weight' in k:
297
+ v = v.reshape((v.shape[0], 3, patch_size, patch_size))
298
+ out_dict[k] = v
299
+ return out_dict
300
+
301
+
302
+ def adapt_input_conv(in_chans, conv_weight, agg='sum'):
303
+ conv_type = conv_weight.dtype
304
+ conv_weight = conv_weight.float()
305
+ O, I, J, K = conv_weight.shape
306
+ if in_chans == 1:
307
+ if I > 3:
308
+ assert conv_weight.shape[1] % 3 == 0
309
+ # For models with space2depth stems
310
+ conv_weight = conv_weight.reshape(O, I // 3, 3, J, K)
311
+ conv_weight = conv_weight.sum(dim=2, keepdim=False)
312
+ else:
313
+ if agg == 'sum':
314
+ print("Summing conv1 weights")
315
+ conv_weight = conv_weight.sum(dim=1, keepdim=True)
316
+ else:
317
+ print("Averaging conv1 weights")
318
+ conv_weight = conv_weight.mean(dim=1, keepdim=True)
319
+ elif in_chans != 3:
320
+ if I != 3:
321
+ raise NotImplementedError('Weight format not supported by conversion.')
322
+ else:
323
+ if agg == 'sum':
324
+ print("Summing conv1 weights")
325
+ repeat = int(math.ceil(in_chans / 3))
326
+ conv_weight = conv_weight.repeat(1, repeat, 1, 1)[:, :in_chans, :, :]
327
+ conv_weight *= (3 / float(in_chans))
328
+ else:
329
+ print("Averaging conv1 weights")
330
+ conv_weight = conv_weight.mean(dim=1, keepdim=True)
331
+ conv_weight = conv_weight.repeat(1, in_chans, 1, 1)
332
+ conv_weight = conv_weight.to(conv_type)
333
+ return conv_weight
334
+
335
+
336
+ def load_pretrained(model,
337
+ cfg=None,
338
+ num_classes=1000,
339
+ in_chans=3,
340
+ filter_fn=None,
341
+ strict=True,
342
+ progress=False):
343
+ # Load state dict
344
+ assert cfg.VIT.PRETRAINED_WEIGHTS in default_cfgs, f"{cfg.VIT.PRETRAINED_WEIGHTS} not in [vit_1k, vit_1k_large]"
345
+ state_dict = torch.hub.load_state_dict_from_url(url=default_cfgs[cfg.VIT.PRETRAINED_WEIGHTS])
346
+
347
+ if filter_fn is not None:
348
+ state_dict = filter_fn(state_dict)
349
+
350
+ input_convs = 'patch_embed.proj'
351
+ if input_convs is not None and in_chans != 3:
352
+ if isinstance(input_convs, str):
353
+ input_convs = (input_convs, )
354
+ for input_conv_name in input_convs:
355
+ weight_name = input_conv_name + '.weight'
356
+ try:
357
+ state_dict[weight_name] = adapt_input_conv(in_chans,
358
+ state_dict[weight_name],
359
+ agg='avg')
360
+ print(
361
+ f'Converted input conv {input_conv_name} pretrained weights from 3 to {in_chans} channel(s)'
362
+ )
363
+ except NotImplementedError as e:
364
+ del state_dict[weight_name]
365
+ strict = False
366
+ print(
367
+ f'Unable to convert pretrained {input_conv_name} weights, using random init for this layer.'
368
+ )
369
+
370
+ classifier_name = 'head'
371
+ label_offset = cfg.get('label_offset', 0)
372
+ pretrain_classes = 1000
373
+ if num_classes != pretrain_classes:
374
+ # completely discard fully connected if model num_classes doesn't match pretrained weights
375
+ del state_dict[classifier_name + '.weight']
376
+ del state_dict[classifier_name + '.bias']
377
+ strict = False
378
+ elif label_offset > 0:
379
+ # special case for pretrained weights with an extra background class in pretrained weights
380
+ classifier_weight = state_dict[classifier_name + '.weight']
381
+ state_dict[classifier_name + '.weight'] = classifier_weight[label_offset:]
382
+ classifier_bias = state_dict[classifier_name + '.bias']
383
+ state_dict[classifier_name + '.bias'] = classifier_bias[label_offset:]
384
+
385
+ loaded_state = state_dict
386
+ self_state = model.state_dict()
387
+ all_names = set(self_state.keys())
388
+ saved_names = set([])
389
+ for name, param in loaded_state.items():
390
+ param = param
391
+ if 'module.' in name:
392
+ name = name.replace('module.', '')
393
+ if name in self_state.keys() and param.shape == self_state[name].shape:
394
+ saved_names.add(name)
395
+ self_state[name].copy_(param)
396
+ else:
397
+ print(f"didn't load: {name} of shape: {param.shape}")
398
+ print("Missing Keys:")
399
+ print(all_names - saved_names)
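+ # Hedged invocation sketch for load_pretrained; the cfg layout below is an assumption
+ # (any mapping-style config exposing VIT.PRETRAINED_WEIGHTS and .get() works, a
+ # yacs CfgNode is shown as one possibility, and `model` is a matching ViT module):
+ # from yacs.config import CfgNode
+ # cfg = CfgNode({'VIT': CfgNode({'PRETRAINED_WEIGHTS': 'vit_1k'})})
+ # load_pretrained(model, cfg=cfg, num_classes=0, in_chans=3,
+ #                 filter_fn=_conv_filter, strict=False)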
data_utils/v2a_utils/__init__.py ADDED
File without changes
data_utils/v2a_utils/audio_text_dataset.py ADDED
@@ -0,0 +1,173 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional, Union
4
+ from PIL import Image
5
+
6
+ import pandas as pd
7
+ import torch
8
+ import torchaudio
9
+ from torch.utils.data.dataset import Dataset
10
+ from torchvision.transforms import v2
11
+ from torio.io import StreamingMediaDecoder
12
+ from torchvision.utils import save_image
13
+ from transformers import AutoProcessor
14
+ import torch.nn.functional as F
15
+ import numpy as np
16
+
17
+ import logging
18
+ log = logging.getLogger()
19
+
20
+ _CLIP_SIZE = 224
21
+ _CLIP_FPS = 8.0
22
+
23
+ _SYNC_SIZE = 224
24
+ _SYNC_FPS = 25.0
25
+
26
+
27
+ class Audio_Text(Dataset):
28
+
29
+ def __init__(
30
+ self,
31
+ root: Union[str, Path],
32
+ *,
33
+ tsv_path: Union[str, Path] = 'dataset/vggsound/split_txt/train_caption.csv',
34
+ sample_rate: int = 44_100,
35
+ duration_sec: float = 9.0,
36
+ audio_samples: Optional[int] = 397312,
37
+ normalize_audio: bool = False,
38
+ start_row: Optional[int] = None,
39
+ end_row: Optional[int] = None,
40
+ save_dir: str = 'data/vggsound/video_latents_text/train'
41
+ ):
42
+ self.root = Path(root)
43
+ self.normalize_audio = normalize_audio
44
+ if audio_samples is None:
45
+ self.audio_samples = int(sample_rate * duration_sec)
46
+ else:
47
+ self.audio_samples = audio_samples
48
+ effective_duration = audio_samples / sample_rate
49
+ # make sure the duration is close enough, within 15ms
50
+ assert abs(effective_duration - duration_sec) < 0.015, \
51
+ f'audio_samples {audio_samples} does not match duration_sec {duration_sec}'
52
+
53
+ # videos = sorted(os.listdir(self.root))
54
+ # videos = set([Path(v).stem for v in videos]) # remove extensions
55
+ videos = []
56
+ self.labels = []
57
+ self.videos = []
58
+ self.cots = []
59
+ missing_videos = []
60
+ # read the tsv for subset information
61
+ df_list = pd.read_csv(tsv_path, sep=',', dtype={'id': str}).to_dict('records')
62
+
63
+ # Restrict the range of rows to process
64
+ if start_row is not None and end_row is not None:
65
+ df_list = df_list[start_row:end_row]
66
+ for record in df_list:
67
+ id = record['id']
68
+ if os.path.exists(f'{save_dir}/{id}.pth'): continue
69
+ label = record['caption']
70
+ # if id in videos:
71
+ self.labels.append(label)
72
+ # print(label,'debug1!!!!!!!!!')
73
+ self.cots.append(record['caption_cot'])
74
+ # self.labels[id] = label
75
+ self.videos.append(id)
76
+ # else:
77
+ # missing_videos.append(id)
78
+
79
+ log.info(f'{len(videos)} videos found in {root}')
80
+ log.info(f'{len(self.videos)} videos found in {tsv_path}')
81
+ log.info(f'{len(missing_videos)} videos missing in {root}')
82
+
83
+ self.sample_rate = sample_rate
84
+ self.duration_sec = duration_sec
85
+
86
+ self.expected_audio_length = self.audio_samples
87
+ self.resampler = {}
88
+
89
+ def sample(self, idx: int):
90
+ video_id = self.videos[idx]
91
+ label = self.labels[idx]
92
+ cot = self.cots[idx]
93
+ audio_path = os.path.join(self.root, f'{video_id}.wav')
94
+ if not os.path.exists(audio_path):
95
+ audio_path = os.path.join(self.root, f'{video_id}.flac')
96
+ if not os.path.exists(audio_path):
97
+ raise RuntimeError(f'Audio does not exist: {audio_path}')
98
+ audio_chunk, sample_rate = torchaudio.load(audio_path)
99
+ if len(audio_chunk.shape) != 2:
100
+ raise RuntimeError(f'error audio shape {video_id}')
101
+
102
+ abs_max = audio_chunk[0].abs().max()
103
+
104
+ if abs_max <= 1e-6:
105
+ if audio_chunk.shape[0] > 1 and audio_chunk[1].abs().max() > 1e-6:
106
+ audio_chunk = audio_chunk[1:2]
107
+ else:
108
+ raise RuntimeError(f'Audio is silent {video_id}')
109
+
110
+ # ensure the stereo audio
111
+ if audio_chunk.shape[0] < 2:
112
+ audio_chunk = audio_chunk.repeat(2, 1)
113
+ elif audio_chunk.shape[0] > 2:
114
+ audio_chunk = audio_chunk[:2]
115
+
116
+ # resample
117
+ if sample_rate == self.sample_rate:
118
+ audio_chunk = audio_chunk
119
+ else:
120
+ if sample_rate not in self.resampler:
121
+ # https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html#kaiser-best
122
+ self.resampler[sample_rate] = torchaudio.transforms.Resample(
123
+ sample_rate,
124
+ self.sample_rate,
125
+ lowpass_filter_width=64,
126
+ rolloff=0.9475937167399596,
127
+ resampling_method='sinc_interp_kaiser',
128
+ beta=14.769656459379492,
129
+ )
130
+ audio_chunk = self.resampler[sample_rate](audio_chunk)
131
+
132
+ if audio_chunk.shape[1] < self.expected_audio_length:
133
+ # zero-padding audio
134
+ padding_length = self.expected_audio_length - audio_chunk.shape[1]
135
+ # Create a padding tensor of shape [batch_size, padding_length] filled with zeros
136
+ padding = torch.zeros(audio_chunk.shape[0], padding_length)
137
+ # Concatenate the original audio and the padding along dimension 1
138
+ audio_chunk = torch.cat((audio_chunk, padding), dim=1)
139
+ # raise RuntimeError(f'Audio too short {video_id}')
140
+ audio_chunk = audio_chunk[:,:self.expected_audio_length]
141
+ assert audio_chunk.shape == (2, 397312), f'error shape:{video_id},{audio_chunk.shape}'
142
+ # print(label,'debug2!!!!!!!!!')
143
+ data = {
144
+ 'id': video_id,
145
+ 'caption': label,
146
+ 'caption_cot': cot,
147
+ 'audio': audio_chunk,
148
+ }
149
+
150
+ return data
151
+
152
+ def __getitem__(self, idx: int):
153
+ try:
154
+ return self.sample(idx)
155
+ except Exception as e:
156
+ log.error(f'Error loading video {self.videos[idx]}: {e}')
157
+ return None
158
+
159
+ def __len__(self):
160
+ return len(self.labels)
161
+
162
+
163
+ # dataset = Audio_Text(
164
+ # root="data/vggsound/video/train",
165
+ # tsv_path="data/vggsound/split_txt/temp.csv",
166
+ # sample_rate=44100,
167
+ # duration_sec=9.0,
168
+ # audio_samples=397312,
169
+ # start_row=0,
170
+ # end_row=None,
171
+ # save_dir="data/vggsound/video_224_latents_text/train"
172
+ # )
173
+ # dataset[0]
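+ # Hedged sketch: __getitem__ returns None when a sample fails to load, so a DataLoader
+ # over Audio_Text needs a collate_fn that drops Nones (the helper name below is ours,
+ # not part of this repo):
+ # from torch.utils.data import DataLoader
+ # from torch.utils.data.dataloader import default_collate
+ # def skip_none_collate(batch):
+ #     batch = [b for b in batch if b is not None]
+ #     return default_collate(batch) if batch else None
+ # loader = DataLoader(dataset, batch_size=4, collate_fn=skip_none_collate)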
data_utils/v2a_utils/audioset_224.py ADDED
@@ -0,0 +1,315 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional, Union
4
+ from PIL import Image
5
+
6
+ import pandas as pd
7
+ import torch
8
+ import torchaudio
9
+ from torch.utils.data.dataset import Dataset
10
+ from torchvision.transforms import v2
11
+ from torio.io import StreamingMediaDecoder
12
+ from torchvision.utils import save_image
13
+ from transformers import AutoProcessor
14
+ import torch.nn.functional as F
15
+ import numpy as np
16
+
17
+ import logging
18
+ log = logging.getLogger()
19
+
20
+ _CLIP_SIZE = 224
21
+ _CLIP_FPS = 8.0
22
+
23
+ _SYNC_SIZE = 224
24
+ _SYNC_FPS = 25.0
25
+
26
+ def save_tensor_as_image(tensor, save_path):
+ """
+ Save an RGB image array of shape (1, 3, H, W) to an image file.
+
+ :param tensor: input NumPy array of shape (1, 3, H, W).
+ :param save_path: path where the image is saved.
+ """
+ # # Remove the batch dimension, giving (3, H, W)
+ # tensor = tensor.squeeze(0)
+
+ # Reorder the axes to (H, W, 3)
+ image_array = np.transpose(tensor, (1, 2, 0))
+
+ # Check whether the array has a suitable dtype
+ if image_array.dtype != np.uint8:
+ # If it is not uint8, normalize first, then convert
+ image_array = (image_array - image_array.min()) / (image_array.max() - image_array.min()) * 255
+ image_array = image_array.astype(np.uint8)
+
+ # Create the image object
+ image = Image.fromarray(image_array)
+
+ # Save the image
+ image.save(save_path)
+ print(f"Image saved to {save_path}")
+
+ def pad_to_square(video_tensor):
+ # Validate the input shape
+ if len(video_tensor.shape) != 4:
+ raise ValueError("Input tensor must have shape (l, c, h, w)")
+
+ l, c, h, w = video_tensor.shape
+ max_side = max(h, w)
+
+ # Compute the padding needed along each dimension: (left, right, top, bottom)
+ pad_h = max_side - h
+ pad_w = max_side - w
+
+ # Build the padding tuple (left, right, top, bottom)
+ # Image padding applies to the last two dimensions (h and w), so only those two need to be specified
+ padding = (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)
+
+ # Pad the video tensor with F.pad
+ # The padding arguments are (left, right, top, bottom)
+ video_padded = F.pad(video_tensor, pad=padding, mode='constant', value=0)
+
+ return video_padded
73
+
74
+ class Audioset(Dataset):
75
+
76
+ def __init__(
77
+ self,
78
+ root: Union[str, Path],
79
+ *,
80
+ tsv_path: Union[str, Path] = 'dataset/vggsound/split_txt/train_caption.csv',
81
+ sample_rate: int = 44_100,
82
+ duration_sec: float = 9.0,
83
+ audio_samples: Optional[int] = 397312,
84
+ normalize_audio: bool = False,
85
+ start_row: Optional[int] = None,
86
+ end_row: Optional[int] = None,
87
+ save_dir: str = 'data/vggsound/video_latents_text/train'
88
+ ):
89
+ self.root = Path(root)
90
+ self.normalize_audio = normalize_audio
91
+ if audio_samples is None:
92
+ self.audio_samples = int(sample_rate * duration_sec)
93
+ else:
94
+ self.audio_samples = audio_samples
95
+ effective_duration = audio_samples / sample_rate
96
+ # make sure the duration is close enough, within 15ms
97
+ assert abs(effective_duration - duration_sec) < 0.015, \
98
+ f'audio_samples {audio_samples} does not match duration_sec {duration_sec}'
99
+
100
+ # videos = sorted(os.listdir(self.root))
101
+ # videos = set([Path(v).stem for v in videos]) # remove extensions
102
+ videos = []
103
+ self.labels = []
104
+ self.videos = []
105
+ self.caption_t5s = []
106
+ missing_videos = []
107
+ # read the tsv for subset information
108
+ df_list = pd.read_csv(tsv_path, sep=',', dtype={'id': str}).to_dict('records')
109
+
110
+ # Restrict the range of rows to process
111
+ if start_row is not None and end_row is not None:
112
+ df_list = df_list[start_row:end_row]
113
+
114
+ for record in df_list:
115
+ id = record['id']
116
+ if os.path.exists(f'{save_dir}/{id}.pth'): continue
117
+ label = record['label']
118
+ caption_t5 = record['caption_t5']
119
+ # if id in videos:
120
+ self.labels.append(label)
121
+ # self.labels[id] = label
122
+ self.videos.append(id)
123
+ self.caption_t5s.append(caption_t5)
124
+ # else:
125
+ # missing_videos.append(id)
126
+
127
+ log.info(f'{len(videos)} videos found in {root}')
128
+ log.info(f'{len(self.videos)} videos found in {tsv_path}')
129
+ log.info(f'{len(missing_videos)} videos missing in {root}')
130
+
131
+ self.sample_rate = sample_rate
132
+ self.duration_sec = duration_sec
133
+
134
+ self.expected_audio_length = self.audio_samples
135
+ self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
136
+ self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
137
+
138
+ self.clip_transform = v2.Compose([
139
+ v2.Lambda(pad_to_square), # pad to a square first
140
+ v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
141
+ v2.ToImage(),
142
+ v2.ToDtype(torch.float32, scale=True),
143
+ ])
144
+ self.clip_processor = AutoProcessor.from_pretrained("useful_ckpts/metaclip-huge")
145
+ self.sync_transform = v2.Compose([
146
+ v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
147
+ v2.CenterCrop(_SYNC_SIZE),
148
+ v2.ToImage(),
149
+ v2.ToDtype(torch.float32, scale=True),
150
+ v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
151
+ ])
152
+
153
+ self.resampler = {}
154
+
155
+ def sample(self, idx: int) -> dict[str, torch.Tensor]:
156
+ video_id = self.videos[idx]
157
+ label = self.labels[idx]
158
+ caption_t5 = self.caption_t5s[idx]
159
+
160
+ reader = StreamingMediaDecoder(self.root / (video_id + '.mp4'))
161
+ reader.add_basic_video_stream(
162
+ frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
163
+ frame_rate=_CLIP_FPS,
164
+ format='rgb24',
165
+ )
166
+ reader.add_basic_video_stream(
167
+ frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
168
+ frame_rate=_SYNC_FPS,
169
+ format='rgb24',
170
+ )
171
+ # reader.add_basic_audio_stream(frames_per_chunk=2**30,)
172
+
173
+ reader.fill_buffer()
174
+ data_chunk = reader.pop_chunks()
175
+
176
+ clip_chunk = data_chunk[0]
177
+ sync_chunk = data_chunk[1]
178
+ audio_path = os.path.join("dataset/3_Audioset/audios/sound",video_id+'.wav')
179
+ assert os.path.exists(audio_path), f'{audio_path} not exists'
180
+ audio_chunk, sr = torchaudio.load(audio_path)
181
+ # audio_chunk = data_chunk[2]
182
+ if len(audio_chunk.shape) != 2:
183
+ raise RuntimeError(f'error audio shape {video_id}')
184
+ if clip_chunk is None:
185
+ raise RuntimeError(f'CLIP video returned None {video_id}')
186
+
187
+ if sync_chunk is None:
188
+ raise RuntimeError(f'Sync video returned None {video_id}')
189
+ sample_rate = int(sr)
190
+ # audio_chunk = audio_chunk.transpose(0, 1)
191
+ abs_max = audio_chunk[0].abs().max()
192
+ # audio_chunk = audio_chunk.mean(dim=0) # mono
193
+ # if self.normalize_audio:
194
+ # abs_max = audio_chunk.abs().max()
195
+ # audio_chunk = audio_chunk / abs_max * 0.95
196
+ if abs_max <= 1e-6:
197
+ if audio_chunk.shape[0] > 1 and audio_chunk[1].abs().max() > 1e-6:
198
+ audio_chunk = audio_chunk[1:2]
199
+ else:
200
+ raise RuntimeError(f'Audio is silent {video_id}')
201
+
202
+ # ensure the stereo audio
203
+ if audio_chunk.shape[0] < 2:
204
+ audio_chunk = audio_chunk.repeat(2, 1)
205
+
206
+ # resample
207
+ if sample_rate == self.sample_rate:
208
+ audio_chunk = audio_chunk
209
+ else:
210
+ if sample_rate not in self.resampler:
211
+ # https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html#kaiser-best
212
+ self.resampler[sample_rate] = torchaudio.transforms.Resample(
213
+ sample_rate,
214
+ self.sample_rate,
215
+ lowpass_filter_width=64,
216
+ rolloff=0.9475937167399596,
217
+ resampling_method='sinc_interp_kaiser',
218
+ beta=14.769656459379492,
219
+ )
220
+ audio_chunk = self.resampler[sample_rate](audio_chunk)
221
+
222
+ if audio_chunk.shape[1] < self.expected_audio_length:
223
+ # zero-padding audio
224
+ padding_length = self.expected_audio_length - audio_chunk.shape[1]
225
+ # Create a padding tensor of shape [batch_size, padding_length] filled with zeros
226
+ padding = torch.zeros(audio_chunk.shape[0], padding_length)
227
+ # Concatenate the original audio and the padding along dimension 1
228
+ audio_chunk = torch.cat((audio_chunk, padding), dim=1)
229
+ # raise RuntimeError(f'Audio too short {video_id}')
230
+ audio_chunk = audio_chunk[:,:self.expected_audio_length]
231
+ # truncate the video
232
+ clip_chunk = clip_chunk[:self.clip_expected_length]
233
+ # import ipdb
234
+ # ipdb.set_trace()
235
+ if clip_chunk.shape[0] != self.clip_expected_length:
236
+ current_length = clip_chunk.shape[0]
237
+ padding_needed = self.clip_expected_length - current_length
238
+
239
+ # Check that padding needed is no more than 2
240
+ assert padding_needed < 4, f'Padding no more than 2 frames allowed, but {padding_needed} needed'
241
+
242
+ # If assertion passes, proceed with padding
243
+ if padding_needed > 0:
244
+ last_frame = clip_chunk[-1]
245
+ log.info(last_frame.shape)
246
+ # Repeat the last frame to reach the expected length
247
+ padding = last_frame.repeat(padding_needed, 1, 1, 1)
248
+ clip_chunk = torch.cat((clip_chunk, padding), dim=0)
249
+ # raise RuntimeError(f'CLIP video wrong length {video_id}, '
250
+ # f'expected {self.clip_expected_length}, '
251
+ # f'got {clip_chunk.shape[0]}')
252
+
253
+ # save_image(clip_chunk[0] / 255.0,'ori.png')
254
+ clip_chunk = pad_to_square(clip_chunk)
255
+ # save_image(clip_chunk[0] / 255.0,'square.png')
256
+ # clip_chunk = self.clip_transform(clip_chunk)
257
+ # import ipdb
258
+ # ipdb.set_trace()
259
+ clip_chunk = self.clip_processor(images=clip_chunk, return_tensors="pt")["pixel_values"]
260
+ # log.info(clip_chunk.shape)
261
+ # save_tensor_as_image(clip_chunk[0].numpy(),'scale.png')
262
+ # log.info(clip_chunk[0])
263
+ # clip_chunk = outputs
264
+ # text_ids = outputs["input_ids"]
265
+ # temp_img = clip_chunk[0].permute(1, 2, 0) * 255
266
+ # save_image(clip_chunk[0],'scale.png')
267
+ sync_chunk = sync_chunk[:self.sync_expected_length]
268
+ if sync_chunk.shape[0] != self.sync_expected_length:
269
+ # padding using the last frame, but no more than 2
270
+ current_length = sync_chunk.shape[0]
271
+ last_frame = sync_chunk[-1]
272
+ # Repeat the last frame for padding
273
+ padding = last_frame.repeat(self.sync_expected_length - current_length, 1, 1, 1)
274
+ assert self.sync_expected_length - current_length < 12, f'sync can pad no more than 2 while {self.sync_expected_length - current_length}'
275
+ sync_chunk = torch.cat((sync_chunk, padding), dim=0)
276
+ # raise RuntimeError(f'Sync video wrong length {video_id}, '
277
+ # f'expected {self.sync_expected_length}, '
278
+ # f'got {sync_chunk.shape[0]}')
279
+
280
+ sync_chunk = self.sync_transform(sync_chunk)
281
+ assert audio_chunk.shape[1] == self.expected_audio_length and clip_chunk.shape[0] == self.clip_expected_length \
282
+ and sync_chunk.shape[0] == self.sync_expected_length, 'error processed data shape'
283
+ data = {
284
+ 'id': video_id,
285
+ 'caption': label,
286
+ 'caption_t5': caption_t5,
287
+ 'audio': audio_chunk,
288
+ 'clip_video': clip_chunk,
289
+ 'sync_video': sync_chunk,
290
+ }
291
+
292
+ return data
293
+
294
+ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
295
+ try:
296
+ return self.sample(idx)
297
+ except Exception as e:
298
+ log.error(f'Error loading video {self.videos[idx]}: {e}')
299
+ return None
300
+
301
+ def __len__(self):
302
+ return len(self.labels)
303
+
304
+
305
+ # dataset = Audioset(
306
+ # root="dataset/3_Audioset/video/sound",
307
+ # tsv_path="dataset/3_Audioset/split_txt/unbalanced_sound_filtered_aligned_novgg_noout.csv",
308
+ # sample_rate=44100,
309
+ # duration_sec=9.0,
310
+ # audio_samples=397312,
311
+ # start_row=0,
312
+ # end_row=None,
313
+ # save_dir="dataset/3_Audioset/video_text_latents/"
314
+ # )
315
+ # dataset[0]
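+ # Hedged example of the pad_to_square step on a (L, C, H, W) clip: a 240x320 frame
+ # chunk is zero-padded to 320x320 (40 rows above and 40 below each frame):
+ # frames = torch.zeros(9, 3, 240, 320)
+ # pad_to_square(frames).shape   # torch.Size([9, 3, 320, 320])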
data_utils/v2a_utils/audioset_video_224.py ADDED
@@ -0,0 +1,268 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional, Union
4
+ from PIL import Image
5
+
6
+ import pandas as pd
7
+ import torch
8
+ import torchaudio
9
+ from torch.utils.data.dataset import Dataset
10
+ from torchvision.transforms import v2
11
+ from torio.io import StreamingMediaDecoder
12
+ from torchvision.utils import save_image
13
+ from transformers import AutoProcessor
14
+ import torch.nn.functional as F
15
+ import numpy as np
16
+
17
+ import logging
18
+ log = logging.getLogger()
19
+
20
+ _CLIP_SIZE = 224
21
+ _CLIP_FPS = 8.0
22
+
23
+ _SYNC_SIZE = 224
24
+ _SYNC_FPS = 25.0
25
+
26
+ def save_tensor_as_image(tensor, save_path):
+ """
+ Save an RGB image array of shape (1, 3, H, W) to an image file.
+
+ :param tensor: input NumPy array of shape (1, 3, H, W).
+ :param save_path: path where the image is saved.
+ """
+ # # Remove the batch dimension, giving (3, H, W)
+ # tensor = tensor.squeeze(0)
+
+ # Reorder the axes to (H, W, 3)
+ image_array = np.transpose(tensor, (1, 2, 0))
+
+ # Check whether the array has a suitable dtype
+ if image_array.dtype != np.uint8:
+ # If it is not uint8, normalize first, then convert
+ image_array = (image_array - image_array.min()) / (image_array.max() - image_array.min()) * 255
+ image_array = image_array.astype(np.uint8)
+
+ # Create the image object
+ image = Image.fromarray(image_array)
+
+ # Save the image
+ image.save(save_path)
+ print(f"Image saved to {save_path}")
+
+ def pad_to_square(video_tensor):
+ # Validate the input shape
+ if len(video_tensor.shape) != 4:
+ raise ValueError("Input tensor must have shape (l, c, h, w)")
+
+ l, c, h, w = video_tensor.shape
+ max_side = max(h, w)
+
+ # Compute the padding needed along each dimension: (left, right, top, bottom)
+ pad_h = max_side - h
+ pad_w = max_side - w
+
+ # Build the padding tuple (left, right, top, bottom)
+ # Image padding applies to the last two dimensions (h and w), so only those two need to be specified
+ padding = (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)
+
+ # Pad the video tensor with F.pad
+ # The padding arguments are (left, right, top, bottom)
+ video_padded = F.pad(video_tensor, pad=padding, mode='constant', value=0)
+
+ return video_padded
73
+
74
+ class Audioset(Dataset):
75
+
76
+ def __init__(
77
+ self,
78
+ root: Union[str, Path],
79
+ *,
80
+ tsv_path: Union[str, Path] = 'dataset/vggsound/split_txt/train_caption.csv',
81
+ duration_sec: float = 10.0,
82
+ start_row: Optional[int] = None,
83
+ end_row: Optional[int] = None,
84
+ save_dir: str = 'data/vggsound/video_latents_text/train'
85
+ ):
86
+ self.root = Path(root)
87
+
88
+ # videos = sorted(os.listdir(self.root))
89
+ # videos = set([Path(v).stem for v in videos]) # remove extensions
90
+ videos = []
91
+ self.captions = []
92
+ self.videos = []
93
+ self.caption_t5s = []
94
+ missing_videos = []
95
+ # read the tsv for subset information
96
+ df_list = pd.read_csv(tsv_path, sep=',', dtype={'id': str}).to_dict('records')
97
+
98
+ # Restrict the range of rows to process
99
+ if start_row is not None and end_row is not None:
100
+ df_list = df_list[start_row:end_row]
101
+ with open(tsv_path.replace('.csv','.txt')) as file:
102
+ paths = file.readlines()
103
+ for record, path in zip(df_list,paths):
104
+ id = Path(record['id']).stem
105
+ # if os.path.exists(f'{save_dir}/{id}.pth'): continue
106
+ caption = record['caption']
107
+ caption_t5 = record['caption_t5']
108
+ path = path.strip()
109
+ part = Path(path).parent
110
+ video_id = Path(path).stem[1:]
111
+ video_path = os.path.join('dataset/3_Audioset/video',part,f'{video_id}.mp4')
112
+ assert os.path.exists(video_path), 'video must exist'
113
+ # if id in videos:
114
+ self.captions.append(caption)
115
+ self.caption_t5s.append(caption_t5)
116
+ # self.labels[id] = label
117
+ self.videos.append(video_path)
118
+ # else:
119
+ # missing_videos.append(id)
120
+ assert len(self.captions) == len(self.caption_t5s) and len(self.captions) == len(self.videos), 'error length'
121
+ log.info(f'{len(videos)} videos found in {root}')
122
+ log.info(f'{len(self.videos)} videos found in {tsv_path}')
123
+ log.info(f'{len(missing_videos)} videos missing in {root}')
124
+
125
+ self.duration_sec = duration_sec
126
+
127
+ self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
128
+ self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
129
+
130
+ self.clip_transform = v2.Compose([
131
+ v2.Lambda(pad_to_square), # pad to a square first
132
+ v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
133
+ v2.ToImage(),
134
+ v2.ToDtype(torch.float32, scale=True),
135
+ ])
136
+ self.clip_processor = AutoProcessor.from_pretrained("useful_ckpts/metaclip-huge")
137
+ self.sync_transform = v2.Compose([
138
+ v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
139
+ v2.CenterCrop(_SYNC_SIZE),
140
+ v2.ToImage(),
141
+ v2.ToDtype(torch.float32, scale=True),
142
+ v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
143
+ ])
144
+
145
+ self.resampler = {}
146
+
147
+ def sample(self, idx: int) -> dict[str, torch.Tensor]:
148
+ video_path = self.videos[idx]
149
+ video_id = 'Y'+str(Path(video_path).stem)
150
+ caption = self.captions[idx]
151
+ caption_t5 = self.caption_t5s[idx]
152
+
153
+ reader = StreamingMediaDecoder(video_path)
154
+ reader.add_basic_video_stream(
155
+ frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
156
+ frame_rate=_CLIP_FPS,
157
+ format='rgb24',
158
+ )
159
+ reader.add_basic_video_stream(
160
+ frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
161
+ frame_rate=_SYNC_FPS,
162
+ format='rgb24',
163
+ )
164
+
165
+ reader.fill_buffer()
166
+ data_chunk = reader.pop_chunks()
167
+
168
+ clip_chunk = data_chunk[0]
169
+ sync_chunk = data_chunk[1]
170
+
171
+ if clip_chunk is None:
172
+ raise RuntimeError(f'CLIP video returned None {video_id}')
173
+ # if clip_chunk.shape[0] < self.clip_expected_length:
174
+ # raise RuntimeError(
175
+ # f'CLIP video too short {video_id}, expected {self.clip_expected_length}, got {clip_chunk.shape[0]}'
176
+ # )
177
+
178
+ if sync_chunk is None:
179
+ raise RuntimeError(f'Sync video returned None {video_id}')
180
+ # if sync_chunk.shape[0] < self.sync_expected_length:
181
+ # raise RuntimeError(
182
+ # f'Sync video too short {video_id}, expected {self.sync_expected_length}, got {sync_chunk.shape[0]}'
183
+ # )
184
+
185
+
186
+ # truncate the video
187
+ clip_chunk = clip_chunk[:self.clip_expected_length]
188
+ # import ipdb
189
+ # ipdb.set_trace()
190
+ if clip_chunk.shape[0] != self.clip_expected_length:
191
+ current_length = clip_chunk.shape[0]
192
+ padding_needed = self.clip_expected_length - current_length
193
+
194
+ # Check that padding needed is no more than 2
195
+ assert padding_needed < 4, f'Padding no more than 2 frames allowed, but {padding_needed} needed'
196
+
197
+ # If assertion passes, proceed with padding
198
+ if padding_needed > 0:
199
+ last_frame = clip_chunk[-1]
200
+ log.info(clip_chunk.shape)
201
+ # Repeat the last frame to reach the expected length
202
+ padding = last_frame.repeat(padding_needed, 1, 1, 1)
203
+ clip_chunk = torch.cat((clip_chunk, padding), dim=0)
204
+ # raise RuntimeError(f'CLIP video wrong length {video_id}, '
205
+ # f'expected {self.clip_expected_length}, '
206
+ # f'got {clip_chunk.shape[0]}')
207
+
208
+ # save_image(clip_chunk[0] / 255.0,'ori.png')
209
+ clip_chunk = pad_to_square(clip_chunk)
210
+ # save_image(clip_chunk[0] / 255.0,'square.png')
211
+ # clip_chunk = self.clip_transform(clip_chunk)
212
+ # import ipdb
213
+ # ipdb.set_trace()
214
+ clip_chunk = self.clip_processor(images=clip_chunk, return_tensors="pt")["pixel_values"]
215
+ # log.info(clip_chunk.shape)
216
+ # save_tensor_as_image(clip_chunk[0].numpy(),'scale.png')
217
+ # log.info(clip_chunk[0])
218
+ # clip_chunk = outputs
219
+ # text_ids = outputs["input_ids"]
220
+ # temp_img = clip_chunk[0].permute(1, 2, 0) * 255
221
+ # save_image(clip_chunk[0],'scale.png')
222
+ sync_chunk = sync_chunk[:self.sync_expected_length]
223
+ if sync_chunk.shape[0] != self.sync_expected_length:
224
+ # padding using the last frame, but no more than 2
225
+ current_length = sync_chunk.shape[0]
226
+ last_frame = sync_chunk[-1]
227
+ # Repeat the last frame for padding
228
+ padding = last_frame.repeat(self.sync_expected_length - current_length, 1, 1, 1)
229
+ assert self.sync_expected_length - current_length < 12, f'sync can pad no more than 2 while {self.sync_expected_length - current_length}'
230
+ sync_chunk = torch.cat((sync_chunk, padding), dim=0)
231
+ # raise RuntimeError(f'Sync video wrong length {video_id}, '
232
+ # f'expected {self.sync_expected_length}, '
233
+ # f'got {sync_chunk.shape[0]}')
234
+
235
+ sync_chunk = self.sync_transform(sync_chunk)
236
+ assert clip_chunk.shape[0] == self.clip_expected_length and sync_chunk.shape[0] == self.sync_expected_length, 'error processed data shape'
237
+ data = {
238
+ 'id': video_id,
239
+ 'caption': caption,
240
+ 'caption_t5': caption_t5,
241
+ 'clip_video': clip_chunk,
242
+ 'sync_video': sync_chunk,
243
+ }
244
+
245
+ return data
246
+
247
+ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
248
+ try:
249
+ return self.sample(idx)
250
+ except Exception as e:
251
+ log.error(f'Error loading video {self.videos[idx]}: {e}')
252
+ return None
253
+
254
+ def __len__(self):
255
+ return len(self.captions)
256
+
257
+
258
+ # dataset = Audioset(
259
+ # root="data/vggsound/video/train",
260
+ # tsv_path="data/vggsound/split_txt/temp.csv",
261
+ # sample_rate=44100,
262
+ # duration_sec=9.0,
263
+ # audio_samples=397312,
264
+ # start_row=0,
265
+ # end_row=None,
266
+ # save_dir="data/vggsound/video_224_latents_text/train"
267
+ # )
268
+ # dataset[0]
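+ # Hedged shape note: with duration_sec=10.0 the dataset expects
+ # int(_CLIP_FPS * 10.0) = 80 CLIP frames and int(_SYNC_FPS * 10.0) = 250 sync frames
+ # per clip (output keys as produced by sample() above):
+ # item = dataset[0]
+ # item['clip_video'].shape   # (80, 3, 224, 224)
+ # item['sync_video'].shape   # (250, 3, 224, 224)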
data_utils/v2a_utils/feature_utils_224.py ADDED
@@ -0,0 +1,182 @@
1
+ from typing import Literal, Optional
2
+ import json
3
+ import open_clip
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from einops import rearrange
8
+ from open_clip import create_model_from_pretrained
9
+ from torchvision.transforms import Normalize
10
+ from ThinkSound.models.factory import create_model_from_config
11
+ from ThinkSound.models.utils import load_ckpt_state_dict
12
+ from ThinkSound.training.utils import copy_state_dict
13
+ from transformers import AutoModel
14
+ from transformers import AutoProcessor
15
+ from transformers import T5EncoderModel, AutoTokenizer
16
+ import logging
17
+ from data_utils.ext.synchformer import Synchformer
18
+
19
+ log = logging.getLogger()
20
+
21
+ def patch_clip(clip_model):
22
+ # a hack to make it output last hidden states
23
+ # https://github.com/mlfoundations/open_clip/blob/fc5a37b72d705f760ebbc7915b84729816ed471f/src/open_clip/model.py#L269
24
+ def new_get_text_features(self, input_ids=None, attention_mask=None, position_ids=None,
25
+ output_attentions: Optional[bool] = None,
26
+ output_hidden_states: Optional[bool] = None,
27
+ return_dict: Optional[bool] = None):
28
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
29
+ output_hidden_states = (
30
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
31
+ )
32
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
33
+
34
+ text_outputs = self.text_model(
35
+ input_ids=input_ids,
36
+ attention_mask=attention_mask,
37
+ position_ids=position_ids,
38
+ output_attentions=output_attentions,
39
+ output_hidden_states=output_hidden_states,
40
+ return_dict=return_dict,
41
+ )
42
+ last_hidden_state = text_outputs[0]
43
+ pooled_output = text_outputs[1]
44
+ text_features = self.text_projection(pooled_output)
45
+
46
+ return text_features, last_hidden_state
47
+
48
+ clip_model.get_text_features = new_get_text_features.__get__(clip_model)
49
+ return clip_model
50
+
51
+
52
+ class FeaturesUtils(nn.Module):
53
+
54
+ def __init__(
55
+ self,
56
+ *,
57
+ vae_ckpt: Optional[str] = None,
58
+ vae_config: Optional[str] = None,
59
+ synchformer_ckpt: Optional[str] = None,
60
+ enable_conditions: bool = True,
61
+ need_vae_encoder: bool = True,
62
+ ):
63
+ super().__init__()
64
+
65
+ if enable_conditions:
66
+ self.clip_model = AutoModel.from_pretrained("facebook/metaclip-h14-fullcc2.5b")
67
+ self.clip_model = patch_clip(self.clip_model)
68
+ self.t5_tokenizer = AutoTokenizer.from_pretrained("google/t5-v1_1-xl")
69
+ self.t5_model = T5EncoderModel.from_pretrained("google/t5-v1_1-xl")
70
+ self.clip_processor = AutoProcessor.from_pretrained("facebook/metaclip-h14-fullcc2.5b")
71
+ # self.clip_preprocess = Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
72
+ # std=[0.26862954, 0.26130258, 0.27577711])
73
+ self.synchformer = Synchformer()
74
+ self.synchformer.load_state_dict(
75
+ torch.load(synchformer_ckpt, weights_only=True, map_location='cpu'))
76
+
77
+ # self.tokenizer = open_clip.get_tokenizer('ViT-H-14-378-quickgelu') # same as 'ViT-H-14'
78
+ else:
79
+ self.clip_model = None
80
+ self.synchformer = None
81
+ self.tokenizer = None
82
+
83
+ if vae_ckpt is not None:
84
+ with open(vae_config) as f:
85
+ vae_config = json.load(f)
86
+ self.vae = create_model_from_config(vae_config)
87
+ print(f"Loading model checkpoint from {vae_ckpt}")
88
+ # Load checkpoint
89
+ copy_state_dict(self.vae, load_ckpt_state_dict(vae_ckpt,prefix='autoencoder.'))#,prefix='autoencoder.'
90
+ else:
91
+ self.tod = None
92
+
93
+ def compile(self):
94
+ if self.clip_model is not None:
95
+ self.clip_model.encode_image = torch.compile(self.clip_model.encode_image)
96
+ self.clip_model.encode_text = torch.compile(self.clip_model.encode_text)
97
+ if self.synchformer is not None:
98
+ self.synchformer = torch.compile(self.synchformer)
99
+
100
+
101
+ def train(self, mode: bool) -> None:
102
+ return super().train(False)
103
+
104
+ @torch.inference_mode()
105
+ def encode_video_with_clip(self, x: torch.Tensor, batch_size: int = -1) -> torch.Tensor:
106
+ assert self.clip_model is not None, 'CLIP is not loaded'
107
+ # x: (B, T, C, H, W) H/W: 384
108
+ b, t, c, h, w = x.shape
109
+
110
+ assert c == 3 and h == 224 and w == 224
111
+ # x = self.clip_preprocess(x)
112
+ x = rearrange(x, 'b t c h w -> (b t) c h w')
113
+ outputs = []
114
+ if batch_size < 0:
115
+ batch_size = b * t
116
+ for i in range(0, b * t, batch_size):
117
+ outputs.append(self.clip_model.get_image_features(x[i:i + batch_size]))
118
+ x = torch.cat(outputs, dim=0)
119
+ # x = self.clip_model.encode_image(x, normalize=True)
120
+ x = rearrange(x, '(b t) d -> b t d', b=b)
121
+ return x
122
+
123
+ @torch.inference_mode()
124
+ def encode_video_with_sync(self, x: torch.Tensor, batch_size: int = -1) -> torch.Tensor:
125
+ assert self.synchformer is not None, 'Synchformer is not loaded'
126
+ # x: (B, T, C, H, W) H/W: 384
127
+ b, t, c, h, w = x.shape
128
+ # import ipdb
129
+ # ipdb.set_trace()
130
+ assert c == 3 and h == 224 and w == 224
131
+
132
+ # partition the video
133
+ segment_size = 16
134
+ step_size = 8
135
+ num_segments = (t - segment_size) // step_size + 1
136
+ segments = []
137
+ for i in range(num_segments):
138
+ segments.append(x[:, i * step_size:i * step_size + segment_size])
139
+ x = torch.stack(segments, dim=1) # (B, S, T, C, H, W)
140
+
141
+ outputs = []
142
+ if batch_size < 0:
143
+ batch_size = b
144
+ x = rearrange(x, 'b s t c h w -> (b s) 1 t c h w')
145
+ for i in range(0, b * num_segments, batch_size):
146
+ outputs.append(self.synchformer(x[i:i + batch_size]))
147
+ x = torch.cat(outputs, dim=0)
148
+ x = rearrange(x, '(b s) 1 t d -> b (s t) d', b=b)
149
+ return x
150
+
151
+ @torch.inference_mode()
152
+ def encode_text(self, text: list[str]) -> torch.Tensor:
153
+ assert self.clip_model is not None, 'CLIP is not loaded'
154
+ # assert self.tokenizer is not None, 'Tokenizer is not loaded'
155
+ # x: (B, L)
156
+ tokens = self.clip_processor(text=text, truncation=True, max_length=77, padding="max_length",return_tensors="pt").to(self.device)
157
+ return self.clip_model.get_text_features(**tokens)
158
+
159
+ @torch.inference_mode()
160
+ def encode_t5_text(self, text: list[str]) -> torch.Tensor:
161
+ assert self.t5_model is not None, 'T5 model is not loaded'
162
+ assert self.t5_tokenizer is not None, 'T5 Tokenizer is not loaded'
163
+ # x: (B, L)
164
+ inputs = self.t5_tokenizer(text,
165
+ truncation=True,
166
+ max_length=77,
167
+ padding="max_length",
168
+ return_tensors="pt").to(self.device)
169
+ return self.t5_model(**inputs).last_hidden_state
170
+
171
+ @torch.inference_mode()
172
+ def encode_audio(self, x) -> torch.Tensor:
173
+ x = self.vae.encode(x)
174
+ return x
175
+
176
+ @property
177
+ def device(self):
178
+ return next(self.parameters()).device
179
+
180
+ @property
181
+ def dtype(self):
182
+ return next(self.parameters()).dtype
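+ # Hedged construction sketch for FeaturesUtils (the checkpoint/config paths are
+ # placeholders, not files shipped with this repo):
+ # feats = FeaturesUtils(
+ #     vae_ckpt='ckpts/vae.ckpt',
+ #     vae_config='ckpts/vae_config.json',
+ #     synchformer_ckpt='ckpts/synchformer_state_dict.pth',
+ #     enable_conditions=True,
+ # ).eval().to('cuda')
+ # For a sync clip of T frames, encode_video_with_sync splits it into
+ # (T - 16) // 8 + 1 overlapping 16-frame segments (stride 8) before Synchformer.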
data_utils/v2a_utils/vggsound.py ADDED
@@ -0,0 +1,259 @@
1
+ import logging
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Optional, Union
5
+
6
+ import pandas as pd
7
+ import torch
8
+ import torchaudio
9
+ from torch.utils.data.dataset import Dataset
10
+ from torchvision.transforms import v2
11
+ from torio.io import StreamingMediaDecoder
12
+ from torchvision.utils import save_image
13
+
14
+ log = logging.getLogger()
15
+
16
+ _CLIP_SIZE = 384
17
+ _CLIP_FPS = 8.0
18
+
19
+ _SYNC_SIZE = 224
20
+ _SYNC_FPS = 25.0
21
+
22
+
23
+ class VGGSound(Dataset):
24
+
25
+ def __init__(
26
+ self,
27
+ root: Union[str, Path],
28
+ *,
29
+ tsv_path: Union[str, Path] = 'dataset/vggsound/split_txt/train_caption.csv',
30
+ sample_rate: int = 44_100,
31
+ duration_sec: float = 9.0,
32
+ audio_samples: Optional[int] = 397312,
33
+ normalize_audio: bool = False,
34
+ start_row: Optional[int] = None,
35
+ end_row: Optional[int] = None,
36
+ save_dir: str = 'data/vggsound/video_latents_text/train'
37
+ ):
38
+ self.root = Path(root)
39
+ self.normalize_audio = normalize_audio
40
+ if audio_samples is None:
41
+ self.audio_samples = int(sample_rate * duration_sec)
42
+ else:
43
+ self.audio_samples = audio_samples
44
+ effective_duration = audio_samples / sample_rate
45
+ # make sure the duration is close enough, within 15ms
46
+ assert abs(effective_duration - duration_sec) < 0.015, \
47
+ f'audio_samples {audio_samples} does not match duration_sec {duration_sec}'
48
+
49
+ videos = sorted(os.listdir(self.root))
50
+ videos = set([Path(v).stem for v in videos]) # remove extensions
51
+ # videos = []
52
+ self.labels = []
53
+ self.videos = []
54
+ missing_videos = []
55
+ # read the tsv for subset information
56
+ df_list = pd.read_csv(tsv_path, sep=',', dtype={'id': str}).to_dict('records')
57
+
58
+ # Restrict the range of rows to process
59
+ if start_row is not None and end_row is not None:
60
+ df_list = df_list[start_row:end_row]
61
+
62
+ for record in df_list:
63
+ id = record['id']
64
+ if os.path.exists(f'{save_dir}/{id}.pth'): continue
65
+ label = record['caption']
66
+ if id in videos:
67
+ self.labels.append(label)
69
+ self.videos.append(id)
70
+ else:
71
+ missing_videos.append(id)
72
+
73
+ log.info(f'{len(videos)} videos found in {root}')
74
+ log.info(f'{len(self.videos)} videos found in {tsv_path}')
75
+ log.info(f'{len(missing_videos)} videos missing in {root}')
76
+
77
+ self.sample_rate = sample_rate
78
+ self.duration_sec = duration_sec
79
+
80
+ self.expected_audio_length = self.audio_samples
81
+ self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
82
+ self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
83
+
84
+ self.clip_transform = v2.Compose([
85
+ v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
86
+ v2.ToImage(),
87
+ v2.ToDtype(torch.float32, scale=True),
88
+ ])
89
+
90
+ self.sync_transform = v2.Compose([
91
+ v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
92
+ v2.CenterCrop(_SYNC_SIZE),
93
+ v2.ToImage(),
94
+ v2.ToDtype(torch.float32, scale=True),
95
+ v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
96
+ ])
97
+
98
+ self.resampler = {}
99
+
100
+ def sample(self, idx: int) -> dict[str, torch.Tensor]:
101
+ video_id = self.videos[idx]
102
+ label = self.labels[idx]
103
+
104
+ reader = StreamingMediaDecoder(self.root / (video_id + '.mp4'))
105
+ reader.add_basic_video_stream(
106
+ frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
107
+ frame_rate=_CLIP_FPS,
108
+ format='rgb24',
109
+ )
110
+ reader.add_basic_video_stream(
111
+ frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
112
+ frame_rate=_SYNC_FPS,
113
+ format='rgb24',
114
+ )
115
+ reader.add_basic_audio_stream(frames_per_chunk=2**30,)
116
+
117
+ reader.fill_buffer()
118
+ data_chunk = reader.pop_chunks()
119
+
120
+ clip_chunk = data_chunk[0]
121
+ sync_chunk = data_chunk[1]
122
+ audio_chunk = data_chunk[2]
123
+ if len(audio_chunk.shape) != 2:
124
+ raise RuntimeError(f'error audio shape {video_id}')
125
+ if clip_chunk is None:
126
+ raise RuntimeError(f'CLIP video returned None {video_id}')
127
+ # if clip_chunk.shape[0] < self.clip_expected_length:
128
+ # raise RuntimeError(
129
+ # f'CLIP video too short {video_id}, expected {self.clip_expected_length}, got {clip_chunk.shape[0]}'
130
+ # )
131
+
132
+ if sync_chunk is None:
133
+ raise RuntimeError(f'Sync video returned None {video_id}')
134
+ # if sync_chunk.shape[0] < self.sync_expected_length:
135
+ # raise RuntimeError(
136
+ # f'Sync video too short {video_id}, expected {self.sync_expected_length}, got {sync_chunk.shape[0]}'
137
+ # )
138
+ # import ipdb
139
+ # ipdb.set_trace()
140
+ # process audio
141
+ sample_rate = int(reader.get_out_stream_info(2).sample_rate)
142
+ audio_chunk = audio_chunk.transpose(0, 1)
143
+ abs_max = audio_chunk[0].abs().max()
144
+ # audio_chunk = audio_chunk.mean(dim=0) # mono
145
+ # if self.normalize_audio:
146
+ # abs_max = audio_chunk.abs().max()
147
+ # audio_chunk = audio_chunk / abs_max * 0.95
148
+ if abs_max <= 1e-6:
149
+ if audio_chunk.shape[0] > 1 and audio_chunk[1].abs().max() > 1e-6:
150
+ audio_chunk = audio_chunk[1:2]
151
+ else:
152
+ raise RuntimeError(f'Audio is silent {video_id}')
153
+
154
+
155
+ # if abs_max <= 1e-6:
156
+ # raise RuntimeError(f'Audio is silent {video_id}')
157
+
158
+ # ensure the stereo audio
159
+ if audio_chunk.shape[0] < 2:
160
+ audio_chunk = audio_chunk.repeat(2, 1)
161
+
162
+ # resample
163
+ if sample_rate == self.sample_rate:
164
+ audio_chunk = audio_chunk
165
+ else:
166
+ if sample_rate not in self.resampler:
167
+ # https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html#kaiser-best
168
+ self.resampler[sample_rate] = torchaudio.transforms.Resample(
169
+ sample_rate,
170
+ self.sample_rate,
171
+ lowpass_filter_width=64,
172
+ rolloff=0.9475937167399596,
173
+ resampling_method='sinc_interp_kaiser',
174
+ beta=14.769656459379492,
175
+ )
176
+ audio_chunk = self.resampler[sample_rate](audio_chunk)
177
+
178
+ if audio_chunk.shape[1] < self.expected_audio_length:
179
+ # zero-padding audio
180
+ padding_length = self.expected_audio_length - audio_chunk.shape[1]
181
+ # Create a padding tensor of shape [batch_size, padding_length] filled with zeros
182
+ padding = torch.zeros(audio_chunk.shape[0], padding_length)
183
+ # Concatenate the original audio and the padding along dimension 1
184
+ audio_chunk = torch.cat((audio_chunk, padding), dim=1)
185
+ # raise RuntimeError(f'Audio too short {video_id}')
186
+ audio_chunk = audio_chunk[:,:self.expected_audio_length]
187
+ # truncate the video
188
+ clip_chunk = clip_chunk[:self.clip_expected_length]
189
+ # import ipdb
190
+ # ipdb.set_trace()
191
+ if clip_chunk.shape[0] != self.clip_expected_length:
192
+ current_length = clip_chunk.shape[0]
193
+ padding_needed = self.clip_expected_length - current_length
194
+
195
+ # Allow at most 3 frames of repeated-frame padding
196
+ assert padding_needed < 4, f'At most 3 frames of padding allowed, but {padding_needed} needed'
197
+
198
+ # If assertion passes, proceed with padding
199
+ if padding_needed > 0:
200
+ last_frame = clip_chunk[-1]
201
+ log.info(last_frame.shape)
202
+ # Repeat the last frame to reach the expected length
203
+ padding = last_frame.repeat(padding_needed, 1, 1, 1)
204
+ clip_chunk = torch.cat((clip_chunk, padding), dim=0)
205
+ # raise RuntimeError(f'CLIP video wrong length {video_id}, '
206
+ # f'expected {self.clip_expected_length}, '
207
+ # f'got {clip_chunk.shape[0]}')
208
+ # save_image(clip_chunk[0] / 255.0,'ori.png')
209
+ clip_chunk = self.clip_transform(clip_chunk)
210
+ # temp_img = clip_chunk[0].permute(1, 2, 0) * 255
211
+ # save_image(clip_chunk[0],'scale.png')
212
+ sync_chunk = sync_chunk[:self.sync_expected_length]
213
+ if sync_chunk.shape[0] != self.sync_expected_length:
214
+ # pad by repeating the last frame, up to the limit asserted below
215
+ current_length = sync_chunk.shape[0]
216
+ last_frame = sync_chunk[-1]
217
+ # repeat the last frame to reach the expected length
218
+ padding = last_frame.repeat(self.sync_expected_length - current_length, 1, 1, 1)
219
+ assert self.sync_expected_length - current_length < 12, f'sync padding limited to 11 frames, but {self.sync_expected_length - current_length} needed'
220
+ sync_chunk = torch.cat((sync_chunk, padding), dim=0)
221
+ # raise RuntimeError(f'Sync video wrong length {video_id}, '
222
+ # f'expected {self.sync_expected_length}, '
223
+ # f'got {sync_chunk.shape[0]}')
224
+
225
+ sync_chunk = self.sync_transform(sync_chunk)
226
+ assert audio_chunk.shape[1] == self.expected_audio_length and clip_chunk.shape[0] == self.clip_expected_length \
227
+ and sync_chunk.shape[0] == self.sync_expected_length, 'error processed data shape'
228
+ data = {
229
+ 'id': video_id,
230
+ 'caption': label,
231
+ 'audio': audio_chunk,
232
+ 'clip_video': clip_chunk,
233
+ 'sync_video': sync_chunk,
234
+ }
235
+
236
+ return data
237
+
238
+ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
239
+ try:
240
+ return self.sample(idx)
241
+ except Exception as e:
242
+ log.error(f'Error loading video {self.videos[idx]}: {e}')
243
+ return None
244
+
245
+ def __len__(self):
246
+ return len(self.labels)
247
+
248
+
249
+ # dataset = VGGSound(
250
+ # root="data/vggsound/video/test",
251
+ # tsv_path="data/vggsound/split_txt/temp.csv",
252
+ # sample_rate=44100,
253
+ # duration_sec=9.0,
254
+ # audio_samples=397312,
255
+ # start_row=0,
256
+ # end_row=None,
257
+ # save_dir="data/vggsound/video_latents_text/test"
258
+ # )
259
+ # dataset[0]
data_utils/v2a_utils/vggsound_224.py ADDED
@@ -0,0 +1,320 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional, Union
4
+ from PIL import Image
5
+
6
+ import pandas as pd
7
+ import torch
8
+ import torchaudio
9
+ from torch.utils.data.dataset import Dataset
10
+ from torchvision.transforms import v2
11
+ from torio.io import StreamingMediaDecoder
12
+ from torchvision.utils import save_image
13
+ from transformers import AutoProcessor
14
+ import torch.nn.functional as F
15
+ import numpy as np
16
+
17
+ import logging
18
+ log = logging.getLogger()
19
+
20
+ _CLIP_SIZE = 224
21
+ _CLIP_FPS = 8.0
22
+
23
+ _SYNC_SIZE = 224
24
+ _SYNC_FPS = 25.0
25
+
26
+ def save_tensor_as_image(tensor, save_path):
27
+ """
28
+ Save an RGB image array of shape (1, 3, H, W) as an image file.
29
+
30
+ :param tensor: input NumPy array of shape (1, 3, H, W).
31
+ :param save_path: path where the image is saved.
32
+ """
33
+ # # remove the batch dimension, giving (3, H, W)
34
+ # tensor = tensor.squeeze(0)
35
+
36
+ # reorder the axes to (H, W, 3)
37
+ image_array = np.transpose(tensor, (1, 2, 0))
38
+
39
+ # check that the array has a suitable dtype
40
+ if image_array.dtype != np.uint8:
41
+ # if it is not uint8, rescale to [0, 255] and then convert
42
+ image_array = (image_array - image_array.min()) / (image_array.max() - image_array.min()) * 255
43
+ image_array = image_array.astype(np.uint8)
44
+
45
+ # create the image object
46
+ image = Image.fromarray(image_array)
47
+
48
+ # save the image
49
+ image.save(save_path)
50
+ print(f"Image saved to {save_path}")
51
+
52
+ def pad_to_square(video_tensor):
53
+ # validate the input shape
54
+ if len(video_tensor.shape) != 4:
55
+ raise ValueError("Input tensor must have shape (l, c, h, w)")
56
+
57
+ l, c, h, w = video_tensor.shape
58
+ max_side = max(h, w)
59
+
60
+ # compute the padding needed for each side: (left, right, top, bottom)
61
+ pad_h = max_side - h
62
+ pad_w = max_side - w
63
+
64
+ # build the padding tuple (left, right, top, bottom)
65
+ # padding is applied to the last two dimensions (h and w), so only those two are specified
66
+ padding = (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)
67
+
68
+ # pad the video tensor with F.pad
69
+ # the padding argument is (left, right, top, bottom)
70
+ video_padded = F.pad(video_tensor, pad=padding, mode='constant', value=0)
71
+
72
+ return video_padded
73
+
74
+ class VGGSound(Dataset):
75
+
76
+ def __init__(
77
+ self,
78
+ root: Union[str, Path],
79
+ *,
80
+ tsv_path: Union[str, Path] = 'dataset/vggsound/split_txt/train_caption.csv',
81
+ sample_rate: int = 44_100,
82
+ duration_sec: float = 9.0,
83
+ audio_samples: Optional[int] = 397312,
84
+ normalize_audio: bool = False,
85
+ start_row: Optional[int] = None,
86
+ end_row: Optional[int] = None,
87
+ save_dir: str = 'data/vggsound/video_latents_text/train'
88
+ ):
89
+ self.root = Path(root)
90
+ self.normalize_audio = normalize_audio
91
+ if audio_samples is None:
92
+ self.audio_samples = int(sample_rate * duration_sec)
93
+ else:
94
+ self.audio_samples = audio_samples
95
+ effective_duration = audio_samples / sample_rate
96
+ # make sure the duration is close enough, within 15ms
97
+ assert abs(effective_duration - duration_sec) < 0.015, \
98
+ f'audio_samples {audio_samples} does not match duration_sec {duration_sec}'
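+ # e.g. the default 397312 samples is ~9.009 s at 44.1 kHz, well inside the 15 ms tolerance checked above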
99
+
100
+ # videos = sorted(os.listdir(self.root))
101
+ # videos = set([Path(v).stem for v in videos]) # remove extensions
102
+ videos = []
103
+ self.labels = []
104
+ self.videos = []
105
+ missing_videos = []
106
+ # read the tsv for subset information
107
+ df_list = pd.read_csv(tsv_path, sep=',', dtype={'id': str}).to_dict('records')
108
+
109
+ # restrict processing to the requested row range
110
+ if start_row is not None and end_row is not None:
111
+ df_list = df_list[start_row:end_row]
112
+
113
+ for record in df_list:
114
+ id = record['id']
115
+ if os.path.exists(f'{save_dir}/{id}.pth'): continue
116
+ label = record['label']
117
+ # if id in videos:
118
+ self.labels.append(label)
119
+ # self.labels[id] = label
120
+ self.videos.append(id)
121
+ # else:
122
+ # missing_videos.append(id)
123
+
124
+ log.info(f'{len(videos)} videos found in {root}')
125
+ log.info(f'{len(self.videos)} videos found in {tsv_path}')
126
+ log.info(f'{len(missing_videos)} videos missing in {root}')
127
+
128
+ self.sample_rate = sample_rate
129
+ self.duration_sec = duration_sec
130
+
131
+ self.expected_audio_length = self.audio_samples
132
+ self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
133
+ self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
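+ # with the defaults (8 fps CLIP stream, 25 fps sync stream, 9 s clips) this gives 72 CLIP frames and 225 sync frames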
134
+
135
+ self.clip_transform = v2.Compose([
136
+ v2.Lambda(pad_to_square), # pad to a square first
137
+ v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
138
+ v2.ToImage(),
139
+ v2.ToDtype(torch.float32, scale=True),
140
+ ])
141
+ self.clip_processor = AutoProcessor.from_pretrained("facebook/metaclip-h14-fullcc2.5b")
142
+ self.sync_transform = v2.Compose([
143
+ v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
144
+ v2.CenterCrop(_SYNC_SIZE),
145
+ v2.ToImage(),
146
+ v2.ToDtype(torch.float32, scale=True),
147
+ v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
148
+ ])
149
+
150
+ self.resampler = {}
151
+
152
+ def sample(self, idx: int) -> dict[str, torch.Tensor]:
153
+ video_id = self.videos[idx]
154
+ label = self.labels[idx]
155
+
156
+ reader = StreamingMediaDecoder(self.root / (video_id + '.mp4'))
157
+ reader.add_basic_video_stream(
158
+ frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
159
+ frame_rate=_CLIP_FPS,
160
+ format='rgb24',
161
+ )
162
+ reader.add_basic_video_stream(
163
+ frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
164
+ frame_rate=_SYNC_FPS,
165
+ format='rgb24',
166
+ )
167
+ reader.add_basic_audio_stream(frames_per_chunk=2**30,)
168
+
169
+ reader.fill_buffer()
170
+ data_chunk = reader.pop_chunks()
171
+
172
+ clip_chunk = data_chunk[0]
173
+ sync_chunk = data_chunk[1]
174
+ audio_chunk = data_chunk[2]
175
+ if len(audio_chunk.shape) != 2:
176
+ raise RuntimeError(f'error audio shape {video_id}')
177
+ if clip_chunk is None:
178
+ raise RuntimeError(f'CLIP video returned None {video_id}')
179
+ # if clip_chunk.shape[0] < self.clip_expected_length:
180
+ # raise RuntimeError(
181
+ # f'CLIP video too short {video_id}, expected {self.clip_expected_length}, got {clip_chunk.shape[0]}'
182
+ # )
183
+
184
+ if sync_chunk is None:
185
+ raise RuntimeError(f'Sync video returned None {video_id}')
186
+ # if sync_chunk.shape[0] < self.sync_expected_length:
187
+ # raise RuntimeError(
188
+ # f'Sync video too short {video_id}, expected {self.sync_expected_length}, got {sync_chunk.shape[0]}'
189
+ # )
190
+ # import ipdb
191
+ # ipdb.set_trace()
192
+ # process audio
193
+ # import ipdb
194
+ # ipdb.set_trace()
195
+ sample_rate = int(reader.get_out_stream_info(2).sample_rate)
196
+ audio_chunk = audio_chunk.transpose(0, 1)
197
+ abs_max = audio_chunk[0].abs().max()
198
+ # audio_chunk = audio_chunk.mean(dim=0) # mono
199
+ # if self.normalize_audio:
200
+ # abs_max = audio_chunk.abs().max()
201
+ # audio_chunk = audio_chunk / abs_max * 0.95
202
+ if abs_max <= 1e-6:
203
+ if audio_chunk.shape[0] > 1 and audio_chunk[1].abs().max() > 1e-6:
204
+ audio_chunk = audio_chunk[1:2]
205
+ else:
206
+ raise RuntimeError(f'Audio is silent {video_id}')
207
+
208
+ # ensure the stereo audio
209
+ if audio_chunk.shape[0] < 2:
210
+ audio_chunk = audio_chunk.repeat(2, 1)
211
+
212
+ # resample
213
+ if sample_rate == self.sample_rate:
214
+ audio_chunk = audio_chunk
215
+ else:
216
+ if sample_rate not in self.resampler:
217
+ # https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html#kaiser-best
218
+ self.resampler[sample_rate] = torchaudio.transforms.Resample(
219
+ sample_rate,
220
+ self.sample_rate,
221
+ lowpass_filter_width=64,
222
+ rolloff=0.9475937167399596,
223
+ resampling_method='sinc_interp_kaiser',
224
+ beta=14.769656459379492,
225
+ )
226
+ audio_chunk = self.resampler[sample_rate](audio_chunk)
227
+
228
+ if audio_chunk.shape[1] < self.expected_audio_length:
229
+ # zero-padding audio
230
+ padding_length = self.expected_audio_length - audio_chunk.shape[1]
231
+ # create a zero-valued padding tensor of shape [num_channels, padding_length]
232
+ padding = torch.zeros(audio_chunk.shape[0], padding_length)
233
+ # concatenate the original audio and the padding along dim 1
234
+ audio_chunk = torch.cat((audio_chunk, padding), dim=1)
235
+ # raise RuntimeError(f'Audio too short {video_id}')
236
+ audio_chunk = audio_chunk[:,:self.expected_audio_length]
237
+ # truncate the video
238
+ clip_chunk = clip_chunk[:self.clip_expected_length]
239
+ # import ipdb
240
+ # ipdb.set_trace()
241
+ if clip_chunk.shape[0] != self.clip_expected_length:
242
+ current_length = clip_chunk.shape[0]
243
+ padding_needed = self.clip_expected_length - current_length
244
+
245
+ # Allow at most 3 frames of repeated-frame padding
246
+ assert padding_needed < 4, f'At most 3 frames of padding allowed, but {padding_needed} needed'
247
+
248
+ # If assertion passes, proceed with padding
249
+ if padding_needed > 0:
250
+ last_frame = clip_chunk[-1]
251
+ log.info(last_frame.shape)
252
+ # Repeat the last frame to reach the expected length
253
+ padding = last_frame.repeat(padding_needed, 1, 1, 1)
254
+ clip_chunk = torch.cat((clip_chunk, padding), dim=0)
255
+ # raise RuntimeError(f'CLIP video wrong length {video_id}, '
256
+ # f'expected {self.clip_expected_length}, '
257
+ # f'got {clip_chunk.shape[0]}')
258
+
259
+ # save_image(clip_chunk[0] / 255.0,'ori.png')
260
+ clip_chunk = pad_to_square(clip_chunk)
261
+ # save_image(clip_chunk[0] / 255.0,'square.png')
262
+ # clip_chunk = self.clip_transform(clip_chunk)
263
+ # import ipdb
264
+ # ipdb.set_trace()
265
+ clip_chunk = self.clip_processor(images=clip_chunk, return_tensors="pt")["pixel_values"]
266
+ # log.info(clip_chunk.shape)
267
+ # save_tensor_as_image(clip_chunk[0].numpy(),'scale.png')
268
+ # log.info(clip_chunk[0])
269
+ # clip_chunk = outputs
270
+ # text_ids = outputs["input_ids"]
271
+ # temp_img = clip_chunk[0].permute(1, 2, 0) * 255
272
+ # save_image(clip_chunk[0],'scale.png')
273
+ sync_chunk = sync_chunk[:self.sync_expected_length]
274
+ if sync_chunk.shape[0] != self.sync_expected_length:
275
+ # pad by repeating the last frame, up to the limit asserted below
276
+ current_length = sync_chunk.shape[0]
277
+ last_frame = sync_chunk[-1]
278
+ # repeat the last frame to reach the expected length
279
+ padding = last_frame.repeat(self.sync_expected_length - current_length, 1, 1, 1)
280
+ assert self.sync_expected_length - current_length < 12, f'sync padding limited to 11 frames, but {self.sync_expected_length - current_length} needed'
281
+ sync_chunk = torch.cat((sync_chunk, padding), dim=0)
282
+ # raise RuntimeError(f'Sync video wrong length {video_id}, '
283
+ # f'expected {self.sync_expected_length}, '
284
+ # f'got {sync_chunk.shape[0]}')
285
+
286
+ sync_chunk = self.sync_transform(sync_chunk)
287
+ assert audio_chunk.shape[1] == self.expected_audio_length and clip_chunk.shape[0] == self.clip_expected_length \
288
+ and sync_chunk.shape[0] == self.sync_expected_length, 'error processed data shape'
289
+ data = {
290
+ 'id': video_id,
291
+ 'caption': label,
292
+ 'audio': audio_chunk,
293
+ 'clip_video': clip_chunk,
294
+ 'sync_video': sync_chunk,
295
+ }
296
+
297
+ return data
298
+
299
+ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
300
+ try:
301
+ return self.sample(idx)
302
+ except Exception as e:
303
+ log.error(f'Error loading video {self.videos[idx]}: {e}')
304
+ return None
305
+
306
+ def __len__(self):
307
+ return len(self.labels)
308
+
309
+
310
+ # dataset = VGGSound(
311
+ # root="data/vggsound/video/train",
312
+ # tsv_path="data/vggsound/split_txt/temp.csv",
313
+ # sample_rate=44100,
314
+ # duration_sec=9.0,
315
+ # audio_samples=397312,
316
+ # start_row=0,
317
+ # end_row=None,
318
+ # save_dir="data/vggsound/video_224_latents_text/train"
319
+ # )
320
+ # dataset[0]
data_utils/v2a_utils/vggsound_224_no_audio.py ADDED
@@ -0,0 +1,275 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional, Union
4
+ from PIL import Image
5
+
6
+ import pandas as pd
7
+ import torch
8
+ import torchaudio
9
+ from torch.utils.data.dataset import Dataset
10
+ from torchvision.transforms import v2
11
+ from torio.io import StreamingMediaDecoder
12
+ from torchvision.utils import save_image
13
+ from transformers import AutoProcessor
14
+ import torch.nn.functional as F
15
+ import numpy as np
16
+
17
+ import logging
18
+ log = logging.getLogger()
19
+
20
+ _CLIP_SIZE = 224
21
+ _CLIP_FPS = 8.0
22
+
23
+ _SYNC_SIZE = 224
24
+ _SYNC_FPS = 25.0
25
+
26
+ def save_tensor_as_image(tensor, save_path):
27
+ """
28
+ Save an RGB image array of shape (1, 3, H, W) as an image file.
29
+
30
+ :param tensor: input NumPy array of shape (1, 3, H, W).
31
+ :param save_path: path where the image is saved.
32
+ """
33
+ # # remove the batch dimension, giving (3, H, W)
34
+ # tensor = tensor.squeeze(0)
35
+
36
+ # reorder the axes to (H, W, 3)
37
+ image_array = np.transpose(tensor, (1, 2, 0))
38
+
39
+ # check that the array has a suitable dtype
40
+ if image_array.dtype != np.uint8:
41
+ # if it is not uint8, rescale to [0, 255] and then convert
42
+ image_array = (image_array - image_array.min()) / (image_array.max() - image_array.min()) * 255
43
+ image_array = image_array.astype(np.uint8)
44
+
45
+ # create the image object
46
+ image = Image.fromarray(image_array)
47
+
48
+ # save the image
49
+ image.save(save_path)
50
+ print(f"Image saved to {save_path}")
51
+
52
+ def pad_to_square(video_tensor):
53
+ # validate the input shape
54
+ if len(video_tensor.shape) != 4:
55
+ raise ValueError("Input tensor must have shape (l, c, h, w)")
56
+
57
+ l, c, h, w = video_tensor.shape
58
+ max_side = max(h, w)
59
+
60
+ # compute the padding needed for each side: (left, right, top, bottom)
61
+ pad_h = max_side - h
62
+ pad_w = max_side - w
63
+
64
+ # build the padding tuple (left, right, top, bottom)
65
+ # padding is applied to the last two dimensions (h and w), so only those two are specified
66
+ padding = (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)
67
+
68
+ # pad the video tensor with F.pad
69
+ # the padding argument is (left, right, top, bottom)
70
+ video_padded = F.pad(video_tensor, pad=padding, mode='constant', value=0)
71
+
72
+ return video_padded
73
+
74
+ class VGGSound(Dataset):
75
+
76
+ def __init__(
77
+ self,
78
+ root: Union[str, Path],
79
+ *,
80
+ tsv_path: Union[str, Path] = 'dataset/vggsound/split_txt/train_caption.csv',
81
+ sample_rate: int = 44_100,
82
+ duration_sec: float = 9.0,
83
+ audio_samples: Optional[int] = 397312,
84
+ normalize_audio: bool = False,
85
+ start_row: Optional[int] = None,
86
+ end_row: Optional[int] = None,
87
+ save_dir: str = 'data/vggsound/video_latents_text/train'
88
+ ):
89
+ self.root = Path(root)
90
+ self.normalize_audio = normalize_audio
91
+ if audio_samples is None:
92
+ self.audio_samples = int(sample_rate * duration_sec)
93
+ else:
94
+ self.audio_samples = audio_samples
95
+ effective_duration = audio_samples / sample_rate
96
+ # make sure the duration is close enough, within 15ms
97
+ assert abs(effective_duration - duration_sec) < 0.015, \
98
+ f'audio_samples {audio_samples} does not match duration_sec {duration_sec}'
99
+
100
+ # videos = sorted(os.listdir(self.root))
101
+ # videos = set([Path(v).stem for v in videos]) # remove extensions
102
+ videos = []
103
+ self.labels = []
104
+ self.videos = []
105
+ self.caption_cot = []
106
+ missing_videos = []
107
+ # read the tsv for subset information
108
+ df_list = pd.read_csv(tsv_path, sep=',', dtype={'id': str}).to_dict('records')
109
+
110
+ # restrict processing to the requested row range
111
+ if start_row is not None and end_row is not None:
112
+ df_list = df_list[start_row:end_row]
113
+
114
+ for record in df_list:
115
+ id = record['id']
116
+ if os.path.exists(f'{save_dir}/{id}.pth'): continue
117
+ label = record['caption']
118
+ caption_cot = record['caption_cot']
119
+ # if id in videos:
120
+ self.labels.append(label)
121
+ # self.labels[id] = label
122
+ self.videos.append(id)
123
+ self.caption_cot.append(caption_cot)
124
+ # else:
125
+ # missing_videos.append(id)
126
+
127
+ log.info(f'{len(videos)} videos found in {root}')
128
+ log.info(f'{len(self.videos)} videos found in {tsv_path}')
129
+ log.info(f'{len(missing_videos)} videos missing in {root}')
130
+
131
+ self.sample_rate = sample_rate
132
+ self.duration_sec = duration_sec
133
+
134
+ self.expected_audio_length = self.audio_samples
135
+ self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
136
+ self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
137
+
138
+ self.clip_transform = v2.Compose([
139
+ v2.Lambda(pad_to_square), # pad to a square first
140
+ v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
141
+ v2.ToImage(),
142
+ v2.ToDtype(torch.float32, scale=True),
143
+ ])
144
+ self.clip_processor = AutoProcessor.from_pretrained("facebook/metaclip-h14-fullcc2.5b")
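+ # the MetaCLIP processor performs its own resizing and normalisation, so self.clip_transform is left unused in sample() below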
145
+ self.sync_transform = v2.Compose([
146
+ v2.Resize(_SYNC_SIZE, interpolation=v2.InterpolationMode.BICUBIC),
147
+ v2.CenterCrop(_SYNC_SIZE),
148
+ v2.ToImage(),
149
+ v2.ToDtype(torch.float32, scale=True),
150
+ v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
151
+ ])
152
+
153
+ self.resampler = {}
154
+
155
+ def sample(self, idx: int) -> dict[str, torch.Tensor]:
156
+ video_id = self.videos[idx]
157
+ label = self.labels[idx]
158
+ caption_cot = self.caption_cot[idx]
159
+
160
+ reader = StreamingMediaDecoder(self.root / (video_id + '.mp4'))
161
+ reader.add_basic_video_stream(
162
+ frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
163
+ frame_rate=_CLIP_FPS,
164
+ format='rgb24',
165
+ )
166
+ reader.add_basic_video_stream(
167
+ frames_per_chunk=int(_SYNC_FPS * self.duration_sec),
168
+ frame_rate=_SYNC_FPS,
169
+ format='rgb24',
170
+ )
171
+ # reader.add_basic_audio_stream(frames_per_chunk=2**30,)
172
+
173
+ reader.fill_buffer()
174
+ data_chunk = reader.pop_chunks()
175
+
176
+ clip_chunk = data_chunk[0]
177
+ sync_chunk = data_chunk[1]
178
+ # audio_chunk = data_chunk[2]
179
+ # if len(audio_chunk.shape) != 2:
180
+ # raise RuntimeError(f'error audio shape {video_id}')
181
+ if clip_chunk is None:
182
+ raise RuntimeError(f'CLIP video returned None {video_id}')
183
+ # if clip_chunk.shape[0] < self.clip_expected_length:
184
+ # raise RuntimeError(
185
+ # f'CLIP video too short {video_id}, expected {self.clip_expected_length}, got {clip_chunk.shape[0]}'
186
+ # )
187
+
188
+ if sync_chunk is None:
189
+ raise RuntimeError(f'Sync video returned None {video_id}')
190
+
191
+ # truncate the video
192
+ clip_chunk = clip_chunk[:self.clip_expected_length]
193
+ # import ipdb
194
+ # ipdb.set_trace()
195
+ if clip_chunk.shape[0] != self.clip_expected_length:
196
+ current_length = clip_chunk.shape[0]
197
+ padding_needed = self.clip_expected_length - current_length
198
+
199
+ # Pad with the last frame for however many frames are missing (the length assert below is disabled)
200
+ # assert padding_needed < 4, f'Padding no more than 2 frames allowed, but {padding_needed} needed'
201
+
202
+ # If assertion passes, proceed with padding
203
+ if padding_needed > 0:
204
+ last_frame = clip_chunk[-1]
205
+ log.info(last_frame.shape)
206
+ # Repeat the last frame to reach the expected length
207
+ padding = last_frame.repeat(padding_needed, 1, 1, 1)
208
+ clip_chunk = torch.cat((clip_chunk, padding), dim=0)
209
+ # raise RuntimeError(f'CLIP video wrong length {video_id}, '
210
+ # f'expected {self.clip_expected_length}, '
211
+ # f'got {clip_chunk.shape[0]}')
212
+
213
+ # save_image(clip_chunk[0] / 255.0,'ori.png')
214
+ clip_chunk = pad_to_square(clip_chunk)
215
+ # save_image(clip_chunk[0] / 255.0,'square.png')
216
+ # clip_chunk = self.clip_transform(clip_chunk)
217
+ # import ipdb
218
+ # ipdb.set_trace()
219
+ clip_chunk = self.clip_processor(images=clip_chunk, return_tensors="pt")["pixel_values"]
220
+ # log.info(clip_chunk.shape)
221
+ # save_tensor_as_image(clip_chunk[0].numpy(),'scale.png')
222
+ # log.info(clip_chunk[0])
223
+ # clip_chunk = outputs
224
+ # text_ids = outputs["input_ids"]
225
+ # temp_img = clip_chunk[0].permute(1, 2, 0) * 255
226
+ # save_image(clip_chunk[0],'scale.png')
227
+ sync_chunk = sync_chunk[:self.sync_expected_length]
228
+ if sync_chunk.shape[0] != self.sync_expected_length:
229
+ # pad by repeating the last frame (the length assert below is disabled)
230
+ current_length = sync_chunk.shape[0]
231
+ last_frame = sync_chunk[-1]
232
+ # repeat the last frame to reach the expected length
233
+ padding = last_frame.repeat(self.sync_expected_length - current_length, 1, 1, 1)
234
+ # assert self.sync_expected_length - current_length < 12, f'sync can pad no more than 2 while {self.sync_expected_length - current_length}'
235
+ sync_chunk = torch.cat((sync_chunk, padding), dim=0)
236
+ # raise RuntimeError(f'Sync video wrong length {video_id}, '
237
+ # f'expected {self.sync_expected_length}, '
238
+ # f'got {sync_chunk.shape[0]}')
239
+
240
+ sync_chunk = self.sync_transform(sync_chunk)
241
+ # assert audio_chunk.shape[1] == self.expected_audio_length and clip_chunk.shape[0] == self.clip_expected_length \
242
+ # and sync_chunk.shape[0] == self.sync_expected_length, 'error processed data shape'
243
+ data = {
244
+ 'id': video_id,
245
+ 'caption': label,
246
+ # 'audio': audio_chunk,
247
+ 'clip_video': clip_chunk,
248
+ 'sync_video': sync_chunk,
249
+ 'caption_cot': caption_cot,
250
+ }
251
+
252
+ return data
253
+
254
+ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
255
+ try:
256
+ return self.sample(idx)
257
+ except Exception as e:
258
+ log.error(f'Error loading video {self.videos[idx]}: {e}')
259
+ return None
260
+
261
+ def __len__(self):
262
+ return len(self.labels)
263
+
264
+
265
+ # dataset = VGGSound(
266
+ # root="data/vggsound/video/train",
267
+ # tsv_path="data/vggsound/split_txt/temp.csv",
268
+ # sample_rate=44100,
269
+ # duration_sec=9.0,
270
+ # audio_samples=397312,
271
+ # start_row=0,
272
+ # end_row=None,
273
+ # save_dir="data/vggsound/video_224_latents_text/train"
274
+ # )
275
+ # dataset[0]
data_utils/v2a_utils/vggsound_224_no_sync.py ADDED
@@ -0,0 +1,223 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional, Union
4
+ from PIL import Image
5
+
6
+ import pandas as pd
7
+ import torch
8
+ import torchaudio
9
+ from torch.utils.data.dataset import Dataset
10
+ from torchvision.transforms import v2
11
+ from torio.io import StreamingMediaDecoder
12
+ from torchvision.utils import save_image
13
+ from transformers import AutoProcessor
14
+ import torch.nn.functional as F
15
+ import numpy as np
16
+
17
+ import logging
18
+ log = logging.getLogger()
19
+
20
+ _CLIP_SIZE = 224
21
+ _CLIP_FPS = 8.0
22
+
23
+ _SYNC_SIZE = 224
24
+ _SYNC_FPS = 25.0
25
+
26
+ def save_tensor_as_image(tensor, save_path):
27
+ """
28
+ Save an RGB image array of shape (1, 3, H, W) as an image file.
29
+
30
+ :param tensor: input NumPy array of shape (1, 3, H, W).
31
+ :param save_path: path where the image is saved.
32
+ """
33
+ # # remove the batch dimension, giving (3, H, W)
34
+ # tensor = tensor.squeeze(0)
35
+
36
+ # reorder the axes to (H, W, 3)
37
+ image_array = np.transpose(tensor, (1, 2, 0))
38
+
39
+ # check that the array has a suitable dtype
40
+ if image_array.dtype != np.uint8:
41
+ # if it is not uint8, rescale to [0, 255] and then convert
42
+ image_array = (image_array - image_array.min()) / (image_array.max() - image_array.min()) * 255
43
+ image_array = image_array.astype(np.uint8)
44
+
45
+ # create the image object
46
+ image = Image.fromarray(image_array)
47
+
48
+ # save the image
49
+ image.save(save_path)
50
+ print(f"Image saved to {save_path}")
51
+
52
+ def pad_to_square(video_tensor):
53
+ # validate the input shape
54
+ if len(video_tensor.shape) != 4:
55
+ raise ValueError("Input tensor must have shape (l, c, h, w)")
56
+
57
+ l, c, h, w = video_tensor.shape
58
+ max_side = max(h, w)
59
+
60
+ # compute the padding needed for each side: (left, right, top, bottom)
61
+ pad_h = max_side - h
62
+ pad_w = max_side - w
63
+
64
+ # build the padding tuple (left, right, top, bottom)
65
+ # padding is applied to the last two dimensions (h and w), so only those two are specified
66
+ padding = (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)
67
+
68
+ # pad the video tensor with F.pad
69
+ # the padding argument is (left, right, top, bottom)
70
+ video_padded = F.pad(video_tensor, pad=padding, mode='constant', value=0)
71
+
72
+ return video_padded
73
+
74
+ class VGGSound(Dataset):
75
+
76
+ def __init__(
77
+ self,
78
+ root: Union[str, Path],
79
+ *,
80
+ tsv_path: Union[str, Path] = 'dataset/vggsound/split_txt/train_caption.csv',
81
+ sample_rate: int = 44_100,
82
+ duration_sec: float = 9.0,
83
+ audio_samples: Optional[int] = 397312,
84
+ normalize_audio: bool = False,
85
+ start_row: Optional[int] = None,
86
+ end_row: Optional[int] = None,
87
+ save_dir: str = 'data/vggsound/video_latents_text/train'
88
+ ):
89
+ self.root = Path(root)
90
+ self.normalize_audio = normalize_audio
91
+ if audio_samples is None:
92
+ self.audio_samples = int(sample_rate * duration_sec)
93
+ else:
94
+ self.audio_samples = audio_samples
95
+ effective_duration = audio_samples / sample_rate
96
+ # make sure the duration is close enough, within 15ms
97
+ assert abs(effective_duration - duration_sec) < 0.015, \
98
+ f'audio_samples {audio_samples} does not match duration_sec {duration_sec}'
99
+
100
+ # videos = sorted(os.listdir(self.root))
101
+ # videos = set([Path(v).stem for v in videos]) # remove extensions
102
+ videos = []
103
+ self.labels = []
104
+ self.videos = []
105
+ missing_videos = []
106
+ # read the tsv for subset information
107
+ df_list = pd.read_csv(tsv_path, sep=',', dtype={'id': str}).to_dict('records')
108
+
109
+ # restrict processing to the requested row range
110
+ if start_row is not None and end_row is not None:
111
+ df_list = df_list[start_row:end_row]
112
+
113
+ for record in df_list:
114
+ id = record['id']
115
+ if os.path.exists(f'{save_dir}/{id}.pth'): continue
116
+ label = record['label']
117
+ # if id in videos:
118
+ self.labels.append(label)
119
+ # self.labels[id] = label
120
+ self.videos.append(id)
121
+ # else:
122
+ # missing_videos.append(id)
123
+
124
+ log.info(f'{len(videos)} videos found in {root}')
125
+ log.info(f'{len(self.videos)} videos found in {tsv_path}')
126
+ log.info(f'{len(missing_videos)} videos missing in {root}')
127
+
128
+ self.sample_rate = sample_rate
129
+ self.duration_sec = duration_sec
130
+
131
+ self.expected_audio_length = self.audio_samples
132
+ self.clip_expected_length = int(_CLIP_FPS * self.duration_sec)
133
+ self.sync_expected_length = int(_SYNC_FPS * self.duration_sec)
134
+
135
+ self.clip_transform = v2.Compose([
136
+ v2.Lambda(pad_to_square), # pad to a square first
137
+ v2.Resize((_CLIP_SIZE, _CLIP_SIZE), interpolation=v2.InterpolationMode.BICUBIC),
138
+ v2.ToImage(),
139
+ v2.ToDtype(torch.float32, scale=True),
140
+ ])
141
+ self.clip_processor = AutoProcessor.from_pretrained("useful_ckpts/metaclip-huge")
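+ # NOTE: this variant loads the MetaCLIP processor from a local useful_ckpts/ folder rather than the Hugging Face Hub id used by the other loaders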
142
+
143
+ self.resampler = {}
144
+
145
+ def sample(self, idx: int) -> dict[str, torch.Tensor]:
146
+ video_id = self.videos[idx]
147
+ label = self.labels[idx]
148
+
149
+ reader = StreamingMediaDecoder(self.root / (video_id + '.mp4'))
150
+ reader.add_basic_video_stream(
151
+ frames_per_chunk=int(_CLIP_FPS * self.duration_sec),
152
+ frame_rate=_CLIP_FPS,
153
+ format='rgb24',
154
+ )
155
+
156
+ reader.fill_buffer()
157
+ data_chunk = reader.pop_chunks()
158
+
159
+ clip_chunk = data_chunk[0]
160
+ if clip_chunk is None:
161
+ raise RuntimeError(f'CLIP video returned None {video_id}')
162
+
163
+
164
+ # truncate the video
165
+ clip_chunk = clip_chunk[:self.clip_expected_length]
166
+ # import ipdb
167
+ # ipdb.set_trace()
168
+ if clip_chunk.shape[0] != self.clip_expected_length:
169
+ current_length = clip_chunk.shape[0]
170
+ padding_needed = self.clip_expected_length - current_length
171
+
172
+ # Allow at most 3 frames of repeated-frame padding
173
+ assert padding_needed < 4, f'At most 3 frames of padding allowed, but {padding_needed} needed'
174
+
175
+ # If assertion passes, proceed with padding
176
+ if padding_needed > 0:
177
+ last_frame = clip_chunk[-1]
178
+ log.info(last_frame.shape)
179
+ # Repeat the last frame to reach the expected length
180
+ padding = last_frame.repeat(padding_needed, 1, 1, 1)
181
+ clip_chunk = torch.cat((clip_chunk, padding), dim=0)
182
+ # raise RuntimeError(f'CLIP video wrong length {video_id}, '
183
+ # f'expected {self.clip_expected_length}, '
184
+ # f'got {clip_chunk.shape[0]}')
185
+
186
+ # save_image(clip_chunk[0] / 255.0,'ori.png')
187
+ clip_chunk = pad_to_square(clip_chunk)
188
+ # save_image(clip_chunk[0] / 255.0,'square.png')
189
+ # clip_chunk = self.clip_transform(clip_chunk)
190
+ # import ipdb
191
+ # ipdb.set_trace()
192
+ clip_chunk = self.clip_processor(images=clip_chunk, return_tensors="pt")["pixel_values"]
193
+
194
+ data = {
195
+ 'id': video_id,
196
+ 'caption': label,
197
+ 'clip_video': clip_chunk,
198
+ }
199
+
200
+ return data
201
+
202
+ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
203
+ try:
204
+ return self.sample(idx)
205
+ except Exception as e:
206
+ log.error(f'Error loading video {self.videos[idx]}: {e}')
207
+ return None
208
+
209
+ def __len__(self):
210
+ return len(self.labels)
211
+
212
+
213
+ # dataset = VGGSound(
214
+ # root="data/vggsound/video/train",
215
+ # tsv_path="data/vggsound/split_txt/temp.csv",
216
+ # sample_rate=44100,
217
+ # duration_sec=9.0,
218
+ # audio_samples=397312,
219
+ # start_row=0,
220
+ # end_row=None,
221
+ # save_dir="data/vggsound/video_224_latents_text/train"
222
+ # )
223
+ # dataset[0]
data_utils/v2a_utils/vggsound_text.py ADDED
@@ -0,0 +1,109 @@
1
+ import logging
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Optional, Union
5
+
6
+ import pandas as pd
7
+ import torch
8
+ import torchaudio
9
+ from torch.utils.data.dataset import Dataset
10
+ from torchvision.transforms import v2
11
+ from torio.io import StreamingMediaDecoder
12
+ from torchvision.utils import save_image
13
+
14
+ log = logging.getLogger()
15
+
16
+ _CLIP_SIZE = 384
17
+ _CLIP_FPS = 8.0
18
+
19
+ _SYNC_SIZE = 224
20
+ _SYNC_FPS = 25.0
21
+
22
+
23
+ class VGGSound(Dataset):
24
+
25
+ def __init__(
26
+ self,
27
+ root: Union[str, Path],
28
+ *,
29
+ tsv_path: Union[str, Path] = 'dataset/vggsound/split_txt/train_caption.csv',
30
+ start_row: Optional[int] = None,
31
+ end_row: Optional[int] = None,
32
+ save_dir: str = 'data/vggsound/video_latents_text/train'
33
+ ):
34
+ self.root = Path(root)
35
+
36
+ # videos = sorted(os.listdir(self.root))
37
+ # videos = set([Path(v).stem for v in videos]) # remove extensions
38
+ videos = []
39
+ self.labels = []
40
+ self.cots = []
41
+ self.videos = []
42
+ missing_videos = []
43
+ # read the tsv for subset information
44
+ df_list = pd.read_csv(tsv_path, sep=',', dtype={'id': str}).to_dict('records')
45
+
46
+ # restrict processing to the requested row range
47
+ if start_row is not None and end_row is not None:
48
+ df_list = df_list[start_row:end_row]
49
+
50
+ for record in df_list:
51
+ id = record['id']
52
+ # if os.path.exists(f'{save_dir}/{id}.pth'):
53
+ # continue
54
+ # try:
55
+ # torch.load(f'{save_dir}/{id}.pth')
56
+ # continue
57
+ # except:
58
+ # print(f'error load file: {save_dir}/{id}.pth')
59
+ # os.system(f'rm -f {save_dir}/{id}.pth')
60
+ label = record['caption']
61
+ # if id in videos:
62
+ self.labels.append(label)
63
+ self.cots.append(record['caption_cot'])
64
+ # self.labels[id] = label
65
+ self.videos.append(id)
66
+ # else:
67
+ # missing_videos.append(id)
68
+
69
+ log.info(f'{len(videos)} videos found in {root}')
70
+ log.info(f'{len(self.videos)} videos found in {tsv_path}')
71
+ log.info(f'{len(missing_videos)} videos missing in {root}')
72
+
73
+
74
+
75
+
76
+ def sample(self, idx: int) -> dict[str, torch.Tensor]:
77
+ video_id = self.videos[idx]
78
+ label = self.labels[idx]
79
+ cot = self.cots[idx]
80
+ data = {
81
+ 'id': video_id,
82
+ 'caption': label,
83
+ 'caption_cot': cot
84
+ }
85
+
86
+ return data
87
+
88
+ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
89
+ try:
90
+ return self.sample(idx)
91
+ except Exception as e:
92
+ log.error(f'Error loading video {self.videos[idx]}: {e}')
93
+ return None
94
+
95
+ def __len__(self):
96
+ return len(self.labels)
97
+
98
+
99
+ # dataset = VGGSound(
100
+ # root="data/vggsound/video/test",
101
+ # tsv_path="data/vggsound/split_txt/temp.csv",
102
+ # sample_rate=44100,
103
+ # duration_sec=9.0,
104
+ # audio_samples=397312,
105
+ # start_row=0,
106
+ # end_row=None,
107
+ # save_dir="data/vggsound/video_latents_text/test"
108
+ # )
109
+ # dataset[0]
defaults.ini ADDED
@@ -0,0 +1,68 @@
1
+
2
+ [DEFAULTS]
3
+
4
+ #name of the run
5
+ name = stable_audio_tools
6
+
7
+ # the batch size
8
+ batch_size = 8
9
+ test_batch_size = 1
10
+
11
+ # predict ckpt directory
12
+ ckpt_dir = "ckpts"
13
+
14
+ # number of GPUs to use for training
15
+ num_gpus = 1
16
+
17
+ # number of nodes to use for training
18
+ num_nodes = 1
19
+
20
+ # Multi-GPU strategy for PyTorch Lightning
21
+ strategy = ""
22
+
23
+ # Precision to use for training
24
+ precision = "bf16-mixed"
25
+
26
+ # number of CPU workers for the DataLoader
27
+ num_workers = 8
28
+
29
+ # the random seed
30
+ seed = 42
31
+
32
+ # Batches for gradient accumulation
33
+ accum_batches = 1
34
+
35
+ # Number of steps between checkpoints
36
+ checkpoint_every = 2000
37
+
38
+ # trainer checkpoint file to restart training from
39
+ ckpt_path = ''
40
+
41
+ # model checkpoint file to start a new training run from
42
+ pretrained_ckpt_path = ''
43
+
44
+ # Checkpoint path for the pretransform model if needed
45
+ pretransform_ckpt_path = ''
46
+
47
+ # configuration file specifying model hyperparameters
48
+ model_config = ''
49
+
50
+ # configuration for datasets
51
+ dataset_config = ''
52
+
53
+ # directory to save the checkpoints in
54
+ save_dir = ''
55
+
56
+ # gradient_clip_val passed into PyTorch Lightning Trainer
57
+ gradient_clip_val = 0.0
58
+
59
+ # remove the weight norm from the pretransform model
60
+ remove_pretransform_weight_norm = ''
61
+
62
+ compile = False
63
+
64
+ repeat_num = 5
65
+
66
+ duration_sec = '9'
67
+
68
+ results_dir = 'results'
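+ # These values are the defaults consumed by prefigure's get_all_args(); they can typically be overridden per run from the command line.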
demo_test.csv ADDED
@@ -0,0 +1,17 @@
1
+ id,caption,caption_cot
2
+ W1nb2hIeDKc_000021,striking bowling,"Start with a background of ambient music, then add consistent sounds of bowling balls striking pins to emphasize the action. Include occasional subtle sounds of pins rattling and settling. Keep human voices or other noises minimal or absent for authenticity."
3
+ YYRdv32TJnc_000184,plastic bottle crushing,"Start with the sound of crushing plastic bottles, including crinkling and crunching. Add background noise resembling a factory environment, with machinery sounds. Incorporate subtle rustling and paper crinkling to suggest manipulation of plastic items."
4
+ Rp39_WnX5Fk_000380,"subway, metro, underground","Generate subway sounds including ambient station noise, train doors opening and closing, engine hum, wheels on tracks, and conductor announcements to produce an accurate underground train environment."
5
+ -KqXcm-I2zY_000087,playing tennis,"Generate sounds of tennis hitting a racket, the ball bouncing, and the girl’s grunts, with distant tennis court ambient noise. Avoid unrelated sounds like horses, basketballs, or indoor voices. Focus on clear tennis scene with realistic audio cues."
6
+ 0W_wPc-zV3I_000101,hedge trimmer running,"Generate the sound of a hedge trimmer running steadily, focusing on consistent motor noise and cutting sounds. Ensure minimal background noise or voices, capturing the primary sound of the trimmer in operation. Avoid including any chainsaw or unrelated sounds for accuracy."
7
+ _Betmm6FaWo_000096,writing on blackboard with chalk,"The audio should feature consistent sounds of chalk scratching the blackboard, including occasional voice instructions, encouragement, and children’s chatter, with background music playing softly or fading in/out to match the scene's atmosphere. The sounds of laughter and chatter should be lively but balanced with the primary chalk and voice sounds for clarity. Overall, the audio combines educational sounds with background activity to reflect a classroom or play environment."
8
+ xmTfE3F2huE_000854,chopping food,"Generate rhythmic chopping sounds consistent with meat or food being sliced, incorporating occasional rustling noises like a plastic bag. Avoid adding human voices or train sounds to match the correct audio descriptions, ensuring a focused, realistic kitchen chopping scene."
9
+ ZaUaqnLdg6k_000030,skateboarding,"Generate the audio featuring skateboarding sounds with wheels rolling on various surfaces, including ramps, rails, and sidewalks, capturing the sound of tricks and landings. Include subtle ambient background noise to suggest an outdoor setting, avoiding any human voices or singing. Focus on realistic skateboarding sounds, emphasizing wheel contact, impacts, and movement."
10
+ _ZC6yk5iE1I_000026,playing trumpet,"Generate a continuous trumpet sound with melodic variations, mimicking the sound of a person playing the trumpet idealy in a musical setting, ensuring clarity and realistic tone. Avoid extraneous noise or background sounds to reflect the focus on trumpet playing. The audio should resemble a skilled player producing expressive, melodious trumpet notes."
11
+ 55L7peYRB_Q_000120,using sewing machines,"Generate ambient sewing room sounds with consistent sewing machine hum, minimal background noise, and no human voices, focusing on characteristic machine noise to match the correct descriptions."
12
+ 4p8n4Zf-WMM_000190,lighting firecrackers,"Generate the sound of firecrackers lighting and exploding repeatedly, mixed with distant background sounds of crickets chirping. Incorporate occasional subtle echoes to mimic outdoor night ambiance, with no human voices present. End with a series of sharp cracker bursts to create a lively, festive atmosphere."
13
+ yLazKv68TeA_000078,people eating crisps,"Create audio with consistent crisp sounds of people eating chips, including crinkling paper and breathing. Include subtle chewing noises to match the activity. Avoid background music or voices for clarity."
14
+ _XyxrZDZ36E_000034,hammering nails,"Generate audio with consistent hammering sounds, featuring a rhythmic pattern of nails being driven into a surface, with occasional ambient background sounds like birds chirping and distant traffic. Avoid human voices, focusing on realistic hammer strikes and natural outdoor environment sounds. Ensure the hammering tone is steady and clear, matching the description of continuous nail hammering."
15
+ 1u1orBeV4xI_000428,ripping paper,"Start with a subtle tearing sound of paper being ripped, emphasizing a continuous, consistent noise. Ensure the sound has slight variations to mimic real tearing. No background or additional noises are needed, focusing solely on the tearing action."
16
+ JFG4YvcJ3bo_000228,playing bongo,"Generate a lively percussion track featuring rhythmic djembe beats, with a melodic guitar strumming softly in the background to enhance the musical atmosphere. Ensure no human voice is included, focusing on the percussive and guitar sounds. Maintain a natural, well-balanced stereo mix to highlight the instruments' interplay."
17
+ 1pViEqMXJH0_000030,printer printing,"Generate a continuous printer printing sound with periodic beeps, resembling typical printer noise, including paper movement and occasional beeps for realism. Add subtle ambient background noise, like faint room sounds, to enhance authenticity. Ensure the primary focus remains on the printing and beeping sounds, consistent with the correct audio descriptions."
examples/1.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8884c466292b46510c298a9ee88d8a584c86cb750afb558108c0850413e21e51
3
+ size 634576
examples/1_mute.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0ca4223b15583d8099d023bac3e86725bfd5cfbbad771ef67d31c1ad953bdc3
3
+ size 482981
examples/2.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8c7a3dd144c91d690b07892b30544c24000008873116451291f59553f3908a4
3
+ size 368050
examples/2_mute.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e09de17f3d3f631a5a7cd5dfb2b5b32a78fbcd3c1b90673dfa36ce798647f1e8
3
+ size 216098
examples/3.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dccddd67a954b12d34d481107c499460c69bebd92913b8f092724fcdf1c5baf
3
+ size 1716778
examples/3_mute.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:103ed8d5e4fbe8d6954dde463c2a997acd0e21c13895404706dbbaab39e2b086
3
+ size 1564981
examples/4.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3a399f94d5748b77497e28e1fca4e70c3900e1583cab4ecaefb242507b9fe1b
3
+ size 3642290
examples/4_mute.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9aa6d9ef7523cec4f5e0087d9f6c6b86efceb694f0433da5d07bdf57eea1247
3
+ size 3490447
examples/5.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:189a6cfcac18470a3f18285013f63f46c7b5996f31e0d3ecf617d9f7f91fdfeb
3
+ size 738718
examples/5_mute.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:103c2b1517d9cbecd81b394c93ebe36f166e95f635ceee280c28073233f08173
3
+ size 586982
extract_latents.py ADDED
@@ -0,0 +1,128 @@
1
+ import argparse
2
+ import os
3
+ import torch
4
+ import torch.distributed as dist
5
+ from torch.utils.data import DataLoader
6
+ from tqdm import tqdm
7
+ import logging
8
+ from data_utils.v2a_utils.vggsound_224_no_audio import VGGSound
9
+ from data_utils.v2a_utils.feature_utils_224 import FeaturesUtils
10
+ import torchaudio
11
+ from einops import rearrange
12
+ from torch.utils.data.dataloader import default_collate
13
+ import numpy as np
14
+ from huggingface_hub import hf_hub_download
15
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
16
+
17
+ def setup(rank, world_size):
18
+ dist.init_process_group("nccl", rank=rank, world_size=world_size)
19
+ torch.cuda.set_device(rank)
20
+
21
+ def cleanup():
22
+ dist.destroy_process_group()
23
+
24
+ def error_avoidance_collate(batch):
25
+ batch = list(filter(lambda x: x is not None, batch))
26
+ return default_collate(batch)
27
+
28
+ def main(args):
29
+
30
+ print(f"Using root: {args.root}, tsv_path: {args.tsv_path}, save_dir: {args.save_dir}")
31
+ dataset = VGGSound(
32
+ root=args.root,
33
+ tsv_path=args.tsv_path,
34
+ sample_rate=args.sample_rate,
35
+ duration_sec=args.duration_sec,
36
+ audio_samples=args.audio_samples,
37
+ start_row=args.start_row,
38
+ end_row=args.end_row,
39
+ save_dir=args.save_dir
40
+ )
41
+ save_dir = args.save_dir
42
+ os.makedirs(save_dir, exist_ok=True)
43
+
44
+ dataloader = DataLoader(dataset, batch_size=2, num_workers=8, drop_last=False,collate_fn=error_avoidance_collate)
45
+
46
+ print(f"Dataset length: {len(dataset)}")
47
+ feature_extractor = FeaturesUtils(
48
+ vae_ckpt=None,
49
+ vae_config=args.vae_config,
50
+ enable_conditions=True,
51
+ synchformer_ckpt=args.synchformer_ckpt
52
+ ).eval().cuda()
53
+
54
+ feature_extractor = feature_extractor
55
+
56
+ for i, data in enumerate(tqdm(dataloader, desc="Processing", unit="batch")):
57
+ ids = data['id']
58
+ with torch.no_grad():
59
+ # audio = data['audio'].cuda(rank, non_blocking=True)
60
+ output = {
61
+ 'caption': str(data['caption']),
62
+ 'caption_cot': str(data['caption_cot'])
63
+ }
64
+ print(output)
65
+
66
+ # latent = feature_extractor.module.encode_audio(audio)
67
+ # output['latent'] = latent.detach().cpu()
68
+
69
+ clip_video = data['clip_video'].cuda()
70
+ clip_features = feature_extractor.encode_video_with_clip(clip_video)
71
+ output['metaclip_features'] = clip_features.detach().cpu()
72
+
73
+ sync_video = data['sync_video'].cuda()
74
+ sync_features = feature_extractor.encode_video_with_sync(sync_video)
75
+ output['sync_features'] = sync_features.detach().cpu()
76
+
77
+ caption = data['caption']
78
+ metaclip_global_text_features, metaclip_text_features = feature_extractor.encode_text(caption)
79
+ output['metaclip_global_text_features'] = metaclip_global_text_features.detach().cpu()
80
+ output['metaclip_text_features'] = metaclip_text_features.detach().cpu()
81
+
82
+ caption_cot = data['caption_cot']
83
+ t5_features = feature_extractor.encode_t5_text(caption_cot)
84
+ output['t5_features'] = t5_features.detach().cpu()
85
+
86
+ for j in range(len(ids)):
87
+ sample_output = {
88
+ 'id': ids[j],
89
+ 'caption': output['caption'][j],
90
+ 'caption_cot': output['caption_cot'][j],
91
+ # 'latent': output['latent'][j],
92
+ 'metaclip_features': output['metaclip_features'][j],
93
+ 'sync_features': output['sync_features'][j],
94
+ 'metaclip_global_text_features': output['metaclip_global_text_features'][j],
95
+ 'metaclip_text_features': output['metaclip_text_features'][j],
96
+ 't5_features': output['t5_features'][j],
97
+ }
98
+ # torch.save(sample_output, f'{save_dir}/{ids[j]}.pth')
99
+ np.savez(f'{save_dir}/demo.npz', **sample_output)
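+ # NOTE: every sample is written to the same demo.npz, so only the last processed sample is kept (the per-id torch.save above is commented out)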
100
+
101
+ ## test the sync between videos and audios
102
+ # torchaudio.save(f'input_{i}.wav',data['audio'],sample_rate=44100)
103
+ # recon_audio = feature_extractor.decode_audio(latent)
104
+ # recon_audio = rearrange(recon_audio, "b d n -> d (b n)")
105
+ # id = data['id']
106
+ # torchaudio.save(f'recon_{i}.wav',recon_audio.cpu(),sample_rate=44100)
107
+ # os.system(f'ffmpeg -y -i dataset/vggsound/video/train/{id}.mp4 -i recon_{i}.wav -t 9 -map 0:v -map 1:a -c:v copy -c:a aac -strict experimental -shortest out_{i}.mp4')
108
+ # os.system(f'ffmpeg -y -i dataset/vggsound/video/train/{id}.mp4 -i input_{i}.wav -t 9 -map 0:v -map 1:a -c:v copy -c:a aac -strict experimental -shortest input_{i}.mp4')
109
+
110
+
111
+ if __name__ == '__main__':
112
+ parser = argparse.ArgumentParser(description='Extract Video Training Latents')
113
+ parser.add_argument('--root', type=str, default='videos', help='Root directory of the video dataset')
114
+ parser.add_argument('--tsv_path', type=str, default='cot_coarse/cot.csv', help='Path to the TSV file')
115
+ parser.add_argument('--save-dir', type=str, default='results', help='Save Directory')
116
+ parser.add_argument('--sample_rate', type=int, default=44100, help='Sample rate of the audio')
117
+ parser.add_argument('--duration_sec', type=float, default=9.0, help='Duration of the audio in seconds')
118
+ parser.add_argument('--vae_ckpt', type=str, default='ckpts/vae.ckpt', help='Path to the VAE checkpoint')
119
+ parser.add_argument('--vae_config', type=str, default='ThinkSound/configs/model_configs/stable_audio_2_0_vae.json', help='Path to the VAE configuration file')
120
+ parser.add_argument('--synchformer_ckpt', type=str, default='ckpts/synchformer_state_dict.pth', help='Path to the Synchformer checkpoint')
121
+ parser.add_argument('--start-row', type=int, default=0, help='start row')
122
+ parser.add_argument('--end-row', type=int, default=None, help='end row')
123
+
124
+ args = parser.parse_args()
125
+ args.audio_samples = int(args.sample_rate * args.duration_sec)
126
+
127
+ main(args=args)
128
+
predict.py ADDED
@@ -0,0 +1,214 @@
1
+ from prefigure.prefigure import get_all_args, push_wandb_config
2
+ import json
3
+ import os
4
+ import re
5
+ import torch
6
+ import torchaudio
7
+ # import pytorch_lightning as pl
8
+ import lightning as L
9
+ from lightning.pytorch.callbacks import Timer, ModelCheckpoint, BasePredictionWriter
10
+ from lightning.pytorch.callbacks import Callback
11
+ from lightning.pytorch.tuner import Tuner
12
+ from lightning.pytorch import seed_everything
13
+ import random
14
+ from datetime import datetime
15
+
16
+ from ThinkSound.data.datamodule import DataModule
17
+ from ThinkSound.models import create_model_from_config
18
+ from ThinkSound.models.utils import load_ckpt_state_dict, remove_weight_norm_from_model
19
+ from ThinkSound.training import create_training_wrapper_from_config, create_demo_callback_from_config
20
+ from ThinkSound.training.utils import copy_state_dict
21
+ from huggingface_hub import hf_hub_download
22
+
23
+ class ExceptionCallback(Callback):
24
+ def on_exception(self, trainer, module, err):
25
+ print(f'{type(err).__name__}: {err}')
26
+
27
+ class ModelConfigEmbedderCallback(Callback):
28
+ def __init__(self, model_config):
29
+ self.model_config = model_config
30
+
31
+ def on_save_checkpoint(self, trainer, pl_module, checkpoint):
32
+ checkpoint["model_config"] = self.model_config
33
+
34
+ class CustomWriter(BasePredictionWriter):
35
+
36
+ def __init__(self, output_dir, write_interval='batch', batch_size=32):
37
+ super().__init__(write_interval)
38
+ self.output_dir = output_dir
39
+ self.batch_size = batch_size
40
+
41
+ def write_on_batch_end(self, trainer, pl_module, predictions, batch_indices, batch, batch_idx, dataloader_idx):
42
+
43
+ audios = predictions
44
+ ids = [item['id'] for item in batch[1]]
45
+ current_date = datetime.now()
46
+
47
+ formatted_date = current_date.strftime('%m%d')
48
+ os.makedirs(os.path.join(self.output_dir, f'{formatted_date}_batch_size{self.batch_size}'),exist_ok=True)
49
+ for audio, id in zip(audios, ids):
50
+ save_path = os.path.join(self.output_dir, f'{formatted_date}_batch_size{self.batch_size}', f'{id}.wav')
51
+ torchaudio.save(save_path, audio, 44100)
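+ # each prediction is saved as a 44.1 kHz wav named {id}.wav inside a dated '{MMDD}_batch_size{N}' folder under output_dir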
52
+
53
+ def main():
54
+
55
+ args = get_all_args()
56
+
57
+
58
+ # args.pretransform_ckpt_path = hf_hub_download(
59
+ # repo_id="liuhuadai/ThinkSound",
60
+ # filename="vae.ckpt"
61
+ # )
62
+
63
+ args.pretransform_ckpt_path = "./ckpts/vae.ckpt"
64
+
65
+
66
+ seed = 10086
67
+
68
+ # Set a different seed for each process if using SLURM
69
+ if os.environ.get("SLURM_PROCID") is not None:
70
+ seed += int(os.environ.get("SLURM_PROCID"))
71
+
72
+ # random.seed(seed)
73
+ # torch.manual_seed(seed)
74
+ seed_everything(seed, workers=True)
75
+
76
+ #Get JSON config from args.model_config
77
+ with open(args.model_config) as f:
78
+ model_config = json.load(f)
79
+
80
+ with open(args.dataset_config) as f:
81
+ dataset_config = json.load(f)
82
+
83
+ for td in dataset_config["test_datasets"]:
84
+ td["path"] = args.results_dir
85
+
86
+ # train_dl = create_dataloader_from_config(
87
+ # dataset_config,
88
+ # batch_size=args.batch_size,
89
+ # num_workers=args.num_workers,
90
+ # sample_rate=model_config["sample_rate"],
91
+ # sample_size=model_config["sample_size"],
92
+ # audio_channels=model_config.get("audio_channels", 2),
93
+ # )
94
+
95
+
96
+ duration=(float)(args.duration_sec)
97
+
98
+ dm = DataModule(
99
+ dataset_config,
100
+ batch_size=args.batch_size,
101
+ test_batch_size=args.test_batch_size,
102
+ num_workers=args.num_workers,
103
+ sample_rate=model_config["sample_rate"],
104
+ sample_size=(float)(args.duration_sec) * model_config["sample_rate"],
105
+ audio_channels=model_config.get("audio_channels", 2),
106
+ latent_length=round(44100/64/32*duration),
107
+ )
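+ # latent_length above: 44100/(64*32) is roughly 21.5 latent frames per second of audio, i.e. 194 frames for the default 9 s clip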
108
+
109
+ model_config["sample_size"] = duration * model_config["sample_rate"]
110
+ model_config["model"]["diffusion"]["config"]["sync_seq_len"] = 24*int(duration)
111
+ model_config["model"]["diffusion"]["config"]["clip_seq_len"] = 8*int(duration)
112
+ model_config["model"]["diffusion"]["config"]["latent_seq_len"] = round(44100/64/32*duration)
113
+
114
+ model = create_model_from_config(model_config)
115
+
116
+ ## speed by torch.compile
117
+ if args.compile:
118
+ model = torch.compile(model)
119
+
120
+ if args.pretrained_ckpt_path:
121
+ copy_state_dict(model, load_ckpt_state_dict(args.pretrained_ckpt_path,prefix='diffusion.')) # autoencoder. diffusion.
122
+
123
+ if args.remove_pretransform_weight_norm == "pre_load":
124
+ remove_weight_norm_from_model(model.pretransform)
125
+ # import ipdb
126
+ # ipdb.set_trace()
127
+ if args.pretransform_ckpt_path:
128
+ load_vae_state = load_ckpt_state_dict(args.pretransform_ckpt_path, prefix='autoencoder.')
129
+ # new_state_dict = {k.replace("autoencoder.", ""): v for k, v in load_vae_state.items() if k.startswith("autoencoder.")}
130
+ model.pretransform.load_state_dict(load_vae_state)
131
+
132
+ # Remove weight_norm from the pretransform if specified
133
+ if args.remove_pretransform_weight_norm == "post_load":
134
+ remove_weight_norm_from_model(model.pretransform)
135
+
136
+ training_wrapper = create_training_wrapper_from_config(model_config, model)
137
+
138
+ # wandb_logger = L.pytorch.loggers.WandbLogger(project=args.name)
139
+ # wandb_logger.watch(training_wrapper)
140
+
141
+ exc_callback = ExceptionCallback()
142
+
143
+ # if args.save_dir and isinstance(wandb_logger.experiment.id, str):
144
+ # checkpoint_dir = os.path.join(args.save_dir, wandb_logger.experiment.project, wandb_logger.experiment.id, "checkpoints")
145
+ # else:
146
+ # checkpoint_dir = None
147
+
148
+ # ckpt_callback = ModelCheckpoint(every_n_train_steps=args.checkpoint_every, dirpath=checkpoint_dir, monitor='val_loss', mode='min', save_top_k=10)
149
+ save_model_config_callback = ModelConfigEmbedderCallback(model_config)
150
+ audio_dir = args.results_dir
151
+ pred_writer = CustomWriter(output_dir=audio_dir, write_interval="batch", batch_size=args.test_batch_size)
152
+ timer = Timer(duration="00:15:00:00")
153
+ demo_callback = create_demo_callback_from_config(model_config, demo_dl=dm)
154
+
155
+ #Combine args and config dicts
156
+ args_dict = vars(args)
157
+ args_dict.update({"model_config": model_config})
158
+ args_dict.update({"dataset_config": dataset_config})
159
+ # push_wandb_config(wandb_logger, args_dict)
160
+
161
+ #Set multi-GPU strategy if specified
162
+ if args.strategy:
163
+ if args.strategy == "deepspeed":
164
+ from pytorch_lightning.strategies import DeepSpeedStrategy
165
+ strategy = DeepSpeedStrategy(stage=2,
166
+ contiguous_gradients=True,
167
+ overlap_comm=True,
168
+ reduce_scatter=True,
169
+ reduce_bucket_size=5e8,
170
+ allgather_bucket_size=5e8,
171
+ load_full_weights=True
172
+ )
173
+ else:
174
+ strategy = args.strategy
175
+ else:
176
+ strategy = 'ddp_find_unused_parameters_true' if args.num_gpus > 1 else "auto"
177
+
178
+ trainer = L.Trainer(
179
+ devices=args.num_gpus,
180
+ accelerator="gpu",
181
+ num_nodes = args.num_nodes,
182
+ strategy=strategy,
183
+ precision=args.precision,
184
+ accumulate_grad_batches=args.accum_batches,
185
+ callbacks=[demo_callback, exc_callback, save_model_config_callback, timer, pred_writer],
186
+ log_every_n_steps=1,
187
+ max_epochs=1000,
188
+ default_root_dir=args.save_dir,
189
+ gradient_clip_val=args.gradient_clip_val,
190
+ reload_dataloaders_every_n_epochs = 0,
191
+ check_val_every_n_epoch=2,
192
+ )
193
+
194
+
195
+
196
+ # ckpt_path = hf_hub_download(
197
+ # repo_id="liuhuadai/ThinkSound",
198
+ # filename="thinksound.ckpt"
199
+ # )
200
+ ckpt_path = 'ckpts/thinksound.ckpt'
201
+
202
+
203
+
204
+ current_date = datetime.now()
205
+ formatted_date = current_date.strftime('%m%d')
206
+
207
+ audio_dir = f'{formatted_date}_step68k_batch_size'+str(args.test_batch_size)
208
+ metrics_path = os.path.join(args.ckpt_dir, 'audios',audio_dir,'cache',"output_metrics.json")
209
+ # if os.path.exists(metrics_path): continue
210
+
211
+ trainer.predict(training_wrapper, dm, return_predictions=False,ckpt_path=ckpt_path)
212
+
213
+ if __name__ == '__main__':
214
+ main()
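
For reference, the duration-dependent sequence lengths set in main() above are plain arithmetic on the clip length. A minimal sketch (illustrative only; the 9-second duration and the helper name are assumptions, not part of the commit):

    def sequence_lengths(duration_sec: float) -> dict:
        # Mirrors the assignments in main(): 44100 / 64 / 32 ~= 21.53 latent frames per second
        latent_rate = 44100 / 64 / 32
        return {
            "latent_seq_len": round(latent_rate * duration_sec),  # 194 for 9.0 s
            "clip_seq_len": 8 * int(duration_sec),                # 72 for 9.0 s
            "sync_seq_len": 24 * int(duration_sec),               # 216 for 9.0 s
        }

    print(sequence_lengths(9.0))
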
pyproject.toml ADDED
@@ -0,0 +1,3 @@
+ [build-system]
+ requires = ["setuptools"]
+ build-backend = "setuptools.build_meta"
requirements.txt ADDED
@@ -0,0 +1,254 @@
+ modelscope
+ absl-py==2.2.2
+ accelerate==1.6.0
+ aeiou==0.0.20
+ aiobotocore==2.22.0
+ aiofiles==23.2.1
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.11.18
+ aioitertools==0.12.0
+ aiosignal==1.3.2
+ alias-free-torch==0.0.6
+ annotated-types==0.7.0
+ antlr4-python3-runtime==4.9.3
+ anyio==4.9.0
+ appdirs==1.4.4
+ argbind==0.3.9
+ asttokens==3.0.0
+ async-timeout==5.0.1
+ attrs==25.3.0
+ audiobox_aesthetics==0.0.2
+ audioread==3.0.1
+ auraloss==0.4.0
+ av==14.4.0
+ bleach==6.2.0
+ bokeh==3.7.3
+ botocore==1.37.3
+ braceexpand==0.1.7
+ Brotli==1.1.0
+ certifi==2025.4.26
+ cffi==1.17.1
+ charset-normalizer==3.4.1
+ clean-fid==0.1.35
+ click==8.1.8
+ clip-anytorch==2.6.0
+ cloudpickle==3.1.1
+ colorcet==3.1.0
+ colorlog==6.9.0
+ configparser==7.2.0
+ contourpy==1.3.2
+ cycler==0.12.1
+ Cython==3.1.1
+ dctorch==0.1.2
+ decorator==4.4.2
+ decord==0.6.0
+ descript-audio-codec==1.0.0
+ docker-pycreds==0.4.0
+ docstring_parser==0.16
+ einops==0.7.0
+ einops-exts==0.0.4
+ ema-pytorch==0.2.3
+ encodec==0.1.1
+ exceptiongroup==1.2.2
+ executing==2.2.0
+ fastapi==0.115.12
+ fastcore==1.8.2
+ ffmpeg==1.4
+ ffmpy==0.5.0
+ filelock==3.18.0
+ fire==0.7.0
+ flatten-dict==0.4.2
+ fonttools==4.58.0
+ frozenlist==1.6.0
+ fsspec==2025.5.0
+ ftfy==6.3.1
+ future==1.0.0
+ fvcore==0.1.5.post20221221
+ gin-config==0.5.0
+ gitdb==4.0.12
+ GitPython==3.1.44
+ gradio==3.50.0
+ gradio_client==0.6.1
+ groovy==0.1.2
+ grpcio==1.71.0
+ h11==0.16.0
+ hf_xet
+ h5py==3.13.0
+ hjson==3.1.0
+ holoviews==1.20.2
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.30.2
+ hydra-colorlog==1.2.0
+ hydra-core==1.3.2
+ idna==3.10
+ imageio==2.37.0
+ imageio-ffmpeg==0.4.9
+ importlib-resources==5.12.0
+ importlib_metadata==8.7.0
+ iopath==0.1.10
+ ipython==8.36.0
+ jedi==0.19.2
+ Jinja2==3.1.0
+ jmespath==1.0.1
+ joblib==1.5.0
+ jsonmerge==1.9.2
+ jsonschema==4.23.0
+ jsonschema-specifications==2025.4.1
+ julius==0.2.7
+ k-diffusion==0.1.1
+ kiwisolver==1.4.8
+ kornia==0.8.1
+ kornia_rs==0.1.9
+ laion-clap==1.1.4
+ latex2mathml==3.77.0
+ lazy_loader==0.4
+ librosa==0.9.2
+ lightning==2.5.1.post0
+ lightning-utilities==0.14.3
+ linkify-it-py==2.0.3
+ llvmlite==0.43.0
+ local-attention==1.8.6
+ Markdown==3.8
+ markdown-it-py==3.0.0
+ markdown2==2.5.3
+ MarkupSafe==2.1.5
+ matplotlib==3.10.3
+ matplotlib-inline==0.1.7
+ mdit-py-plugins==0.4.2
+ mdurl==0.1.2
+ moviepy==1.0.3
+ mpmath==1.3.0
+ multidict==6.4.4
+ multiprocessing-logging==0.2.4
+ mutagen==1.47.0
+ narwhals==1.40.0
+ networkx==3.4.2
+ ninja==1.11.1.3
+ nitrous_ema==0.0.1
+ numba==0.60.0
+ numpy==1.23.5
+ omegaconf==2.3.0
+ open_clip_torch==2.32.0
+ openai==1.33.0
+ opencv-python==4.11.0.86
+ orjson==3.10.18
+ pafy==0.5.3.1
+ pandas==2.0.2
+ panel==1.7.0
+ param==2.2.0
+ parameterized==0.9.0
+ parso==0.8.4
+ pathtools==0.1.2
+ pedalboard==0.7.4
+ pexpect==4.9.0
+ pillow
+ platformdirs==4.3.8
+ plotly==6.1.1
+ pooch==1.8.2
+ prefigure==0.0.9
+ proglog==0.1.10
+ progressbar==2.5
+ prompt_toolkit==3.0.51
+ propcache==0.3.1
+ protobuf==3.19.6
+ psutil==7.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ py-cpuinfo==9.0.0
+ pycparser==2.22
+ pydantic==2.11.5
+ pydantic_core==2.33.2
+ pydub==0.25.1
+ Pygments==2.19.1
+ pyloudnorm==0.1.1
+ pynndescent==0.5.13
+ pynvml==12.0.0
+ pyparsing==3.2.3
+ pystoi==0.4.1
+ pysubs2==1.8.0
+ python-dateutil==2.9.0.post0
+ python-dotenv==1.0.1
+ python-multipart==0.0.20
+ pytorch-lightning==2.5.1.post0
+ pytorchvideo==0.1.5
+ pytz==2025.2
+ pyviz_comms==3.0.4
+ PyWavelets==1.4.1
+ PyYAML==6.0.2
+ randomname==0.2.1
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.3
+ resampy==0.4.3
+ rich==14.0.0
+ rpds-py==0.25.1
+ ruff==0.11.11
+ s3fs==2025.5.0
+ safehttpx==0.1.6
+ safetensors==0.5.3
+ scenedetect==0.6.3
+ scikit-image==0.24.0
+ scikit-learn==1.6.1
+ scipy==1.15.3
+ semantic-version==2.10.0
+ sentencepiece==0.1.99
+ sentry-sdk==2.29.1
+ setproctitle==1.3.6
+ shellingham==1.5.4
+ shortuuid==1.0.13
+ six==1.17.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ SoundFile==0.10.2
+ sox==1.3.0
+ stack-data==0.6.3
+ starlette==0.46.2
+ submitit==1.5.2
+ svgwrite==1.4.3
+ sympy==1.13.1
+ tabulate==0.9.0
+ tensorboard-data-server==0.7.2
+ termcolor==3.1.0
+ threadpoolctl==3.6.0
+ tifffile==2025.5.10
+ timm==1.0.15
+ tokenizers==0.19
+ tomlkit==0.13.2
+ torch==2.4.0
+ torch-stoi==0.2.3
+ torchaudio==2.4.0
+ torchdiffeq==0.2.5
+ torchlibrosa==0.1.0
+ torchmetrics==0.11.4
+ torchsde==0.2.6
+ torchvision==0.19.0
+ tornado==6.5.1
+ git+https://github.com/patrick-kidger/torchcubicspline.git
+ tqdm==4.67.1
+ traitlets==5.14.3
+ trampoline==0.1.2
+ transformers==4.43
+ triton==3.0.0
+ typer==0.15.4
+ typing-inspection==0.4.1
+ typing_extensions==4.12.2
+ tzdata==2025.2
+ uc-micro-py==1.0.3
+ umap-learn==0.5.7
+ urllib3==2.4.0
+ uvicorn==0.34.2
+ v-diffusion-pytorch==0.0.2
+ vector-quantize-pytorch==1.9.14
+ wcwidth==0.2.13
+ webdataset==0.2.48
+ webencodings==0.5.1
+ Werkzeug==3.1.3
+ wget==3.2
+ wrapt==1.17.2
+ x-transformers==1.26.6
+ xyzservices==2025.4.0
+ yacs==0.1.8
+ yarl==1.20.0
+ zipp==3.21.0
+ altair==5.5.0
scripts/demo.sh ADDED
@@ -0,0 +1,82 @@
+ #!/bin/bash
+
+ # Check number of arguments
+ if [ "$#" -ne 3 ]; then
+     echo "Usage: $0 <video_path> <title> <description>"
+     exit 1
+ fi
+
+ VIDEO_PATH="$1"
+ TITLE="$2"
+ DESCRIPTION="$3"
+
+ # Generate unique ID
+ UNIQUE_ID=$(uuidgen | cut -c 1-8)
+
+ # Create necessary directories
+ mkdir -p videos cot_coarse results
+
+ # Get video filename and extension
+ VIDEO_FILE=$(basename "$VIDEO_PATH")
+ VIDEO_EXT="${VIDEO_FILE##*.}"
+ VIDEO_ID="${VIDEO_FILE%.*}"
+ TEMP_VIDEO_PATH="videos/${VIDEO_ID}_${UNIQUE_ID}.mp4"
+
+ # Convert video to MP4 format if needed
+ if [ "${VIDEO_EXT,,}" != "mp4" ]; then
+     echo "⏳ Converting video to MP4 format..."
+     ffmpeg -y -i "$VIDEO_PATH" -c:v libx264 -preset fast -c:a aac -strict experimental "$TEMP_VIDEO_PATH" >/dev/null 2>&1
+     if [ $? -ne 0 ]; then
+         echo "❌ Video conversion failed"
+         exit 2
+     fi
+ else
+     cp "$VIDEO_PATH" "$TEMP_VIDEO_PATH"
+ fi
+
+ # Calculate video duration
+ DURATION=$(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "$TEMP_VIDEO_PATH")
+ DURATION_SEC=${DURATION%.*}
+ echo "Duration is: $DURATION_SEC"
+
+ # Create cot.csv file
+ CAPTION_COT=$(echo "$DESCRIPTION" | tr '"' "'")
+ CSV_PATH="cot_coarse/cot.csv"
+ echo "id,caption,caption_cot" > "$CSV_PATH"
+ echo "${VIDEO_ID}_${UNIQUE_ID},$TITLE,\"$CAPTION_COT\"" >> "$CSV_PATH"
+
+ # Run feature extraction
+ echo "⏳ Extracting features..."
+ python extract_latents.py --duration_sec "$DURATION_SEC" 2>&1
+ if [ $? -ne 0 ]; then
+     echo "❌ Feature extraction failed"
+     rm -f "$TEMP_VIDEO_PATH"
+     exit 3
+ fi
+
+ # Run inference
+ echo "⏳ Running model inference..."
+ bash scripts/infer.sh --duration-sec "$DURATION_SEC" 2>&1
+ if [ $? -ne 0 ]; then
+     echo "❌ Inference failed"
+     rm -f "$TEMP_VIDEO_PATH"
+     exit 4
+ fi
+
+ # Get generated audio file
+ CURRENT_DATE=$(date +"%m%d")
+ AUDIO_PATH="results/${CURRENT_DATE}_batch_size1/demo.wav"
+
+ # Check if audio file exists
+ if [ ! -f "$AUDIO_PATH" ]; then
+     echo "❌ Generated audio file not found"
+     rm -f "$TEMP_VIDEO_PATH"
+     exit 5
+ fi
+
+ # Clean up temporary video file
+ rm -f "$TEMP_VIDEO_PATH"
+
+ echo "✅ Audio generated successfully!"
+ echo "Audio file path: $AUDIO_PATH"
scripts/infer.sh ADDED
@@ -0,0 +1,76 @@
+ #!/bin/bash
+
+ # Variable definitions
+ ckpt_dir="ckpts/thinksound.ckpt"
+ test_batch_size=1
+ dataset_config="ThinkSound/configs/multimodal_dataset_demo.json"
+ model_config="ThinkSound/configs/model_configs/thinksound.json"
+ pretransform_ckpt_path="ckpts/vae.ckpt"
+ # Default values
+ debug_mode="true"
+ node_rank=0
+
+ result_path="results"
+
+ while [[ $# -gt 0 ]]; do
+     case "$1" in
+         --duration-sec)
+             if [[ -n "$2" && "$2" != --* ]]; then
+                 duration_sec="$2"
+                 shift 2
+             else
+                 echo "❌ Argument --duration-sec requires a value"
+                 exit 1
+             fi
+             ;;
+         --result-path)
+             if [[ -n "$2" && "$2" != --* ]]; then
+                 result_path="$2"
+                 shift 2
+             else
+                 echo "❌ Argument --result-path requires a path"
+                 exit 1
+             fi
+             ;;
+         *)
+             echo "❌ Unknown argument: $1"
+             exit 1
+             ;;
+     esac
+ done
+
+ export NODE_RANK=$node_rank
+ export RANK=$node_rank
+
+ num_gpus=1
+ num_nodes=1
+
+ export WORLD_SIZE=$((num_gpus * num_nodes))
+ # Print the configuration
+ echo "Inference Configuration:"
+ echo "Checkpoint Directory: $ckpt_dir"
+ echo "Dataset Config: $dataset_config"
+ echo "Model Config: $model_config"
+ echo "Pretransform Checkpoint Path: $pretransform_ckpt_path"
+ echo "Num GPUs: $num_gpus"
+ echo "Num Nodes: $num_nodes"
+ echo "Test Batch Size: $test_batch_size"
+ echo "Num Workers: 32"
+ echo "Node Rank: $node_rank"
+ echo "WORLD SIZE: $WORLD_SIZE"
+
+ python predict.py \
+     --dataset-config "$dataset_config" \
+     --model-config "$model_config" \
+     --ckpt-dir "$ckpt_dir" \
+     --pretransform-ckpt-path "$pretransform_ckpt_path" \
+     --checkpoint-every 2000 \
+     --num-gpus "$num_gpus" \
+     --num-nodes "$num_nodes" \
+     --batch-size 1 \
+     --test-batch-size $test_batch_size \
+     --num-workers 32 \
+     --duration-sec $duration_sec \
+     --results-dir $result_path
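
Note that the generated audio lands under the --results-dir passed to predict.py above: CustomWriter names the output folder from the current date and the test batch size. A minimal sketch of reconstructing that path for this script's defaults (results directory "results", test batch size 1; the clip id is a placeholder):

    import os
    from datetime import datetime

    results_dir = "results"  # default result_path in this script
    batch_folder = f"{datetime.now().strftime('%m%d')}_batch_size1"
    clip_id = "example_clip"  # placeholder for the id written to cot_coarse/cot.csv
    print(os.path.join(results_dir, batch_folder, f"{clip_id}.wav"))
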
setup.py ADDED
@@ -0,0 +1,44 @@
+ from setuptools import setup, find_packages
+
+ setup(
+     name='thinksound',
+     version='0.0.16',
+     url='https://github.com/liuhuadai/thinksound.git',
+     author='liuhuadai',
+     description='a unified Any2Audio generation framework guided by Chain-of-Thought (CoT) reasoning',
+     packages=find_packages(),
+     install_requires=[
+         'aeiou==0.0.20',
+         'alias-free-torch==0.0.6',
+         'auraloss==0.4.0',
+         'descript-audio-codec==1.0.0',
+         'einops==0.7.0',
+         'einops-exts==0.0.4',
+         'ema-pytorch==0.2.3',
+         'encodec==0.1.1',
+         # 'gradio>=3.42.0',
+         'huggingface_hub',
+         'importlib-resources==5.12.0',
+         'k-diffusion==0.1.1',
+         'laion-clap==1.1.4',
+         'local-attention==1.8.6',
+         'pandas==2.0.2',
+         'pedalboard==0.7.4',
+         'prefigure==0.0.9',
+         'pytorch_lightning==2.1.0',
+         'PyWavelets==1.4.1',
+         'safetensors',
+         'sentencepiece==0.1.99',
+         's3fs',
+         'torch>=2.0.1',
+         'torchaudio>=2.0.2',
+         'torchmetrics==0.11.4',
+         'tqdm',
+         'transformers',
+         'v-diffusion-pytorch==0.0.2',
+         'vector-quantize-pytorch==1.9.14',
+         'wandb==0.15.4',
+         'webdataset==0.2.48',
+         'x-transformers<1.27.0'
+     ],
+ )