liuhuadai committed
Commit 052cf68 · Parent: 70bc476

support cot

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. {think_sound → ThinkSound}/__init__.py +0 -0
  2. {think_sound/configs/model_configs/autoencoders → ThinkSound/configs/model_configs}/stable_audio_2_0_vae.json +0 -0
  3. think_sound/configs/model_configs/vt2audio/latent_clip_224_text_sync_mmdit_flow_logit_t5_kernel_size3.json → ThinkSound/configs/model_configs/thinksound.json +1 -1
  4. ThinkSound/configs/multimodal_dataset_demo.json +53 -0
  5. {data_utils → ThinkSound/data}/__init__.py +0 -0
  6. {think_sound → ThinkSound}/data/datamodule.py +4 -2
  7. {think_sound → ThinkSound}/data/dataset.py +6 -8
  8. {think_sound → ThinkSound}/data/utils.py +0 -0
  9. {think_sound/data → ThinkSound/inference}/__init__.py +0 -0
  10. {think_sound → ThinkSound}/inference/generation.py +0 -0
  11. {think_sound → ThinkSound}/inference/sampling.py +0 -0
  12. {think_sound → ThinkSound}/inference/utils.py +0 -0
  13. {think_sound → ThinkSound}/models/__init__.py +0 -0
  14. {think_sound → ThinkSound}/models/autoencoders.py +0 -0
  15. {think_sound → ThinkSound}/models/blocks.py +92 -1
  16. {think_sound → ThinkSound}/models/bottleneck.py +0 -0
  17. {think_sound → ThinkSound}/models/codebook_patterns.py +0 -0
  18. {think_sound → ThinkSound}/models/conditioners.py +0 -1
  19. {think_sound → ThinkSound}/models/diffusion.py +1 -3
  20. {think_sound → ThinkSound}/models/dit.py +0 -0
  21. {think_sound/models/mmmodules/model → ThinkSound/models}/embeddings.py +36 -0
  22. {think_sound → ThinkSound}/models/factory.py +0 -0
  23. {think_sound → ThinkSound}/models/local_attention.py +0 -0
  24. {think_sound → ThinkSound}/models/mmdit.py +56 -9
  25. {think_sound → ThinkSound}/models/pretrained.py +0 -0
  26. {think_sound → ThinkSound}/models/pretransforms.py +0 -0
  27. {think_sound → ThinkSound}/models/transformer.py +0 -0
  28. {think_sound/models/mmmodules/model → ThinkSound/models}/transformer_layers.py +2 -2
  29. {think_sound → ThinkSound}/models/utils.py +0 -0
  30. {think_sound → ThinkSound}/training/__init__.py +0 -0
  31. {think_sound → ThinkSound}/training/autoencoders.py +0 -1
  32. {think_sound → ThinkSound}/training/diffusion.py +1 -948
  33. {think_sound → ThinkSound}/training/factory.py +0 -0
  34. {think_sound → ThinkSound}/training/losses/__init__.py +0 -0
  35. {think_sound → ThinkSound}/training/losses/auraloss.py +0 -0
  36. {think_sound → ThinkSound}/training/losses/losses.py +0 -0
  37. {think_sound → ThinkSound}/training/utils.py +0 -0
  38. app.py +50 -59
  39. cot_vgg_demo_caption.txt +1 -0
  40. data_utils/__pycache__/__init__.cpython-310.pyc +0 -0
  41. data_utils/__pycache__/utils.cpython-310.pyc +0 -0
  42. data_utils/__pycache__/utils.cpython-39.pyc +0 -0
  43. data_utils/ext/synchformer/__pycache__/__init__.cpython-310.pyc +0 -0
  44. data_utils/ext/synchformer/__pycache__/__init__.cpython-39.pyc +0 -0
  45. data_utils/ext/synchformer/__pycache__/motionformer.cpython-310.pyc +0 -0
  46. data_utils/ext/synchformer/__pycache__/motionformer.cpython-39.pyc +0 -0
  47. data_utils/ext/synchformer/__pycache__/synchformer.cpython-310.pyc +0 -0
  48. data_utils/ext/synchformer/__pycache__/synchformer.cpython-39.pyc +0 -0
  49. data_utils/ext/synchformer/__pycache__/utils.cpython-310.pyc +0 -0
  50. data_utils/ext/synchformer/__pycache__/utils.cpython-39.pyc +0 -0
{think_sound → ThinkSound}/__init__.py RENAMED
File without changes
{think_sound/configs/model_configs/autoencoders → ThinkSound/configs/model_configs}/stable_audio_2_0_vae.json RENAMED
File without changes
think_sound/configs/model_configs/vt2audio/latent_clip_224_text_sync_mmdit_flow_logit_t5_kernel_size3.json → ThinkSound/configs/model_configs/thinksound.json RENAMED
@@ -85,7 +85,7 @@
  "clip_dim":1024,
  "sync_dim":768,
  "text_dim":2048,
- "hidden_dim":1024 ,
+ "hidden_dim":1024,
  "depth":21,
  "fused_depth":14,
  "num_heads":16,
ThinkSound/configs/multimodal_dataset_demo.json ADDED
@@ -0,0 +1,53 @@
+{
+    "dataset_type": "multimodal_dir",
+    "video_datasets": [
+        {
+            "id": "vggsound",
+            "path": "dataset/vggsound/video_latents_t5_clip_npz/train",
+            "split_path": "dataset/vggsound/split_txt/train_cot.txt"
+        }
+    ],
+    "audio_datasets": [
+        {
+            "id": "audiostock",
+            "path": "dataset/Laion-Audio-630k/audiostock_latents_npz",
+            "split_path": "dataset/Laion-Audio-630k/split_txt/cot_audiostock_1.txt"
+        },
+        {
+            "id": "freesound_no_overlap",
+            "path": "dataset/Laion-Audio-630k/freesound_no_overlap_latents_npz",
+            "split_path": "dataset/Laion-Audio-630k/split_txt/cot_freesound.txt"
+        },
+        {
+            "id": "audioset_sl",
+            "path": "dataset/wavcaps/audioset_sl_latents_npz",
+            "split_path": "dataset/wavcaps/split_txt/cot_audio_sl_1.txt"
+        },
+        {
+            "id": "audiocaps",
+            "path": "dataset/1_audiocaps/audiocaps_latents_npz",
+            "split_path": "dataset/1_audiocaps/split_txt/train_cot.txt"
+        },
+        {
+            "id": "bbc",
+            "path": "dataset/Laion-Audio-630k/bbc_latents_npz",
+            "split_path": "dataset/Laion-Audio-630k/split_txt/cot_bbc_1.txt"
+        }
+    ],
+    "val_datasets": [
+        {
+            "id": "vggsound",
+            "path": "dataset/vggsound/video_latents_t5_clip_npz/test",
+            "split_path": "dataset/vggsound/split_txt/test_cot.txt"
+        }
+    ],
+    "test_datasets": [
+        {
+            "id": "vggsound",
+            "path": "cot_coarse",
+            "split_path": "cot_vgg_demo_caption.txt"
+        }
+    ],
+    "random_crop": true,
+    "input_type": "prompt"
+}
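Note (illustrative sketch, not part of this commit): each entry in this demo config points a dataset id at a directory of precomputed latent .npz files plus a chain-of-thought split list. Assuming the repository root is on the Python path, loading and inspecting it is just standard `json`:

```python
import json

# Read the demo multimodal dataset config added in this commit.
with open("ThinkSound/configs/multimodal_dataset_demo.json") as f:
    cfg = json.load(f)

# Every train/val entry pairs a latent directory with a CoT split file.
for entry in cfg["video_datasets"] + cfg["audio_datasets"] + cfg["val_datasets"]:
    print(f'{entry["id"]}: {entry["path"]} (splits: {entry["split_path"]})')
```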
{data_utils → ThinkSound/data}/__init__.py RENAMED
File without changes
{think_sound → ThinkSound}/data/datamodule.py RENAMED
@@ -33,13 +33,14 @@ def get_configs(audio_configs):
     return configs
 
 class DataModule(L.LightningDataModule):
-    def __init__(self, dataset_config, batch_size, test_batch_size, sample_size, sample_rate, audio_channels=2, num_workers=4,repeat_num=5):
+    def __init__(self, dataset_config, batch_size, test_batch_size, sample_size, sample_rate, audio_channels=2, num_workers=4,repeat_num=5,latent_length=194):
         super().__init__()
         dataset_type = dataset_config.get("dataset_type", None)
         self.batch_size = batch_size
         self.num_workers = num_workers
         self.test_batch_size = test_batch_size
         self.repeat_num = repeat_num
+        self.latent_length = latent_length
         assert dataset_type is not None, "Dataset type must be specified in dataset config"
 
         if audio_channels == 1:
@@ -140,7 +141,8 @@ class DataModule(L.LightningDataModule):
                 random_crop=random_crop,
                 input_type=self.input_type,
                 fps=self.input_type,
-                force_channels=self.force_channels
+                force_channels=self.force_channels,
+                latent_length=self.latent_length
             )
 
         if stage == 'fit':
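Note (illustrative sketch, not part of this commit): the new `latent_length` argument is stored on the DataModule and forwarded to the video dataset below; only the constructor signature comes from this diff, the batch/sample values are assumptions:

```python
import json
from ThinkSound.data.datamodule import DataModule

with open("ThinkSound/configs/multimodal_dataset_demo.json") as f:
    dataset_config = json.load(f)

dm = DataModule(
    dataset_config,
    batch_size=8,          # assumed value
    test_batch_size=1,     # assumed value
    sample_size=441000,    # assumed; depends on the audio/VAE setup
    sample_rate=44100,     # assumed
    latent_length=194,     # new in this commit; defaults to 194
)
dm.setup("fit")            # standard LightningDataModule hook, per the `stage == 'fit'` branch above
```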
{think_sound → ThinkSound}/data/dataset.py RENAMED
@@ -342,8 +342,7 @@ class LatentDataset(torch.utils.data.Dataset):
         info = {}
         audio, video = self.load_file(audio_filename, info)
         info["path"] = audio_filename
-        assert audio.shape == (64,194), f'{audio.shape} input error, id: {id}'
-        assert video.shape == (72,1024), f'{video.shape} input error, id: {id}'
+
         info['id'] = Path(audio_filename).stem
         for root_path in self.root_paths:
             if root_path in audio_filename:
@@ -434,8 +433,7 @@ class AudioDataset(torch.utils.data.Dataset):
         info = {}
         audio, video = self.load_file(audio_filename, info)
         info["path"] = audio_filename
-        assert audio.shape == (64,194), f'{audio.shape} input error, id: {id}'
-        assert video.shape == (72,1024), f'{video.shape} input error, id: {id}'
+
         info['id'] = Path(audio_filename).stem
         for root_path in self.root_paths:
             if root_path in audio_filename:
@@ -454,8 +452,9 @@ class VideoDataset(torch.utils.data.Dataset):
         input_type="prompt",
         fps=4,
         force_channels="stereo",
+        latent_length=194, # default latent length for video dataset
     ):
-
+        self.latent_length = latent_length
         super().__init__()
         self.filenames = []
         print(f'configs: {configs[0]}')
@@ -523,7 +522,7 @@
             if 'latent' in data.keys():
                 audio = data['latent']
             else:
-                audio = torch.zeros(64,194)
+                audio = torch.zeros(64,self.latent_length)
             info['video_exist'] = self.video_exist
         # except:
         #     print(f'error load file: {filename}')
@@ -540,8 +539,7 @@
         info = {}
         audio, video = self.load_file(audio_filename, info)
         info["path"] = audio_filename
-        assert audio is None or audio.shape == (64,194), f'{audio.shape} input error, id: {id}'
-        assert video.shape == (72,1024), f'{video.shape} input error, id: {id}'
+
         info['id'] = Path(audio_filename).stem
         for root_path in self.root_paths:
             if root_path in audio_filename:
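Note (illustrative sketch, not part of this commit): the hard-coded `(64, 194)` shape asserts are removed, and when a sample has no precomputed `latent`, VideoDataset now falls back to a zero tensor sized by the configurable latent length rather than a fixed 194 frames:

```python
import torch

latent_length = 194                      # default; now configurable via DataModule/VideoDataset
audio = torch.zeros(64, latent_length)   # placeholder latent: 64 channels x latent_length frames
print(audio.shape)                       # torch.Size([64, 194])
```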
{think_sound → ThinkSound}/data/utils.py RENAMED
File without changes
{think_sound/data → ThinkSound/inference}/__init__.py RENAMED
File without changes
{think_sound → ThinkSound}/inference/generation.py RENAMED
File without changes
{think_sound → ThinkSound}/inference/sampling.py RENAMED
File without changes
{think_sound → ThinkSound}/inference/utils.py RENAMED
File without changes
{think_sound → ThinkSound}/models/__init__.py RENAMED
File without changes
{think_sound → ThinkSound}/models/autoencoders.py RENAMED
File without changes
{think_sound → ThinkSound}/models/blocks.py RENAMED
@@ -336,4 +336,95 @@ class SnakeBeta(nn.Module):
         beta = torch.exp(beta)
         x = snake_beta(x, alpha, beta)
 
-        return x
+        return x
+
+
+class ChannelLastConv1d(nn.Conv1d):
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x.permute(0, 2, 1)
+        x = super().forward(x)
+        x = x.permute(0, 2, 1)
+        return x
+
+
+# https://github.com/Stability-AI/sd3-ref
+class MLP(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        multiple_of: int = 256,
+    ):
+        """
+        Initialize the FeedForward module.
+
+        Args:
+            dim (int): Input dimension.
+            hidden_dim (int): Hidden dimension of the feedforward layer.
+            multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
+
+        Attributes:
+            w1 (ColumnParallelLinear): Linear transformation for the first layer.
+            w2 (RowParallelLinear): Linear transformation for the second layer.
+            w3 (ColumnParallelLinear): Linear transformation for the third layer.
+
+        """
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+
+    def forward(self, x):
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+
+class ConvMLP(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        multiple_of: int = 256,
+        kernel_size: int = 3,
+        padding: int = 1,
+    ):
+        """
+        Initialize the FeedForward module.
+
+        Args:
+            dim (int): Input dimension.
+            hidden_dim (int): Hidden dimension of the feedforward layer.
+            multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
+
+        Attributes:
+            w1 (ColumnParallelLinear): Linear transformation for the first layer.
+            w2 (RowParallelLinear): Linear transformation for the second layer.
+            w3 (ColumnParallelLinear): Linear transformation for the third layer.
+
+        """
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+        self.w1 = ChannelLastConv1d(dim,
+                                    hidden_dim,
+                                    bias=False,
+                                    kernel_size=kernel_size,
+                                    padding=padding)
+        self.w2 = ChannelLastConv1d(hidden_dim,
+                                    dim,
+                                    bias=False,
+                                    kernel_size=kernel_size,
+                                    padding=padding)
+        self.w3 = ChannelLastConv1d(dim,
+                                    hidden_dim,
+                                    bias=False,
+                                    kernel_size=kernel_size,
+                                    padding=padding)
+
+    def forward(self, x):
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
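Note (illustrative sketch, not part of this commit): these blocks were previously imported from the `mmmodules` sub-package and now live in `ThinkSound/models/blocks.py`. `ChannelLastConv1d` and `ConvMLP` accept channel-last `(batch, time, dim)` tensors, and `MLP` is the gated (SwiGLU-style) feed-forward from the SD3 reference. A quick shape check under assumed sizes:

```python
import torch
from ThinkSound.models.blocks import MLP, ChannelLastConv1d, ConvMLP

x = torch.randn(2, 194, 1024)  # (batch, time, dim); sizes here are assumptions

mlp = MLP(dim=1024, hidden_dim=4 * 1024)
conv_mlp = ConvMLP(dim=1024, hidden_dim=4 * 1024, kernel_size=3, padding=1)
conv = ChannelLastConv1d(1024, 1024, kernel_size=3, padding=1)

# All three keep the channel-last layout, so the caller never needs to permute.
print(mlp(x).shape, conv_mlp(x).shape, conv(x).shape)  # each torch.Size([2, 194, 1024])
```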
{think_sound → ThinkSound}/models/bottleneck.py RENAMED
File without changes
{think_sound → ThinkSound}/models/codebook_patterns.py RENAMED
File without changes
{think_sound → ThinkSound}/models/conditioners.py RENAMED
@@ -7,7 +7,6 @@ import typing as tp
 import gc
 from typing import Literal, Optional
 import os
-from .adp import NumberEmbedder
 from ..inference.utils import set_audio_channels
 from .factory import create_pretransform_from_config
 from .pretransforms import Pretransform
{think_sound → ThinkSound}/models/diffusion.py RENAMED
@@ -7,14 +7,12 @@ import typing as tp
 
 from .blocks import ResConvBlock, FourierFeatures, Upsample1d, Upsample1d_2, Downsample1d, Downsample1d_2, SelfAttention1d, SkipBlock, expand_to_planes
 from .conditioners import MultiConditioner, create_multi_conditioner_from_conditioning_config
-from .dit import DiffusionTransformer
+# from .dit import DiffusionTransformer
 from .mmdit import MMAudio
 from .factory import create_pretransform_from_config
 from .pretransforms import Pretransform
 from ..inference.generation import generate_diffusion_cond
 
-from .adp import UNetCFG1d, UNet1d
-
 from time import time
 
 class Profiler:
{think_sound → ThinkSound}/models/dit.py RENAMED
File without changes
{think_sound/models/mmmodules/model → ThinkSound/models}/embeddings.py RENAMED
@@ -3,6 +3,42 @@ import torch.nn as nn
 
 # https://github.com/facebookresearch/DiT
 
+from typing import Union
+
+import torch
+from einops import rearrange
+from torch import Tensor
+
+# Ref: https://github.com/black-forest-labs/flux/blob/main/src/flux/math.py
+# Ref: https://github.com/lucidrains/rotary-embedding-torch
+
+
+def compute_rope_rotations(length: int,
+                           dim: int,
+                           theta: int,
+                           *,
+                           freq_scaling: float = 1.0,
+                           device: Union[torch.device, str] = 'cpu') -> Tensor:
+    assert dim % 2 == 0
+
+    with torch.amp.autocast(device_type='cuda', enabled=False):
+        pos = torch.arange(length, dtype=torch.float32, device=device)
+        freqs = 1.0 / (theta**(torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim))
+        freqs *= freq_scaling
+
+        rot = torch.einsum('..., f -> ... f', pos, freqs)
+        rot = torch.stack([torch.cos(rot), -torch.sin(rot), torch.sin(rot), torch.cos(rot)], dim=-1)
+        rot = rearrange(rot, 'n d (i j) -> 1 n d i j', i=2, j=2)
+        return rot
+
+
+def apply_rope(x: Tensor, rot: Tensor) -> tuple[Tensor, Tensor]:
+    with torch.amp.autocast(device_type='cuda', enabled=False):
+        _x = x.float()
+        _x = _x.view(*_x.shape[:-1], -1, 1, 2)
+        x_out = rot[..., 0] * _x[..., 0] + rot[..., 1] * _x[..., 1]
+        return x_out.reshape(*x.shape).to(dtype=x.dtype)
+
 
 class TimestepEmbedder(nn.Module):
     """
{think_sound → ThinkSound}/models/factory.py RENAMED
File without changes
{think_sound → ThinkSound}/models/local_attention.py RENAMED
File without changes
{think_sound → ThinkSound}/models/mmdit.py RENAMED
@@ -6,10 +6,10 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import sys
-from .mmmodules.ext.rotary_embeddings import compute_rope_rotations
-from .mmmodules.model.embeddings import TimestepEmbedder
-from .mmmodules.model.low_level import MLP, ChannelLastConv1d, ConvMLP
-from .mmmodules.model.transformer_layers import (FinalBlock, JointBlock, MMDitSingleBlock)
+from .embeddings import compute_rope_rotations
+from .embeddings import TimestepEmbedder
+from .blocks import MLP, ChannelLastConv1d, ConvMLP
+from .transformer_layers import (FinalBlock, JointBlock, MMDitSingleBlock)
 from .utils import resample
 
 log = logging.getLogger()
@@ -24,7 +24,6 @@ class PreprocessedConditions:
     text_f_c: torch.Tensor
 
 
-# Partially from https://github.com/facebookresearch/DiT
 class MMAudio(nn.Module):
 
     def __init__(self,
@@ -94,7 +93,6 @@ class MMAudio(nn.Module):
             nn.Linear(hidden_dim * 4, hidden_dim, bias=False),
             nn.Sigmoid()
         )
-        # Initialize the last layer's weights to zero to encourage uniform fusion at initialization
         nn.init.zeros_(self.gated_mlp_v[3].weight)
         nn.init.zeros_(self.gated_mlp_t[3].weight)
         if v2:
@@ -441,9 +439,9 @@
             # clip_f = torch.cat([clip_f,empty_clip_f], dim=0)
             # sync_f = torch.cat([sync_f,empty_sync_f], dim=0)
             # text_f = torch.cat([text_f,empty_text_f], dim=0)
-            clip_f = torch.cat([clip_f,self.get_empty_clip_sequence(bsz)], dim=0)
-            sync_f = torch.cat([sync_f,self.get_empty_sync_sequence(bsz)], dim=0)
-            text_f = torch.cat([text_f,self.get_empty_string_sequence(bsz)], dim=0)
+            clip_f = safe_cat(clip_f,self.get_empty_clip_sequence(bsz), dim=0, match_dim=1)
+            sync_f = safe_cat(sync_f,self.get_empty_sync_sequence(bsz), dim=0, match_dim=1)
+            text_f = safe_cat(text_f,self.get_empty_string_sequence(bsz), dim=0, match_dim=1)
             if t5_features is not None:
                 empty_t5_features = torch.zeros_like(t5_features, device=latent.device)
                 # t5_features = torch.cat([t5_features,empty_t5_features], dim=0)
@@ -529,3 +527,52 @@
     def sync_seq_len(self) -> int:
         return self._sync_seq_len
 
+
+def truncate_to_target(tensor, target_size, dim=1):
+    current_size = tensor.size(dim)
+    if current_size > target_size:
+        slices = [slice(None)] * tensor.dim()
+        slices[dim] = slice(0, target_size)
+        return tensor[slices]
+    return tensor
+
+
+def pad_to_target(tensor, target_size, dim=1, pad_value=0):
+    current_size = tensor.size(dim)
+    if current_size < target_size:
+        pad_size = target_size - current_size
+
+        pad_config = [0, 0] * tensor.dim()
+        pad_index = 2 * (tensor.dim() - dim - 1) + 1
+        pad_config[pad_index] = pad_size
+
+        return torch.nn.functional.pad(tensor, pad_config, value=pad_value)
+    return tensor
+
+
+def safe_cat(tensor1, tensor2, dim=0, match_dim=1):
+    target_size = tensor2.size(match_dim)
+
+    if tensor1.size(match_dim) > target_size:
+        tensor1 = truncate_to_target(tensor1, target_size, match_dim)
+    else:
+        tensor1 = pad_to_target(tensor1, target_size, match_dim)
+
+    return torch.cat([tensor1, tensor2], dim=dim)
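Note (illustrative sketch, not part of this commit): `safe_cat` replaces the plain `torch.cat` when the learned empty (classifier-free-guidance) sequences are appended, truncating or zero-padding the real features along `match_dim` so both halves have the same sequence length before concatenation. Toy example, assuming the repo's dependencies import cleanly:

```python
import torch
from ThinkSound.models.mmdit import safe_cat  # module-level helper added in this commit

real = torch.randn(2, 70, 1024)    # e.g. clip features that came in slightly short
empty = torch.randn(2, 72, 1024)   # stand-in for the model's empty sequence (assumed shapes)

# real is zero-padded from 70 to 72 along match_dim=1, then stacked on the batch dim.
both = safe_cat(real, empty, dim=0, match_dim=1)
print(both.shape)                  # torch.Size([4, 72, 1024])
```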
{think_sound → ThinkSound}/models/pretrained.py RENAMED
File without changes
{think_sound → ThinkSound}/models/pretransforms.py RENAMED
File without changes
{think_sound → ThinkSound}/models/transformer.py RENAMED
File without changes
{think_sound/models/mmmodules/model → ThinkSound/models}/transformer_layers.py RENAMED
@@ -6,8 +6,8 @@ import torch.nn.functional as F
 from einops import rearrange
 from einops.layers.torch import Rearrange
 
-from ..ext.rotary_embeddings import apply_rope
-from ..model.low_level import MLP, ChannelLastConv1d, ConvMLP
+from .embeddings import apply_rope
+from .blocks import MLP, ChannelLastConv1d, ConvMLP
 try:
     from flash_attn import flash_attn_func, flash_attn_kvpacked_func
     print('flash_attn installed, using Flash Attention')
{think_sound → ThinkSound}/models/utils.py RENAMED
File without changes
{think_sound → ThinkSound}/training/__init__.py RENAMED
File without changes
{think_sound → ThinkSound}/training/autoencoders.py RENAMED
@@ -9,7 +9,6 @@ from .losses.auraloss import SumAndDifferenceSTFTLoss, MultiResolutionSTFTLoss,
 import lightning as L
 from lightning.pytorch.callbacks import Callback
 from ..models.autoencoders import AudioAutoencoder
-from ..models.discriminators import EncodecDiscriminator, OobleckDiscriminator, DACGANLoss
 from ..models.bottleneck import VAEBottleneck, RVQBottleneck, DACRVQBottleneck, DACRVQVAEBottleneck, RVQVAEBottleneck, WassersteinBottleneck
 from .losses import MultiLoss, AuralossLoss, ValueLoss, L1Loss
 from .utils import create_optimizer_from_config, create_scheduler_from_config
{think_sound → ThinkSound}/training/diffusion.py RENAMED
@@ -20,7 +20,6 @@ from pytorch_lightning.utilities.rank_zero import rank_zero_only
 from ..inference.sampling import get_alphas_sigmas, sample, sample_discrete_euler
 from ..models.diffusion import DiffusionModelWrapper, ConditionedDiffusionModelWrapper
 from ..models.autoencoders import DiffusionAutoencoder
-from ..models.diffusion_prior import PriorType
 from .autoencoders import create_loss_modules_from_bottleneck
 from .losses import AuralossLoss, MSELoss, MultiLoss
 from .utils import create_optimizer_from_config, create_scheduler_from_config, mask_from_frac_lengths, generate_mask, generate_channel_mask
@@ -846,10 +845,9 @@ class DiffusionCondTrainingWrapper(L.LightningModule):
 
     def predict_step(self, batch, batch_idx):
         reals, metadata = batch
-        # import ipdb
-        # ipdb.set_trace()
         ids = [item['id'] for item in metadata]
         batch_size, length = reals.shape[0], reals.shape[2]
+
         with torch.amp.autocast('cuda'):
             conditioning = self.diffusion.conditioner(metadata, self.device)
 
@@ -878,7 +876,6 @@ class DiffusionCondTrainingWrapper(L.LightningModule):
         end_time = time.time()
         execution_time = end_time - start_time
         print(f"Execution time: {execution_time:.2f} s")
-        breakpoint()
         if self.diffusion.pretransform is not None:
             fakes = self.diffusion.pretransform.decode(fakes)
 
@@ -1077,947 +1074,3 @@ class DiffusionCondDemoCallback(Callback):
1077
  gc.collect()
1078
  torch.cuda.empty_cache()
1079
  module.train()
1080
-
1081
- class DiffusionCondInpaintTrainingWrapper(L.LightningModule):
1082
- '''
1083
- Wrapper for training a conditional audio diffusion model.
1084
- '''
1085
- def __init__(
1086
- self,
1087
- model: ConditionedDiffusionModelWrapper,
1088
- lr: float = 1e-4,
1089
- max_mask_segments = 10,
1090
- log_loss_info: bool = False,
1091
- optimizer_configs: dict = None,
1092
- use_ema: bool = True,
1093
- pre_encoded: bool = False,
1094
- cfg_dropout_prob = 0.1,
1095
- timestep_sampler: tp.Literal["uniform", "logit_normal"] = "uniform",
1096
- ):
1097
- super().__init__()
1098
-
1099
- self.diffusion = model
1100
-
1101
- self.use_ema = use_ema
1102
-
1103
- if self.use_ema:
1104
- self.diffusion_ema = EMA(
1105
- self.diffusion.model,
1106
- beta=0.9999,
1107
- power=3/4,
1108
- update_every=1,
1109
- update_after_step=1,
1110
- include_online_model=False
1111
- )
1112
- else:
1113
- self.diffusion_ema = None
1114
-
1115
- self.cfg_dropout_prob = cfg_dropout_prob
1116
-
1117
- self.lr = lr
1118
- self.max_mask_segments = max_mask_segments
1119
-
1120
- self.rng = torch.quasirandom.SobolEngine(1, scramble=True)
1121
-
1122
- self.timestep_sampler = timestep_sampler
1123
-
1124
- self.diffusion_objective = model.diffusion_objective
1125
-
1126
- self.loss_modules = [
1127
- MSELoss("output",
1128
- "targets",
1129
- weight=1.0,
1130
- name="mse_loss"
1131
- )
1132
- ]
1133
-
1134
- self.losses = MultiLoss(self.loss_modules)
1135
-
1136
- self.log_loss_info = log_loss_info
1137
-
1138
- assert lr is not None or optimizer_configs is not None, "Must specify either lr or optimizer_configs in training config"
1139
-
1140
- if optimizer_configs is None:
1141
- optimizer_configs = {
1142
- "diffusion": {
1143
- "optimizer": {
1144
- "type": "Adam",
1145
- "config": {
1146
- "lr": lr
1147
- }
1148
- }
1149
- }
1150
- }
1151
- else:
1152
- if lr is not None:
1153
- print(f"WARNING: learning_rate and optimizer_configs both specified in config. Ignoring learning_rate and using optimizer_configs.")
1154
-
1155
- self.optimizer_configs = optimizer_configs
1156
-
1157
- self.pre_encoded = pre_encoded
1158
-
1159
- def configure_optimizers(self):
1160
- diffusion_opt_config = self.optimizer_configs['diffusion']
1161
- opt_diff = create_optimizer_from_config(diffusion_opt_config['optimizer'], self.diffusion.parameters())
1162
-
1163
- if "scheduler" in diffusion_opt_config:
1164
- sched_diff = create_scheduler_from_config(diffusion_opt_config['scheduler'], opt_diff)
1165
- sched_diff_config = {
1166
- "scheduler": sched_diff,
1167
- "interval": "step"
1168
- }
1169
- return [opt_diff], [sched_diff_config]
1170
-
1171
- return [opt_diff]
1172
-
1173
- def random_mask(self, sequence, max_mask_length):
1174
- b, _, sequence_length = sequence.size()
1175
-
1176
- # Create a mask tensor for each batch element
1177
- masks = []
1178
-
1179
- for i in range(b):
1180
- mask_type = random.randint(0, 2)
1181
-
1182
- if mask_type == 0: # Random mask with multiple segments
1183
- num_segments = random.randint(1, self.max_mask_segments)
1184
- max_segment_length = max_mask_length // num_segments
1185
-
1186
- segment_lengths = random.sample(range(1, max_segment_length + 1), num_segments)
1187
-
1188
- mask = torch.ones((1, 1, sequence_length))
1189
- for length in segment_lengths:
1190
- mask_start = random.randint(0, sequence_length - length)
1191
- mask[:, :, mask_start:mask_start + length] = 0
1192
-
1193
- elif mask_type == 1: # Full mask
1194
- mask = torch.zeros((1, 1, sequence_length))
1195
-
1196
- elif mask_type == 2: # Causal mask
1197
- mask = torch.ones((1, 1, sequence_length))
1198
- mask_length = random.randint(1, max_mask_length)
1199
- mask[:, :, -mask_length:] = 0
1200
-
1201
- mask = mask.to(sequence.device)
1202
- masks.append(mask)
1203
-
1204
- # Concatenate the mask tensors into a single tensor
1205
- mask = torch.cat(masks, dim=0).to(sequence.device)
1206
-
1207
- # Apply the mask to the sequence tensor for each batch element
1208
- masked_sequence = sequence * mask
1209
-
1210
- return masked_sequence, mask
1211
-
1212
- def training_step(self, batch, batch_idx):
1213
- reals, metadata = batch
1214
-
1215
- p = Profiler()
1216
-
1217
- if reals.ndim == 4 and reals.shape[0] == 1:
1218
- reals = reals[0]
1219
-
1220
- loss_info = {}
1221
-
1222
- diffusion_input = reals
1223
-
1224
- if not self.pre_encoded:
1225
- loss_info["audio_reals"] = diffusion_input
1226
-
1227
- p.tick("setup")
1228
-
1229
- with torch.amp.autocast('cuda'):
1230
- conditioning = self.diffusion.conditioner(metadata, self.device)
1231
-
1232
- p.tick("conditioning")
1233
-
1234
- if self.diffusion.pretransform is not None:
1235
- self.diffusion.pretransform.to(self.device)
1236
-
1237
- if not self.pre_encoded:
1238
- with torch.amp.autocast('cuda') and torch.set_grad_enabled(self.diffusion.pretransform.enable_grad):
1239
- diffusion_input = self.diffusion.pretransform.encode(diffusion_input)
1240
- p.tick("pretransform")
1241
-
1242
- # If mask_padding is on, interpolate the padding masks to the size of the pretransformed input
1243
- # if use_padding_mask:
1244
- # padding_masks = F.interpolate(padding_masks.unsqueeze(1).float(), size=diffusion_input.shape[2], mode="nearest").squeeze(1).bool()
1245
- else:
1246
- # Apply scale to pre-encoded latents if needed, as the pretransform encode function will not be run
1247
- if hasattr(self.diffusion.pretransform, "scale") and self.diffusion.pretransform.scale != 1.0:
1248
- diffusion_input = diffusion_input / self.diffusion.pretransform.scale
1249
-
1250
- # Max mask size is the full sequence length
1251
- max_mask_length = diffusion_input.shape[2]
1252
-
1253
- # Create a mask of random length for a random slice of the input
1254
- masked_input, mask = self.random_mask(diffusion_input, max_mask_length)
1255
-
1256
- # conditioning['inpaint_mask'] = [mask]
1257
- conditioning['inpaint_masked_input'] = [masked_input]
1258
-
1259
- if self.timestep_sampler == "uniform":
1260
- # Draw uniformly distributed continuous timesteps
1261
- t = self.rng.draw(reals.shape[0])[:, 0].to(self.device)
1262
- elif self.timestep_sampler == "logit_normal":
1263
- t = torch.sigmoid(torch.randn(reals.shape[0], device=self.device))
1264
-
1265
- # Calculate the noise schedule parameters for those timesteps
1266
- if self.diffusion_objective == "v":
1267
- alphas, sigmas = get_alphas_sigmas(t)
1268
- elif self.diffusion_objective == "rectified_flow":
1269
- alphas, sigmas = 1-t, t
1270
-
1271
- # Combine the ground truth data and the noise
1272
- alphas = alphas[:, None, None]
1273
- sigmas = sigmas[:, None, None]
1274
- noise = torch.randn_like(diffusion_input)
1275
- noised_inputs = diffusion_input * alphas + noise * sigmas
1276
-
1277
- if self.diffusion_objective == "v":
1278
- targets = noise * alphas - diffusion_input * sigmas
1279
- elif self.diffusion_objective == "rectified_flow":
1280
- targets = noise - diffusion_input
1281
-
1282
- p.tick("noise")
1283
-
1284
- extra_args = {}
1285
-
1286
- with torch.amp.autocast('cuda'):
1287
- p.tick("amp")
1288
- output = self.diffusion(noised_inputs, t, cond=conditioning, cfg_dropout_prob = self.cfg_dropout_prob, **extra_args)
1289
- p.tick("diffusion")
1290
-
1291
- loss_info.update({
1292
- "output": output,
1293
- "targets": targets,
1294
- })
1295
-
1296
- loss, losses = self.losses(loss_info)
1297
-
1298
- if self.log_loss_info:
1299
- # Loss debugging logs
1300
- num_loss_buckets = 10
1301
- bucket_size = 1 / num_loss_buckets
1302
- loss_all = F.mse_loss(output, targets, reduction="none")
1303
-
1304
- sigmas = rearrange(self.all_gather(sigmas), "w b c n -> (w b) c n").squeeze()
1305
-
1306
- # gather loss_all across all GPUs
1307
- loss_all = rearrange(self.all_gather(loss_all), "w b c n -> (w b) c n")
1308
-
1309
- # Bucket loss values based on corresponding sigma values, bucketing sigma values by bucket_size
1310
- loss_all = torch.stack([loss_all[(sigmas >= i) & (sigmas < i + bucket_size)].mean() for i in torch.arange(0, 1, bucket_size).to(self.device)])
1311
-
1312
- # Log bucketed losses with corresponding sigma bucket values, if it's not NaN
1313
- debug_log_dict = {
1314
- f"model/loss_all_{i/num_loss_buckets:.1f}": loss_all[i].detach() for i in range(num_loss_buckets) if not torch.isnan(loss_all[i])
1315
- }
1316
-
1317
- self.log_dict(debug_log_dict)
1318
-
1319
- log_dict = {
1320
- 'train/loss': loss.detach(),
1321
- 'train/std_data': diffusion_input.std(),
1322
- 'train/lr': self.trainer.optimizers[0].param_groups[0]['lr']
1323
- }
1324
-
1325
- for loss_name, loss_value in losses.items():
1326
- log_dict[f"train/{loss_name}"] = loss_value.detach()
1327
-
1328
- self.log_dict(log_dict, prog_bar=True, on_step=True)
1329
- p.tick("log")
1330
- #print(f"Profiler: {p}")
1331
- return loss
1332
-
1333
- def on_before_zero_grad(self, *args, **kwargs):
1334
- if self.diffusion_ema is not None:
1335
- self.diffusion_ema.update()
1336
-
1337
- def export_model(self, path, use_safetensors=False):
1338
- if self.diffusion_ema is not None:
1339
- self.diffusion.model = self.diffusion_ema.ema_model
1340
-
1341
- if use_safetensors:
1342
- save_file(self.diffusion.state_dict(), path)
1343
- else:
1344
- torch.save({"state_dict": self.diffusion.state_dict()}, path)
1345
-
1346
- class DiffusionCondInpaintDemoCallback(Callback):
1347
- def __init__(
1348
- self,
1349
- demo_dl,
1350
- demo_every=2000,
1351
- demo_steps=250,
1352
- sample_size=65536,
1353
- sample_rate=48000,
1354
- demo_cfg_scales: tp.Optional[tp.List[int]] = [3, 5, 7]
1355
- ):
1356
- super().__init__()
1357
- self.demo_every = demo_every
1358
- self.demo_steps = demo_steps
1359
- self.demo_samples = sample_size
1360
- self.demo_dl = iter(demo_dl)
1361
- self.sample_rate = sample_rate
1362
- self.demo_cfg_scales = demo_cfg_scales
1363
- self.last_demo_step = -1
1364
-
1365
- @rank_zero_only
1366
- @torch.no_grad()
1367
- def on_train_batch_end(self, trainer, module: DiffusionCondTrainingWrapper, outputs, batch, batch_idx):
1368
- if (trainer.global_step - 1) % self.demo_every != 0 or self.last_demo_step == trainer.global_step:
1369
- return
1370
-
1371
- self.last_demo_step = trainer.global_step
1372
-
1373
- try:
1374
- log_dict = {}
1375
-
1376
- demo_reals, metadata = next(self.demo_dl)
1377
-
1378
- # Remove extra dimension added by WebDataset
1379
- if demo_reals.ndim == 4 and demo_reals.shape[0] == 1:
1380
- demo_reals = demo_reals[0]
1381
-
1382
- demo_reals = demo_reals.to(module.device)
1383
-
1384
- if not module.pre_encoded:
1385
- # Log the real audio
1386
- log_dict[f'demo_reals_melspec_left'] = wandb.Image(audio_spectrogram_image(rearrange(demo_reals, "b d n -> d (b n)").mul(32767).to(torch.int16).cpu()))
1387
- # log_dict[f'demo_reals'] = wandb.Audio(rearrange(demo_reals, "b d n -> d (b n)").mul(32767).to(torch.int16).cpu(), sample_rate=self.sample_rate, caption="demo reals")
1388
-
1389
- if module.diffusion.pretransform is not None:
1390
- module.diffusion.pretransform.to(module.device)
1391
- with torch.amp.autocast('cuda'):
1392
- demo_reals = module.diffusion.pretransform.encode(demo_reals)
1393
-
1394
- demo_samples = demo_reals.shape[2]
1395
-
1396
- # Get conditioning
1397
- conditioning = module.diffusion.conditioner(metadata, module.device)
1398
-
1399
- masked_input, mask = module.random_mask(demo_reals, demo_reals.shape[2])
1400
-
1401
- conditioning['inpaint_mask'] = [mask]
1402
- conditioning['inpaint_masked_input'] = [masked_input]
1403
-
1404
- if module.diffusion.pretransform is not None:
1405
- log_dict[f'demo_masked_input'] = wandb.Image(tokens_spectrogram_image(masked_input.cpu()))
1406
- else:
1407
- log_dict[f'demo_masked_input'] = wandb.Image(audio_spectrogram_image(rearrange(masked_input, "b c t -> c (b t)").mul(32767).to(torch.int16).cpu()))
1408
-
1409
- cond_inputs = module.diffusion.get_conditioning_inputs(conditioning)
1410
-
1411
- noise = torch.randn([demo_reals.shape[0], module.diffusion.io_channels, demo_samples]).to(module.device)
1412
-
1413
- trainer.logger.experiment.log(log_dict)
1414
-
1415
- for cfg_scale in self.demo_cfg_scales:
1416
- model = module.diffusion_ema.model if module.diffusion_ema is not None else module.diffusion.model
1417
- print(f"Generating demo for cfg scale {cfg_scale}")
1418
-
1419
- if module.diffusion_objective == "v":
1420
- fakes = sample(model, noise, self.demo_steps, 0, **cond_inputs, cfg_scale=cfg_scale, batch_cfg=True)
1421
- elif module.diffusion_objective == "rectified_flow":
1422
- fakes = sample_discrete_euler(model, noise, self.demo_steps, **cond_inputs, cfg_scale=cfg_scale, batch_cfg=True)
1423
-
1424
- if module.diffusion.pretransform is not None:
1425
- with torch.amp.autocast('cuda'):
1426
- fakes = module.diffusion.pretransform.decode(fakes)
1427
-
1428
- # Put the demos together
1429
- fakes = rearrange(fakes, 'b d n -> d (b n)')
1430
-
1431
- log_dict = {}
1432
-
1433
- filename = f'demo_cfg_{cfg_scale}_{trainer.global_step:08}.wav'
1434
- fakes = fakes.to(torch.float32).div(torch.max(torch.abs(fakes))).mul(32767).to(torch.int16).cpu()
1435
- torchaudio.save(filename, fakes, self.sample_rate)
1436
-
1437
- log_dict[f'demo_cfg_{cfg_scale}'] = wandb.Audio(filename,
1438
- sample_rate=self.sample_rate,
1439
- caption=f'Reconstructed')
1440
-
1441
- log_dict[f'demo_melspec_left_cfg_{cfg_scale}'] = wandb.Image(audio_spectrogram_image(fakes))
1442
-
1443
- trainer.logger.experiment.log(log_dict)
1444
- except Exception as e:
1445
- print(f'{type(e).__name__}: {e}')
1446
- raise e
1447
-
1448
- class DiffusionAutoencoderTrainingWrapper(L.LightningModule):
1449
- '''
1450
- Wrapper for training a diffusion autoencoder
1451
- '''
1452
- def __init__(
1453
- self,
1454
- model: DiffusionAutoencoder,
1455
- lr: float = 1e-4,
1456
- ema_copy = None,
1457
- use_reconstruction_loss: bool = False
1458
- ):
1459
- super().__init__()
1460
-
1461
- self.diffae = model
1462
-
1463
- self.diffae_ema = EMA(
1464
- self.diffae,
1465
- ema_model=ema_copy,
1466
- beta=0.9999,
1467
- power=3/4,
1468
- update_every=1,
1469
- update_after_step=1,
1470
- include_online_model=False
1471
- )
1472
-
1473
- self.lr = lr
1474
-
1475
- self.rng = torch.quasirandom.SobolEngine(1, scramble=True)
1476
-
1477
- loss_modules = [
1478
- MSELoss("v",
1479
- "targets",
1480
- weight=1.0,
1481
- name="mse_loss"
1482
- )
1483
- ]
1484
-
1485
- if model.bottleneck is not None:
1486
- # TODO: Use loss config for configurable bottleneck weights and reconstruction losses
1487
- loss_modules += create_loss_modules_from_bottleneck(model.bottleneck, {})
1488
-
1489
- self.use_reconstruction_loss = use_reconstruction_loss
1490
-
1491
- if use_reconstruction_loss:
1492
- scales = [2048, 1024, 512, 256, 128, 64, 32]
1493
- hop_sizes = []
1494
- win_lengths = []
1495
- overlap = 0.75
1496
- for s in scales:
1497
- hop_sizes.append(int(s * (1 - overlap)))
1498
- win_lengths.append(s)
1499
-
1500
- sample_rate = model.sample_rate
1501
-
1502
- stft_loss_args = {
1503
- "fft_sizes": scales,
1504
- "hop_sizes": hop_sizes,
1505
- "win_lengths": win_lengths,
1506
- "perceptual_weighting": True
1507
- }
1508
-
1509
- out_channels = model.out_channels
1510
-
1511
- if model.pretransform is not None:
1512
- out_channels = model.pretransform.io_channels
1513
-
1514
- if out_channels == 2:
1515
- self.sdstft = auraloss.freq.SumAndDifferenceSTFTLoss(sample_rate=sample_rate, **stft_loss_args)
1516
- else:
1517
- self.sdstft = auraloss.freq.MultiResolutionSTFTLoss(sample_rate=sample_rate, **stft_loss_args)
1518
-
1519
- loss_modules.append(
1520
- AuralossLoss(self.sdstft, 'audio_reals', 'audio_pred', name='mrstft_loss', weight=0.1), # Reconstruction loss
1521
- )
1522
-
1523
- self.losses = MultiLoss(loss_modules)
1524
-
1525
- def configure_optimizers(self):
1526
- return optim.Adam([*self.diffae.parameters()], lr=self.lr)
1527
-
1528
- def training_step(self, batch, batch_idx):
1529
- reals = batch[0]
1530
-
1531
- if reals.ndim == 4 and reals.shape[0] == 1:
1532
- reals = reals[0]
1533
-
1534
- loss_info = {}
1535
-
1536
- loss_info["audio_reals"] = reals
1537
-
1538
- if self.diffae.pretransform is not None:
1539
- with torch.no_grad():
1540
- reals = self.diffae.pretransform.encode(reals)
1541
-
1542
- loss_info["reals"] = reals
1543
-
1544
- #Encode reals, skipping the pretransform since it was already applied
1545
- latents, encoder_info = self.diffae.encode(reals, return_info=True, skip_pretransform=True)
1546
-
1547
- loss_info["latents"] = latents
1548
- loss_info.update(encoder_info)
1549
-
1550
- if self.diffae.decoder is not None:
1551
- latents = self.diffae.decoder(latents)
1552
-
1553
- # Upsample latents to match diffusion length
1554
- if latents.shape[2] != reals.shape[2]:
1555
- latents = F.interpolate(latents, size=reals.shape[2], mode='nearest')
1556
-
1557
- loss_info["latents_upsampled"] = latents
1558
-
1559
- # Draw uniformly distributed continuous timesteps
1560
- t = self.rng.draw(reals.shape[0])[:, 0].to(self.device)
1561
-
1562
- # Calculate the noise schedule parameters for those timesteps
1563
- alphas, sigmas = get_alphas_sigmas(t)
1564
-
1565
- # Combine the ground truth data and the noise
1566
- alphas = alphas[:, None, None]
1567
- sigmas = sigmas[:, None, None]
1568
- noise = torch.randn_like(reals)
1569
- noised_reals = reals * alphas + noise * sigmas
1570
- targets = noise * alphas - reals * sigmas
1571
-
1572
- with torch.amp.autocast('cuda'):
1573
- v = self.diffae.diffusion(noised_reals, t, input_concat_cond=latents)
1574
-
1575
- loss_info.update({
1576
- "v": v,
1577
- "targets": targets
1578
- })
1579
-
1580
- if self.use_reconstruction_loss:
1581
- pred = noised_reals * alphas - v * sigmas
1582
-
1583
- loss_info["pred"] = pred
1584
-
1585
- if self.diffae.pretransform is not None:
1586
- pred = self.diffae.pretransform.decode(pred)
1587
- loss_info["audio_pred"] = pred
1588
-
1589
- loss, losses = self.losses(loss_info)
1590
-
1591
- log_dict = {
1592
- 'train/loss': loss.detach(),
1593
- 'train/std_data': reals.std(),
1594
- 'train/latent_std': latents.std(),
1595
- }
1596
-
1597
- for loss_name, loss_value in losses.items():
1598
- log_dict[f"train/{loss_name}"] = loss_value.detach()
1599
-
1600
- self.log_dict(log_dict, prog_bar=True, on_step=True)
1601
- return loss
1602
-
1603
- def on_before_zero_grad(self, *args, **kwargs):
1604
- self.diffae_ema.update()
1605
-
1606
- def export_model(self, path, use_safetensors=False):
1607
-
1608
- model = self.diffae_ema.ema_model
1609
-
1610
- if use_safetensors:
1611
- save_file(model.state_dict(), path)
1612
- else:
1613
- torch.save({"state_dict": model.state_dict()}, path)
1614
-
1615
- class DiffusionAutoencoderDemoCallback(Callback):
1616
- def __init__(
1617
- self,
1618
- demo_dl,
1619
- demo_every=2000,
1620
- demo_steps=250,
1621
- sample_size=65536,
1622
- sample_rate=48000
1623
- ):
1624
- super().__init__()
1625
- self.demo_every = demo_every
1626
- self.demo_steps = demo_steps
1627
- self.demo_samples = sample_size
1628
- self.demo_dl = iter(demo_dl)
1629
- self.sample_rate = sample_rate
1630
- self.last_demo_step = -1
1631
-
1632
- @rank_zero_only
1633
- @torch.no_grad()
1634
- def on_train_batch_end(self, trainer, module: DiffusionAutoencoderTrainingWrapper, outputs, batch, batch_idx):
1635
- if (trainer.global_step - 1) % self.demo_every != 0 or self.last_demo_step == trainer.global_step:
1636
- return
1637
-
1638
- self.last_demo_step = trainer.global_step
1639
-
1640
- demo_reals, _ = next(self.demo_dl)
1641
-
1642
- # Remove extra dimension added by WebDataset
1643
- if demo_reals.ndim == 4 and demo_reals.shape[0] == 1:
1644
- demo_reals = demo_reals[0]
1645
-
1646
- encoder_input = demo_reals
1647
-
1648
- encoder_input = encoder_input.to(module.device)
1649
-
1650
- demo_reals = demo_reals.to(module.device)
1651
-
1652
- with torch.no_grad() and torch.amp.autocast('cuda'):
1653
- latents = module.diffae_ema.ema_model.encode(encoder_input).float()
1654
- fakes = module.diffae_ema.ema_model.decode(latents, steps=self.demo_steps)
1655
-
1656
- #Interleave reals and fakes
1657
- reals_fakes = rearrange([demo_reals, fakes], 'i b d n -> (b i) d n')
1658
-
1659
- # Put the demos together
1660
- reals_fakes = rearrange(reals_fakes, 'b d n -> d (b n)')
1661
-
1662
- log_dict = {}
1663
-
1664
- filename = f'recon_{trainer.global_step:08}.wav'
1665
- reals_fakes = reals_fakes.to(torch.float32).div(torch.max(torch.abs(reals_fakes))).mul(32767).to(torch.int16).cpu()
1666
- torchaudio.save(filename, reals_fakes, self.sample_rate)
1667
-
1668
- log_dict[f'recon'] = wandb.Audio(filename,
1669
- sample_rate=self.sample_rate,
1670
- caption=f'Reconstructed')
1671
-
1672
- log_dict[f'embeddings_3dpca'] = pca_point_cloud(latents)
1673
- log_dict[f'embeddings_spec'] = wandb.Image(tokens_spectrogram_image(latents))
1674
-
1675
- log_dict[f'recon_melspec_left'] = wandb.Image(audio_spectrogram_image(reals_fakes))
1676
-
1677
- if module.diffae_ema.ema_model.pretransform is not None:
1678
- with torch.no_grad() and torch.amp.autocast('cuda'):
1679
- initial_latents = module.diffae_ema.ema_model.pretransform.encode(encoder_input)
1680
- first_stage_fakes = module.diffae_ema.ema_model.pretransform.decode(initial_latents)
1681
- first_stage_fakes = rearrange(first_stage_fakes, 'b d n -> d (b n)')
1682
- first_stage_fakes = first_stage_fakes.to(torch.float32).mul(32767).to(torch.int16).cpu()
1683
- first_stage_filename = f'first_stage_{trainer.global_step:08}.wav'
1684
- torchaudio.save(first_stage_filename, first_stage_fakes, self.sample_rate)
1685
-
1686
- log_dict[f'first_stage_latents'] = wandb.Image(tokens_spectrogram_image(initial_latents))
1687
-
1688
- log_dict[f'first_stage'] = wandb.Audio(first_stage_filename,
1689
- sample_rate=self.sample_rate,
1690
- caption=f'First Stage Reconstructed')
1691
-
1692
- log_dict[f'first_stage_melspec_left'] = wandb.Image(audio_spectrogram_image(first_stage_fakes))
1693
-
1694
-
1695
- trainer.logger.experiment.log(log_dict)
1696
-
1697
- def create_source_mixture(reals, num_sources=2):
1698
- # Create a fake mixture source by mixing elements from the training batch together with random offsets
1699
- source = torch.zeros_like(reals)
1700
- for i in range(reals.shape[0]):
1701
- sources_added = 0
1702
-
1703
- js = list(range(reals.shape[0]))
1704
- random.shuffle(js)
1705
- for j in js:
1706
- if i == j or (i != j and sources_added < num_sources):
1707
- # Randomly offset the mixed element between 0 and the length of the source
1708
- seq_len = reals.shape[2]
1709
- offset = random.randint(0, seq_len-1)
1710
- source[i, :, offset:] += reals[j, :, :-offset]
1711
- if i == j:
1712
- # If this is the real one, shift the reals as well to ensure alignment
1713
- new_reals = torch.zeros_like(reals[i])
1714
- new_reals[:, offset:] = reals[i, :, :-offset]
1715
- reals[i] = new_reals
1716
- sources_added += 1
1717
-
1718
- return source
1719
-
1720
- class DiffusionPriorTrainingWrapper(L.LightningModule):
1721
- '''
1722
- Wrapper for training a diffusion prior for inverse problems
1723
- Prior types:
1724
- mono_stereo: The prior is conditioned on a mono version of the audio to generate a stereo version
1725
- '''
1726
- def __init__(
1727
- self,
1728
- model: ConditionedDiffusionModelWrapper,
1729
- lr: float = 1e-4,
1730
- ema_copy = None,
1731
- prior_type: PriorType = PriorType.MonoToStereo,
1732
- use_reconstruction_loss: bool = False,
1733
- log_loss_info: bool = False,
1734
- ):
1735
- super().__init__()
1736
-
1737
- self.diffusion = model
1738
-
1739
- self.diffusion_ema = EMA(
1740
- self.diffusion,
1741
- ema_model=ema_copy,
1742
- beta=0.9999,
1743
- power=3/4,
1744
- update_every=1,
1745
- update_after_step=1,
1746
- include_online_model=False
1747
- )
1748
-
1749
- self.lr = lr
1750
-
1751
- self.rng = torch.quasirandom.SobolEngine(1, scramble=True)
1752
-
1753
- self.log_loss_info = log_loss_info
1754
-
1755
- loss_modules = [
1756
- MSELoss("v",
1757
- "targets",
1758
- weight=1.0,
1759
- name="mse_loss"
1760
- )
1761
- ]
1762
-
1763
- self.use_reconstruction_loss = use_reconstruction_loss
1764
-
1765
- if use_reconstruction_loss:
1766
- scales = [2048, 1024, 512, 256, 128, 64, 32]
1767
- hop_sizes = []
1768
- win_lengths = []
1769
- overlap = 0.75
1770
- for s in scales:
1771
- hop_sizes.append(int(s * (1 - overlap)))
1772
- win_lengths.append(s)
1773
-
1774
- sample_rate = model.sample_rate
1775
-
1776
- stft_loss_args = {
1777
- "fft_sizes": scales,
1778
- "hop_sizes": hop_sizes,
1779
- "win_lengths": win_lengths,
1780
- "perceptual_weighting": True
1781
- }
1782
-
1783
- out_channels = model.io_channels
1784
-
1785
-
1786
- if model.pretransform is not None:
1787
- out_channels = model.pretransform.io_channels
1788
- self.audio_out_channels = out_channels
1789
-
1790
- if self.audio_out_channels == 2:
1791
- self.sdstft = auraloss.freq.SumAndDifferenceSTFTLoss(sample_rate=sample_rate, **stft_loss_args)
1792
- self.lrstft = auraloss.freq.MultiResolutionSTFTLoss(sample_rate=sample_rate, **stft_loss_args)
1793
-
1794
- # Add left and right channel reconstruction losses in addition to the sum and difference
1795
- loss_modules += [
1796
- AuralossLoss(self.lrstft, 'audio_reals_left', 'pred_left', name='stft_loss_left', weight=0.05),
1797
- AuralossLoss(self.lrstft, 'audio_reals_right', 'pred_right', name='stft_loss_right', weight=0.05),
1798
- ]
1799
-
1800
- else:
1801
- self.sdstft = auraloss.freq.MultiResolutionSTFTLoss(sample_rate=sample_rate, **stft_loss_args)
1802
-
1803
- loss_modules.append(
1804
- AuralossLoss(self.sdstft, 'audio_reals', 'audio_pred', name='mrstft_loss', weight=0.1), # Reconstruction loss
1805
- )
1806
-
1807
- self.losses = MultiLoss(loss_modules)
1808
-
1809
- self.prior_type = prior_type
1810
-
1811
- def configure_optimizers(self):
1812
- return optim.Adam([*self.diffusion.parameters()], lr=self.lr)
1813
-
1814
- def training_step(self, batch, batch_idx):
1815
- reals, metadata = batch
1816
-
1817
- if reals.ndim == 4 and reals.shape[0] == 1:
1818
- reals = reals[0]
1819
-
1820
- loss_info = {}
1821
-
1822
- loss_info["audio_reals"] = reals
1823
-
1824
- if self.prior_type == PriorType.MonoToStereo:
1825
- source = reals.mean(dim=1, keepdim=True).repeat(1, reals.shape[1], 1).to(self.device)
1826
- loss_info["audio_reals_mono"] = source
1827
- else:
1828
- raise ValueError(f"Unknown prior type {self.prior_type}")
1829
-
1830
- if self.diffusion.pretransform is not None:
1831
- with torch.no_grad():
- reals = self.diffusion.pretransform.encode(reals)
-
- if self.prior_type in [PriorType.MonoToStereo]:
- source = self.diffusion.pretransform.encode(source)
-
- if self.diffusion.conditioner is not None:
- with torch.amp.autocast('cuda'):
- conditioning = self.diffusion.conditioner(metadata, self.device)
- else:
- conditioning = {}
-
- loss_info["reals"] = reals
-
- # Draw uniformly distributed continuous timesteps
- t = self.rng.draw(reals.shape[0])[:, 0].to(self.device)
-
- # Calculate the noise schedule parameters for those timesteps
- alphas, sigmas = get_alphas_sigmas(t)
-
- # Combine the ground truth data and the noise
- alphas = alphas[:, None, None]
- sigmas = sigmas[:, None, None]
- noise = torch.randn_like(reals)
- noised_reals = reals * alphas + noise * sigmas
- targets = noise * alphas - reals * sigmas
-
- with torch.amp.autocast('cuda'):
-
- conditioning['source'] = [source]
-
- v = self.diffusion(noised_reals, t, cond=conditioning, cfg_dropout_prob = 0.1)
-
- loss_info.update({
- "v": v,
- "targets": targets
- })
-
- if self.use_reconstruction_loss:
- pred = noised_reals * alphas - v * sigmas
-
- loss_info["pred"] = pred
-
- if self.diffusion.pretransform is not None:
- pred = self.diffusion.pretransform.decode(pred)
- loss_info["audio_pred"] = pred
-
- if self.audio_out_channels == 2:
- loss_info["pred_left"] = pred[:, 0:1, :]
- loss_info["pred_right"] = pred[:, 1:2, :]
- loss_info["audio_reals_left"] = loss_info["audio_reals"][:, 0:1, :]
- loss_info["audio_reals_right"] = loss_info["audio_reals"][:, 1:2, :]
-
- loss, losses = self.losses(loss_info)
-
- if self.log_loss_info:
- # Loss debugging logs
- num_loss_buckets = 10
- bucket_size = 1 / num_loss_buckets
- loss_all = F.mse_loss(v, targets, reduction="none")
-
- sigmas = rearrange(self.all_gather(sigmas), "w b c n -> (w b) c n").squeeze()
-
- # gather loss_all across all GPUs
- loss_all = rearrange(self.all_gather(loss_all), "w b c n -> (w b) c n")
-
- # Bucket loss values based on corresponding sigma values, bucketing sigma values by bucket_size
- loss_all = torch.stack([loss_all[(sigmas >= i) & (sigmas < i + bucket_size)].mean() for i in torch.arange(0, 1, bucket_size).to(self.device)])
-
- # Log bucketed losses with corresponding sigma bucket values, if it's not NaN
- debug_log_dict = {
- f"model/loss_all_{i/num_loss_buckets:.1f}": loss_all[i].detach() for i in range(num_loss_buckets) if not torch.isnan(loss_all[i])
- }
-
- self.log_dict(debug_log_dict)
-
- log_dict = {
- 'train/loss': loss.detach(),
- 'train/std_data': reals.std()
- }
-
- for loss_name, loss_value in losses.items():
- log_dict[f"train/{loss_name}"] = loss_value.detach()
-
- self.log_dict(log_dict, prog_bar=True, on_step=True)
- return loss
-
- def on_before_zero_grad(self, *args, **kwargs):
- self.diffusion_ema.update()
-
- def export_model(self, path, use_safetensors=False):
-
- #model = self.diffusion_ema.ema_model
- model = self.diffusion
-
- if use_safetensors:
- save_file(model.state_dict(), path)
- else:
- torch.save({"state_dict": model.state_dict()}, path)
-
- class DiffusionPriorDemoCallback(Callback):
- def __init__(
- self,
- demo_dl,
- demo_every=2000,
- demo_steps=250,
- sample_size=65536,
- sample_rate=48000
- ):
- super().__init__()
-
- self.demo_every = demo_every
- self.demo_steps = demo_steps
- self.demo_samples = sample_size
- self.demo_dl = iter(demo_dl)
- self.sample_rate = sample_rate
- self.last_demo_step = -1
-
- @rank_zero_only
- @torch.no_grad()
- def on_train_batch_end(self, trainer, module: DiffusionAutoencoderTrainingWrapper, outputs, batch, batch_idx):
- if (trainer.global_step - 1) % self.demo_every != 0 or self.last_demo_step == trainer.global_step:
- return
-
- self.last_demo_step = trainer.global_step
-
- demo_reals, metadata = next(self.demo_dl)
- # import ipdb
- # ipdb.set_trace()
- # Remove extra dimension added by WebDataset
- if demo_reals.ndim == 4 and demo_reals.shape[0] == 1:
- demo_reals = demo_reals[0]
-
- demo_reals = demo_reals.to(module.device)
-
- encoder_input = demo_reals
-
- if module.diffusion.conditioner is not None:
- with torch.amp.autocast('cuda'):
- conditioning_tensors = module.diffusion.conditioner(metadata, module.device)
-
- else:
- conditioning_tensors = {}
-
-
- with torch.no_grad() and torch.amp.autocast('cuda'):
- if module.prior_type == PriorType.MonoToStereo and encoder_input.shape[1] > 1:
- source = encoder_input.mean(dim=1, keepdim=True).repeat(1, encoder_input.shape[1], 1).to(module.device)
-
- if module.diffusion.pretransform is not None:
- encoder_input = module.diffusion.pretransform.encode(encoder_input)
- source_input = module.diffusion.pretransform.encode(source)
- else:
- source_input = source
-
- conditioning_tensors['source'] = [source_input]
-
- fakes = sample(module.diffusion_ema.model, torch.randn_like(encoder_input), self.demo_steps, 0, cond=conditioning_tensors)
-
- if module.diffusion.pretransform is not None:
- fakes = module.diffusion.pretransform.decode(fakes)
-
- #Interleave reals and fakes
- reals_fakes = rearrange([demo_reals, fakes], 'i b d n -> (b i) d n')
-
- # Put the demos together
- reals_fakes = rearrange(reals_fakes, 'b d n -> d (b n)')
-
- log_dict = {}
-
- filename = f'recon_mono_{trainer.global_step:08}.wav'
- reals_fakes = reals_fakes.to(torch.float32).div(torch.max(torch.abs(reals_fakes))).mul(32767).to(torch.int16).cpu()
- torchaudio.save(filename, reals_fakes, self.sample_rate)
-
- log_dict[f'recon'] = wandb.Audio(filename,
- sample_rate=self.sample_rate,
- caption=f'Reconstructed')
-
- log_dict[f'recon_melspec_left'] = wandb.Image(audio_spectrogram_image(reals_fakes))
-
- #Log the source
- filename = f'source_{trainer.global_step:08}.wav'
- source = rearrange(source, 'b d n -> d (b n)')
- source = source.to(torch.float32).mul(32767).to(torch.int16).cpu()
- torchaudio.save(filename, source, self.sample_rate)
-
- log_dict[f'source'] = wandb.Audio(filename,
- sample_rate=self.sample_rate,
- caption=f'Source')
-
- log_dict[f'source_melspec_left'] = wandb.Image(audio_spectrogram_image(source))
-
- trainer.logger.experiment.log(log_dict)
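
The core of the removed training_step is the v-objective: noise the latents with a schedule from get_alphas_sigmas, regress the "v" target, and (optionally) reconstruct the clean latent for a reconstruction loss. The sketch below is a minimal, self-contained illustration of that math only, not the deleted code itself; it assumes the conventional trigonometric schedule (alpha = cos(t·π/2), sigma = sin(t·π/2)) for get_alphas_sigmas, whose real definition lives in ..inference.sampling.

```python
# Minimal sketch of the v-objective used by the removed training_step.
# Assumption: get_alphas_sigmas follows the usual trigonometric schedule.
import torch

def get_alphas_sigmas(t: torch.Tensor):
    # alpha^2 + sigma^2 = 1 for every continuous timestep t in [0, 1]
    return torch.cos(t * torch.pi / 2), torch.sin(t * torch.pi / 2)

reals = torch.randn(4, 64, 256)        # (batch, latent channels, latent frames)
t = torch.rand(reals.shape[0])         # uniformly drawn continuous timesteps

alphas, sigmas = get_alphas_sigmas(t)
alphas = alphas[:, None, None]
sigmas = sigmas[:, None, None]

noise = torch.randn_like(reals)
noised_reals = reals * alphas + noise * sigmas   # forward noising
targets = noise * alphas - reals * sigmas        # v-prediction target

# Given a (here: perfect) v prediction, the clean latent is recovered exactly
# as in the reconstruction-loss branch of the removed code.
v = targets
pred = noised_reals * alphas - v * sigmas
assert torch.allclose(pred, reals, atol=1e-5)
```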
 
  from ..inference.sampling import get_alphas_sigmas, sample, sample_discrete_euler
  from ..models.diffusion import DiffusionModelWrapper, ConditionedDiffusionModelWrapper
  from ..models.autoencoders import DiffusionAutoencoder
  from .autoencoders import create_loss_modules_from_bottleneck
  from .losses import AuralossLoss, MSELoss, MultiLoss
  from .utils import create_optimizer_from_config, create_scheduler_from_config, mask_from_frac_lengths, generate_mask, generate_channel_mask

  def predict_step(self, batch, batch_idx):
  reals, metadata = batch
  ids = [item['id'] for item in metadata]
  batch_size, length = reals.shape[0], reals.shape[2]
+ print(f"Predicting {batch_size} samples with length {length} for ids: {ids}")
  with torch.amp.autocast('cuda'):
  conditioning = self.diffusion.conditioner(metadata, self.device)

  end_time = time.time()
  execution_time = end_time - start_time
  print(f"Execution time: {execution_time:.2f} s")
  if self.diffusion.pretransform is not None:
  fakes = self.diffusion.pretransform.decode(fakes)

  gc.collect()
  torch.cuda.empty_cache()
  module.train()
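
The surrounding context lines outline the inference path: build conditioning from the metadata, sample latents, time the call, then decode with the pretransform. Below is a hedged sketch of that path as one helper, assuming the renamed package is installed, `wrapper` is a training wrapper loaded as app.py does, and `model` is the denoiser to sample from (the removed demo callback used `wrapper.diffusion_ema.model`); the `sample(...)` call mirrors the positional pattern shown in that callback, and the int16 conversion mirrors app.py.

```python
# Hedged sketch of the sampling path shown above; `wrapper` and `model` are
# assumed to be prepared elsewhere (e.g. as in app.py), not defined here.
import time
import torch
from ThinkSound.inference.sampling import sample

@torch.no_grad()
def generate_audio(wrapper, model, metadata, latent_shape, steps=24):
    device = wrapper.device
    with torch.amp.autocast('cuda'):
        conditioning = wrapper.diffusion.conditioner(metadata, device)

    noise = torch.randn(latent_shape, device=device)

    start_time = time.time()
    # Same call pattern as the removed demo callback: sample(model, noise, steps, eta, cond=...)
    fakes = sample(model, noise, steps, 0, cond=conditioning)
    print(f"Execution time: {time.time() - start_time:.2f} s")

    # Decode latents back to a waveform, then peak-normalize to 16-bit PCM as app.py does
    if wrapper.diffusion.pretransform is not None:
        fakes = wrapper.diffusion.pretransform.decode(fakes)
    audio = fakes.to(torch.float32).div(torch.max(torch.abs(fakes))).clamp(-1, 1)
    return audio.mul(32767).to(torch.int16).cpu()
```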
 
{think_sound → ThinkSound}/training/factory.py RENAMED
File without changes
{think_sound → ThinkSound}/training/losses/__init__.py RENAMED
File without changes
{think_sound → ThinkSound}/training/losses/auraloss.py RENAMED
File without changes
{think_sound → ThinkSound}/training/losses/losses.py RENAMED
File without changes
{think_sound → ThinkSound}/training/utils.py RENAMED
File without changes
app.py CHANGED
@@ -14,13 +14,12 @@ from lightning.pytorch.tuner import Tuner
  from lightning.pytorch import seed_everything
  import random
  from datetime import datetime
- # from think_sound.data.dataset import create_dataloader_from_config
- from think_sound.data.datamodule import DataModule
- from think_sound.models import create_model_from_config
- from think_sound.models.utils import load_ckpt_state_dict, remove_weight_norm_from_model
- from think_sound.training import create_training_wrapper_from_config, create_demo_callback_from_config
- from think_sound.training.utils import copy_state_dict
- from think_sound.inference.sampling import get_alphas_sigmas, sample, sample_discrete_euler
  from data_utils.v2a_utils.feature_utils_224 import FeaturesUtils
  from torch.utils.data import Dataset
  from typing import Optional, Union
@@ -34,7 +33,7 @@ import tempfile
  import subprocess
  from huggingface_hub import hf_hub_download
  from moviepy.editor import VideoFileClip
- os.system("conda install -c conda-forge 'ffmpeg<7'")

  _CLIP_SIZE = 224
  _CLIP_FPS = 8.0
@@ -101,7 +100,7 @@ class VGGSound(Dataset):

  self.resampler = {}

- def sample(self, video_path,label):
  video_id = video_path

  reader = StreamingMediaDecoder(video_path)
@@ -156,7 +155,7 @@ class VGGSound(Dataset):
  # padding using the last frame, but no more than 2
  current_length = sync_chunk.shape[0]
  last_frame = sync_chunk[-1]
- # repeat the last frame for padding
  padding = last_frame.repeat(self.sync_expected_length - current_length, 1, 1, 1)
  assert self.sync_expected_length - current_length < 12, f'sync can pad no more than 2 while {self.sync_expected_length - current_length}'
  sync_chunk = torch.cat((sync_chunk, padding), dim=0)
@@ -170,6 +169,7 @@ class VGGSound(Dataset):
  data = {
  'id': video_id,
  'caption': label,
  # 'audio': audio_chunk,
  'clip_video': clip_chunk,
  'sync_video': sync_chunk,
@@ -187,17 +187,16 @@ else:

  print(f"load in device {device}")

- vae_ckpt = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="vae.ckpt",repo_type="model")
- synchformer_ckpt = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="synchformer_state_dict.pth",repo_type="model")
  feature_extractor = FeaturesUtils(
- vae_ckpt=vae_ckpt,
- vae_config='think_sound/configs/model_configs/autoencoders/stable_audio_2_0_vae.json',
  enable_conditions=True,
  synchformer_ckpt=synchformer_ckpt
  ).eval().to(extra_device)

-
-
  args = get_all_args()

  seed = 10086
@@ -206,7 +205,7 @@ seed_everything(seed, workers=True)

  #Get JSON config from args.model_config
- with open("think_sound/configs/model_configs/vt2audio/latent_clip_224_text_sync_mmdit_flow_logit_t5_kernel_size3.json") as f:
  model_config = json.load(f)

  model = create_model_from_config(model_config)
@@ -229,7 +228,7 @@ model.pretransform.load_state_dict(load_vae_state)
  # Remove weight_norm from the pretransform if specified
  if args.remove_pretransform_weight_norm == "post_load":
  remove_weight_norm_from_model(model.pretransform)
- ckpt_path = hf_hub_download(repo_id="liuhuadai/ThinkSound", filename="thinksound.ckpt",repo_type="model")
  training_wrapper = create_training_wrapper_from_config(model_config, model)
  # choose map_location based on the device when loading the model weights
  training_wrapper.load_state_dict(torch.load(ckpt_path)['state_dict'])
@@ -243,16 +242,17 @@ def get_video_duration(video_path):
  @spaces.GPU(duration=60)
  @torch.inference_mode()
  @torch.no_grad()
- def get_audio(video_path, caption):
- # allow caption to be empty
  if caption is None:
  caption = ''

  timer = Timer(duration="00:15:00:00")
  #get video duration
  duration_sec = get_video_duration(video_path)
  print(duration_sec)
  preprocesser = VGGSound(duration_sec=duration_sec)
- data = preprocesser.sample(video_path, caption)

@@ -261,7 +261,7 @@ def get_audio(video_path, caption):
  preprocessed_data['metaclip_global_text_features'] = metaclip_global_text_features.detach().cpu().squeeze(0)
  preprocessed_data['metaclip_text_features'] = metaclip_text_features.detach().cpu().squeeze(0)

- t5_features = feature_extractor.encode_t5_text(data['caption'])
  preprocessed_data['t5_features'] = t5_features.detach().cpu().squeeze(0)

  clip_features = feature_extractor.encode_video_with_clip(data['clip_video'].unsqueeze(0).to(extra_device))
@@ -305,56 +305,47 @@ def get_audio(video_path, caption):
  fakes = training_wrapper.diffusion.pretransform.decode(fakes)

  audios = fakes.to(torch.float32).div(torch.max(torch.abs(fakes))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
- # save a temporary audio file
  with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_audio:
  torchaudio.save(tmp_audio.name, audios[0], 44100)
  audio_path = tmp_audio.name
  return audio_path

- def synthesize_video_with_audio(video_file, caption):
- # allow caption to be empty
- if caption is None:
- caption = ''
- audio_path = get_audio(video_file, caption)
  with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_video:
  output_video_path = tmp_video.name
- # ffmpeg command: replace the original video's audio track with the new audio
  cmd = [
  'ffmpeg', '-y', '-i', video_file, '-i', audio_path,
  '-c:v', 'copy', '-map', '0:v:0', '-map', '1:a:0',
  '-shortest', output_video_path
  ]
  subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  return output_video_path

- # Gradio UI
- with gr.Blocks() as demo:
- gr.Markdown(
- """
- # ThinkSound\n
- ThinkSound is a unified Any2Audio generation framework with flow matching guided by Chain-of-Thought (CoT) reasoning.
-
- Upload video and caption (optional), and get video with audio!
-
- """
- )
- with gr.Row():
- video_input = gr.Video(label="upload video")
- caption_input = gr.Textbox(label="caption(optional)", placeholder="can be empty", lines=1)
- output_video = gr.Video(label="output video")
- btn = gr.Button("start synthesize")
- btn.click(fn=synthesize_video_with_audio, inputs=[video_input, caption_input], outputs=output_video)
-
- gr.Examples(
- examples=[
- ["./examples/1_mute.mp4", "Playing Trumpet", "./examples/1.mp4"],
- ["./examples/2_mute.mp4", "Axe striking", "./examples/2.mp4"],
- ["./examples/3_mute.mp4", "Gentle Sucking Sounds From the Pacifier", "./examples/3.mp4"],
- ["./examples/4_mute.mp4", "train passing by", "./examples/4.mp4"],
- ["./examples/5_mute.mp4", "Lighting Firecrackers", "./examples/5.mp4"]
- ],
- inputs=[video_input, caption_input,output_video],
- )
-
- demo.launch(share=True)
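
The removed synthesize_video_with_audio above muxes the generated wav back into the silent video with ffmpeg, and the same command survives unchanged on the new side of this diff. For reference, here is a hedged sketch of that step factored into a reusable helper (the helper name and the return-code check are additions for illustration; the app itself ignores ffmpeg errors).

```python
# Hedged sketch of the ffmpeg muxing step used by synthesize_video_with_audio.
import subprocess
import tempfile

def mux_audio_into_video(video_path: str, audio_path: str) -> str:
    """Replace the video's audio track with `audio_path` and return the new file path."""
    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_video:
        output_video_path = tmp_video.name
    cmd = [
        'ffmpeg', '-y', '-i', video_path, '-i', audio_path,
        '-c:v', 'copy', '-map', '0:v:0', '-map', '1:a:0',  # copy video, take audio from the wav
        '-shortest', output_video_path,
    ]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        raise RuntimeError(f"ffmpeg failed: {result.stderr.decode(errors='ignore')}")
    return output_video_path
```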
 
  from lightning.pytorch import seed_everything
  import random
  from datetime import datetime
+ from ThinkSound.data.datamodule import DataModule
+ from ThinkSound.models import create_model_from_config
+ from ThinkSound.models.utils import load_ckpt_state_dict, remove_weight_norm_from_model
+ from ThinkSound.training import create_training_wrapper_from_config, create_demo_callback_from_config
+ from ThinkSound.training.utils import copy_state_dict
+ from ThinkSound.inference.sampling import get_alphas_sigmas, sample, sample_discrete_euler
  from data_utils.v2a_utils.feature_utils_224 import FeaturesUtils
  from torch.utils.data import Dataset
  from typing import Optional, Union

  import subprocess
  from huggingface_hub import hf_hub_download
  from moviepy.editor import VideoFileClip
+ # os.system("conda install -c conda-forge 'ffmpeg<7'")

  _CLIP_SIZE = 224
  _CLIP_FPS = 8.0

  self.resampler = {}

+ def sample(self, video_path,label,cot):
  video_id = video_path

  reader = StreamingMediaDecoder(video_path)

  # padding using the last frame, but no more than 2
  current_length = sync_chunk.shape[0]
  last_frame = sync_chunk[-1]
+
  padding = last_frame.repeat(self.sync_expected_length - current_length, 1, 1, 1)
  assert self.sync_expected_length - current_length < 12, f'sync can pad no more than 2 while {self.sync_expected_length - current_length}'
  sync_chunk = torch.cat((sync_chunk, padding), dim=0)

  data = {
  'id': video_id,
  'caption': label,
+ 'caption_cot': cot,
  # 'audio': audio_chunk,
  'clip_video': clip_chunk,
  'sync_video': sync_chunk,

  print(f"load in device {device}")

+ vae_ckpt = hf_hub_download(repo_id="FunAudioLLM/ThinkSound", filename="vae.ckpt",repo_type="model")
+ synchformer_ckpt = hf_hub_download(repo_id="FunAudioLLM/ThinkSound", filename="synchformer_state_dict.pth",repo_type="model")
+
  feature_extractor = FeaturesUtils(
+ vae_ckpt=None,
+ vae_config='ThinkSound/configs/model_configs/stable_audio_2_0_vae.json',
  enable_conditions=True,
  synchformer_ckpt=synchformer_ckpt
  ).eval().to(extra_device)

  args = get_all_args()

  seed = 10086

  #Get JSON config from args.model_config
+ with open("ThinkSound/configs/model_configs/thinksound.json") as f:
  model_config = json.load(f)

  model = create_model_from_config(model_config)

  # Remove weight_norm from the pretransform if specified
  if args.remove_pretransform_weight_norm == "post_load":
  remove_weight_norm_from_model(model.pretransform)
+ ckpt_path = hf_hub_download(repo_id="FunAudioLLM/ThinkSound", filename="thinksound.ckpt",repo_type="model")
  training_wrapper = create_training_wrapper_from_config(model_config, model)
  # choose map_location based on the device when loading the model weights
  training_wrapper.load_state_dict(torch.load(ckpt_path)['state_dict'])

  @spaces.GPU(duration=60)
  @torch.inference_mode()
  @torch.no_grad()
+ def get_audio(video_path, caption, cot):
  if caption is None:
  caption = ''
+ if cot is None:
+ cot = caption
  timer = Timer(duration="00:15:00:00")
  #get video duration
  duration_sec = get_video_duration(video_path)
  print(duration_sec)
  preprocesser = VGGSound(duration_sec=duration_sec)
+ data = preprocesser.sample(video_path, caption, cot)

  preprocessed_data['metaclip_global_text_features'] = metaclip_global_text_features.detach().cpu().squeeze(0)
  preprocessed_data['metaclip_text_features'] = metaclip_text_features.detach().cpu().squeeze(0)

+ t5_features = feature_extractor.encode_t5_text(data['caption_cot'])
  preprocessed_data['t5_features'] = t5_features.detach().cpu().squeeze(0)

  clip_features = feature_extractor.encode_video_with_clip(data['clip_video'].unsqueeze(0).to(extra_device))

  fakes = training_wrapper.diffusion.pretransform.decode(fakes)

  audios = fakes.to(torch.float32).div(torch.max(torch.abs(fakes))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
  with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_audio:
  torchaudio.save(tmp_audio.name, audios[0], 44100)
  audio_path = tmp_audio.name
+
  return audio_path

+ def synthesize_video_with_audio(video_file, caption, cot):
+ audio_path = get_audio(video_file, caption, cot)
  with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_video:
  output_video_path = tmp_video.name
+
  cmd = [
  'ffmpeg', '-y', '-i', video_file, '-i', audio_path,
  '-c:v', 'copy', '-map', '0:v:0', '-map', '1:a:0',
  '-shortest', output_video_path
  ]
  subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
  return output_video_path

+ demo = gr.Interface(
+ fn=synthesize_video_with_audio,
+ inputs=[
+ gr.Video(label="Upload Video"),
+ gr.Textbox(label="Caption (optional)", placeholder="can be empty",),
+ gr.Textbox(label="CoT Description (optional)", lines=6, placeholder="can be empty",),
+ ],
+ outputs=[
+ gr.Video(label="Result"),
+ ],
+ title="ThinkSound Demo",
+ description="Upload a video, caption, or CoT to generate audio. For an enhanced experience, we automatically merge the generated audio with your original silent video. (Note: Flexible audio generation lengths are supported.:)",
+ examples=[
+ ["examples/3_mute.mp4", "Gentle Sucking Sounds From the Pacifier", "Begin by creating a soft, steady background of light pacifier suckling. Add subtle, breathy rhythms to mimic a newborn's gentle mouth movements. Keep the sound smooth, natural, and soothing."],
+ ["examples/2_mute.mp4", "Printer Printing", "Generate a continuous printer printing sound with periodic beeps and paper movement, plus a cat pawing at the machine. Add subtle ambient room noise for authenticity, keeping the focus on printing, beeps, and the cat's interaction."],
+ ["examples/4_mute.mp4", "Plastic Debris Handling", "Begin with the sound of hands scooping up loose plastic debris, followed by the subtle cascading noise as the pieces fall and scatter back down. Include soft crinkling and rustling to emphasize the texture of the plastic. Add ambient factory background noise with distant machinery to create an industrial atmosphere."],
+ ["examples/5_mute.mp4", "Lighting Firecrackers", "Generate the sound of firecrackers lighting and exploding repeatedly on the ground, followed by fireworks bursting in the sky. Incorporate occasional subtle echoes to mimic an outdoor night ambiance, with no human voices present."]
+ ],
+ cache_examples=True
+ )
+
+ if __name__ == "__main__":
+ demo.launch(share=True)
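
The new interface exposes synthesize_video_with_audio(video_file, caption, cot); when cot is None, get_audio falls back to the plain caption, while the CoT text is what feeds the T5 branch (caption_cot → encode_t5_text). A hypothetical programmatic use, with the caption and CoT taken from the bundled examples, might look like the sketch below; importing app is an assumption and will download the checkpoints, and it needs the same GPU environment as the Space.

```python
# Hypothetical programmatic use of the new CoT-aware entry point defined in app.py.
from app import synthesize_video_with_audio

caption = "Gentle Sucking Sounds From the Pacifier"
cot = ("Begin by creating a soft, steady background of light pacifier suckling. "
       "Add subtle, breathy rhythms to mimic a newborn's gentle mouth movements. "
       "Keep the sound smooth, natural, and soothing.")

# Passing cot=None would make get_audio fall back to the plain caption.
output_path = synthesize_video_with_audio("examples/3_mute.mp4", caption, cot)
print(f"Video with generated audio written to {output_path}")
```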
cot_vgg_demo_caption.txt ADDED
@@ -0,0 +1 @@
+ demo.npz
data_utils/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (149 Bytes)
 
data_utils/__pycache__/utils.cpython-310.pyc DELETED
Binary file (4.56 kB)
 
data_utils/__pycache__/utils.cpython-39.pyc DELETED
Binary file (4.56 kB)
 
data_utils/ext/synchformer/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (243 Bytes)
 
data_utils/ext/synchformer/__pycache__/__init__.cpython-39.pyc DELETED
Binary file (241 Bytes)
 
data_utils/ext/synchformer/__pycache__/motionformer.cpython-310.pyc DELETED
Binary file (12.7 kB)
 
data_utils/ext/synchformer/__pycache__/motionformer.cpython-39.pyc DELETED
Binary file (12.7 kB)
 
data_utils/ext/synchformer/__pycache__/synchformer.cpython-310.pyc DELETED
Binary file (1.91 kB)
 
data_utils/ext/synchformer/__pycache__/synchformer.cpython-39.pyc DELETED
Binary file (1.9 kB)
 
data_utils/ext/synchformer/__pycache__/utils.cpython-310.pyc DELETED
Binary file (3.97 kB)
 
data_utils/ext/synchformer/__pycache__/utils.cpython-39.pyc DELETED
Binary file (3.78 kB)