Spaces:

tencent
/

HunyuanVideo-Foley

Running on Zero

HunyuanVideo-Foley / hunyuanvideo_foley / models / synchformer / synchformer.py

James Zhou

[init]

9867d34 4 days ago

12.9 kB

	import logging
	import math
	from typing import Any, Mapping

	import einops
	import numpy as np
	import torch
	import torchaudio
	from torch import nn
	from torch.nn import functional as F

	from .motionformer import MotionFormer
	from .ast_model import AST
	from .utils import Config


	class Synchformer(nn.Module):
	    """Audio-visual synchronisation model.

	    Holds a visual feature extractor (``MotionFormer``), an audio feature
	    extractor (``AST``), one linear projection per modality and a
	    ``GlobalTransformer`` that scores a (visual, audio) feature pair.
	    ``forward`` only runs the visual feature extractor.
	    """

	    def __init__(self):
	        super().__init__()

	        self.vfeat_extractor = MotionFormer(
	            extract_features=True,
	            factorize_space_time=True,
	            agg_space_module="TransformerEncoderLayer",
	            agg_time_module="torch.nn.Identity",
	            add_global_repr=False,
	        )
	        self.afeat_extractor = AST(
	            extract_features=True,
	            max_spec_t=66,
	            factorize_freq_time=True,
	            agg_freq_module="TransformerEncoderLayer",
	            agg_time_module="torch.nn.Identity",
	            add_global_repr=False,
	        )

	        # per-modality projections (768 -> 768) applied before the joint transformer
	        self.vproj = nn.Linear(in_features=768, out_features=768)
	        self.aproj = nn.Linear(in_features=768, out_features=768)
	        self.transformer = GlobalTransformer(
	            tok_pdrop=0.0, embd_pdrop=0.1, resid_pdrop=0.1, attn_pdrop=0.1, n_layer=3, n_head=8, n_embd=768
	        )

	    def forward(self, vis):
	        """Return the visual features of ``vis``; identical to ``extract_vfeats``."""
	        return self.extract_vfeats(vis)

	    def compare_v_a(self, vis: torch.Tensor, aud: torch.Tensor):
	        """Project both feature sets and score them with the joint transformer.

	        Args:
	            vis: visual features of shape (B, S, tv, D).
	            aud: audio features of shape (B, S, ta, D).

	        Returns:
	            Whatever ``self.transformer`` returns for the flattened sequences
	            (the original note says logits of shape (B, cls)).
	        """
	        vis = self.vproj(vis)
	        aud = self.aproj(aud)

	        B, S, tv, D = vis.shape
	        B, S, ta, D = aud.shape  # B, S and D are taken from the audio tensor
	        # merge the segment and within-segment time axes into one sequence axis
	        vis = vis.view(B, S * tv, D)  # (B, S*tv, D)
	        aud = aud.view(B, S * ta, D)  # (B, S*ta, D)

	        # self.transformer concatenates vis and aud into one sequence with aux tokens
	        logits = self.transformer(vis, aud)

	        return logits

	    def extract_vfeats(self, vis):
	        """Run the visual feature extractor on a rank-6 video tensor.

	        Args:
	            vis: tensor of shape (B, S, Tv, C, H, W).

	        Returns:
	            The output of ``self.vfeat_extractor``; per the original author's
	            note this is (B, S, tv, D), e.g. (B, 7, 8, 768).
	        """
	        B, S, Tv, C, H, W = vis.shape  # unpacking enforces a rank-6 input
	        vis = vis.permute(0, 1, 3, 2, 4, 5)  # (B, S, C, Tv, H, W)
	        vis = self.vfeat_extractor(vis)
	        return vis

	    def extract_afeats(self, aud):
	        """Run the audio feature extractor on a rank-5 spectrogram tensor.

	        Args:
	            aud: tensor of shape (B, S, 1, Fa, Ta); the third axis must have
	                size 1 for the ``view`` below to succeed.

	        Returns:
	            The first element of the tuple returned by ``self.afeat_extractor``;
	            per the original author's note this is (B, S, ta, D), e.g. (B, 7, 6, 768).
	        """
	        B, S, _, Fa, Ta = aud.shape
	        aud = aud.view(B, S, Fa, Ta).permute(0, 1, 3, 2)  # (B, S, Ta, Fa)
	        aud, _ = self.afeat_extractor(aud)
	        return aud

	    def compute_loss(self, logits, targets, loss_fn: str = None):
	        """Return the loss for ``logits`` against ``targets``.

	        Args:
	            logits: tensor of shape (B, cls).
	            targets: tensor of shape (B,), or ``None`` to skip the loss.
	            loss_fn: ``None`` or ``"cross_entropy"``.

	        Returns:
	            The cross-entropy loss, or ``None`` when ``targets`` is ``None``.

	        Raises:
	            NotImplementedError: for any other ``loss_fn`` when targets are given.
	        """
	        if targets is None:
	            return None
	        if loss_fn is None or loss_fn == "cross_entropy":
	            return F.cross_entropy(logits, targets)
	        raise NotImplementedError(f"Loss {loss_fn} not implemented")

	    def load_state_dict(self, sd: Mapping[str, Any], strict: bool = True, **kwargs):
	        """Load ``sd`` through ``nn.Module.load_state_dict``.

	        Extra keyword arguments are forwarded unchanged so that options the
	        base class accepts (such as ``assign`` in recent PyTorch releases)
	        keep working through this override.
	        """
	        return super().load_state_dict(sd, strict, **kwargs)


	class RandInitPositionalEncoding(nn.Module):
	    """Trainable positional embedding drawn from a standard normal at construction.

	    The embedding has shape ``(1, *block_shape, n_embd)`` and is simply added
	    to the incoming token embeddings.
	    """

	    def __init__(self, block_shape: list, n_embd: int):
	        super().__init__()
	        self.block_shape = block_shape
	        self.n_embd = n_embd
	        # leading singleton axis lets the table broadcast over the batch
	        table_shape = (1, *block_shape, n_embd)
	        self.pos_emb = nn.Parameter(torch.randn(table_shape))

	    def forward(self, token_embeddings):
	        """Return ``token_embeddings`` with the positional table added."""
	        with_positions = token_embeddings + self.pos_emb
	        return with_positions


	class GlobalTransformer(torch.nn.Module):
	    """Same as in SparseSync but without the selector transformers and the head.

	    The forward pass normalises both inputs, builds the sequence
	    ``[OFF, v..., MOD, a...]``, adds a trainable positional embedding, runs a
	    stack of ``Block`` layers and a final LayerNorm, and optionally applies a
	    linear head to the first (OFF) token.
	    """

	    def __init__(
	        self,
	        tok_pdrop=0.0,
	        embd_pdrop=0.1,
	        resid_pdrop=0.1,
	        attn_pdrop=0.1,
	        n_layer=3,
	        n_head=8,
	        n_embd=768,
	        pos_emb_block_shape=(198,),
	        n_off_head_out=21,
	    ) -> None:
	        """Build the transformer.

	        Args:
	            tok_pdrop: probability used by the per-modality ``Dropout1d`` layers.
	            embd_pdrop: dropout probability applied to the assembled sequence.
	            resid_pdrop: forwarded to every ``Block`` through the config.
	            attn_pdrop: forwarded to every ``Block`` through the config.
	            n_layer: number of ``Block`` layers.
	            n_head: number of attention heads per block.
	            n_embd: embedding width.
	            pos_emb_block_shape: shape of the positional table, excluding the
	                leading batch axis and the trailing embedding axis.
	            n_off_head_out: output width of the head applied to the OFF token.
	        """
	        super().__init__()
	        self.config = Config(
	            embd_pdrop=embd_pdrop,
	            resid_pdrop=resid_pdrop,
	            attn_pdrop=attn_pdrop,
	            n_layer=n_layer,
	            n_head=n_head,
	            n_embd=n_embd,
	        )
	        # input norm
	        self.vis_in_lnorm = torch.nn.LayerNorm(n_embd)
	        self.aud_in_lnorm = torch.nn.LayerNorm(n_embd)
	        # aux tokens
	        self.OFF_tok = torch.nn.Parameter(torch.randn(1, 1, n_embd))
	        self.MOD_tok = torch.nn.Parameter(torch.randn(1, 1, n_embd))
	        # whole token dropout
	        self.tok_pdrop = tok_pdrop
	        self.tok_drop_vis = torch.nn.Dropout1d(tok_pdrop)
	        self.tok_drop_aud = torch.nn.Dropout1d(tok_pdrop)
	        # positional embedding; a fresh list is passed so that the stored
	        # block_shape is never shared between instances
	        self.pos_emb_cfg = RandInitPositionalEncoding(
	            block_shape=list(pos_emb_block_shape),
	            n_embd=n_embd,
	        )
	        # the stem
	        self.drop = torch.nn.Dropout(embd_pdrop)
	        self.blocks = torch.nn.Sequential(*[Block(self.config) for _ in range(n_layer)])
	        # pre-output norm
	        self.ln_f = torch.nn.LayerNorm(n_embd)
	        # head applied to the OFF token
	        self.off_head = torch.nn.Linear(in_features=n_embd, out_features=n_off_head_out)

	    def forward(self, v: torch.Tensor, a: torch.Tensor, targets=None, attempt_to_apply_heads=True):
	        """Process a visual and an audio sequence jointly.

	        Args:
	            v: visual tokens of shape (B, Sv, D).
	            a: audio tokens of shape (B, Sa, D).
	            targets: accepted for interface compatibility; not used here.
	            attempt_to_apply_heads: when true and ``off_head`` exists, return
	                the head output for the OFF token instead of the full sequence.

	        Returns:
	            ``off_head`` applied to the first token, shape (B, n_off_head_out),
	            or the full normalised sequence of shape (B, 1+Sv+1+Sa, D).
	        """
	        B, Sv, D = v.shape
	        B, Sa, D = a.shape
	        # broadcast the special tokens over the batch (no copy until the cat below)
	        off_tok = self.OFF_tok.expand(B, -1, -1)
	        mod_tok = self.MOD_tok.expand(B, -1, -1)
	        # norm
	        v, a = self.vis_in_lnorm(v), self.aud_in_lnorm(a)
	        # whole token dropout, only when enabled
	        if self.tok_pdrop > 0:
	            v, a = self.tok_drop_vis(v), self.tok_drop_aud(a)
	        # (B, 1+Sv+1+Sa, D)
	        x = torch.cat((off_tok, v, mod_tok, a), dim=1)
	        # positional embedding, if the module is present
	        if hasattr(self, "pos_emb_cfg"):
	            x = self.pos_emb_cfg(x)
	        # dropout -> stem -> norm
	        x = self.drop(x)
	        x = self.blocks(x)
	        x = self.ln_f(x)
	        # head on the OFF token, if requested and present
	        if attempt_to_apply_heads and hasattr(self, "off_head"):
	            x = self.off_head(x[:, 0, :])
	        return x


	class SelfAttention(nn.Module):
	    """Explicit multi-head self-attention with a final output projection.

	    No attention mask is applied in ``forward``: every position attends to
	    every other position. Dropout is applied to the attention weights and to
	    the projected output.
	    """

	    def __init__(self, config):
	        super().__init__()
	        assert config.n_embd % config.n_head == 0
	        width = config.n_embd
	        # key, query and value projections covering all heads at once
	        self.key = nn.Linear(width, width)
	        self.query = nn.Linear(width, width)
	        self.value = nn.Linear(width, width)
	        # regularisation
	        self.attn_drop = nn.Dropout(config.attn_pdrop)
	        self.resid_drop = nn.Dropout(config.resid_pdrop)
	        # output projection
	        self.proj = nn.Linear(width, width)
	        self.n_head = config.n_head

	    def forward(self, x):
	        """Attend over ``x`` of shape (B, T, C) and return a tensor of the same shape."""
	        batch, seq_len, width = x.size()
	        head_dim = width // self.n_head

	        def split_heads(projected):
	            # (B, T, C) -> (B, nh, T, hs): heads become a batch-like axis
	            return projected.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

	        k = split_heads(self.key(x))
	        q = split_heads(self.query(x))
	        v = split_heads(self.value(x))

	        # scaled dot-product scores: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
	        scale = 1.0 / math.sqrt(head_dim)
	        scores = (q @ k.transpose(-2, -1)) * scale
	        weights = F.softmax(scores, dim=-1)
	        # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
	        per_head = self.attn_drop(weights) @ v
	        # put the heads back side by side along the channel axis
	        merged = per_head.transpose(1, 2).contiguous().view(batch, seq_len, width)

	        return self.resid_drop(self.proj(merged))


	class Block(nn.Module):
	    """Pre-norm Transformer block: self-attention then an MLP, each with a residual."""

	    def __init__(self, config):
	        super().__init__()
	        width = config.n_embd
	        hidden = 4 * width  # the MLP expands to four times the embedding width
	        self.ln1 = nn.LayerNorm(width)
	        self.ln2 = nn.LayerNorm(width)
	        self.attn = SelfAttention(config)
	        self.mlp = nn.Sequential(
	            nn.Linear(width, hidden),
	            nn.GELU(),
	            nn.Linear(hidden, width),
	            nn.Dropout(config.resid_pdrop),
	        )

	    def forward(self, x):
	        """Apply the attention and MLP sub-layers to ``x`` and return the result."""
	        attended = self.attn(self.ln1(x))
	        x = x + attended
	        transformed = self.mlp(self.ln2(x))
	        return x + transformed


	def make_class_grid(
	    leftmost_val,
	    rightmost_val,
	    grid_size,
	    add_extreme_offset: bool = False,
	    seg_size_vframes: int = None,
	    nseg: int = None,
	    step_size_seg: float = None,
	    vfps: float = None,
	):
	    """Build a float32 grid of evenly spaced class values.

	    Args:
	        leftmost_val: first grid value.
	        rightmost_val: last grid value.
	        grid_size: number of evenly spaced values; must be at least 3.
	        add_extreme_offset: when true, append one extra value computed from
	            the four arguments below.
	        seg_size_vframes: segment size in video frames.
	        nseg: number of segments.
	        step_size_seg: step between segments, as a fraction of a segment.
	        vfps: video frames per second, used to convert frames to seconds.

	    Returns:
	        A 1-D float32 tensor with ``grid_size`` values, plus one more when
	        ``add_extreme_offset`` is true.

	    Raises:
	        AssertionError: if ``grid_size < 3``, or if ``add_extreme_offset`` is
	            true and any of the four segment arguments is missing or zero.
	    """
	    assert grid_size >= 3, f"grid_size: {grid_size} doesnot make sense. If =2 -> (-1,1); =1 -> (-1); =0 -> ()"
	    grid = torch.from_numpy(np.linspace(leftmost_val, rightmost_val, grid_size)).float()
	    if add_extreme_offset:
	        # vfps is a divisor below, so it has to be validated with the rest
	        assert all(
	            [seg_size_vframes, nseg, step_size_seg, vfps]
	        ), f"{seg_size_vframes} {nseg} {step_size_seg} {vfps}"
	        seg_size_sec = seg_size_vframes / vfps
	        trim_size_in_seg = nseg - (1 - step_size_seg) * (nseg - 1)
	        extreme_value = trim_size_in_seg * seg_size_sec
	        grid = torch.cat([grid, torch.tensor([extreme_value])])  # adding extreme offset to the class grid
	    return grid


	# from synchformer
	def pad_or_truncate(audio: torch.Tensor, max_spec_t: int, pad_mode: str = "constant", pad_value: float = 0.0):
	    """Force the last axis of ``audio`` to length ``max_spec_t``.

	    Shorter inputs are padded on the right using ``pad_mode`` / ``pad_value``;
	    longer inputs are cut to the first ``max_spec_t`` entries and a message is
	    printed. Inputs that already have the right length are returned as is.
	    Only the last axis is touched, so leading batch axes are allowed.
	    """
	    current_len = audio.shape[-1]
	    missing = max_spec_t - current_len
	    if missing > 0:
	        # (left, right) padding for the last axis only
	        return torch.nn.functional.pad(audio, (0, missing), pad_mode, pad_value)
	    if missing < 0:
	        print(f"Truncating spec ({audio.shape}) to max_spec_t ({max_spec_t}).")
	        return audio[..., :max_spec_t]
	    return audio


	def encode_audio_with_sync(
	    synchformer: Synchformer, x: torch.Tensor, mel: torchaudio.transforms.MelSpectrogram
	) -> torch.Tensor:
	    """Turn a batch of waveforms into Synchformer audio features.

	    The waveform is cut into half-overlapping segments of 10240 samples, each
	    segment is converted to a normalised log-mel spectrogram padded or
	    truncated to 66 frames, and the result is passed to
	    ``synchformer.extract_afeats``.

	    Args:
	        synchformer: model providing ``extract_afeats``.
	        x: waveform tensor of shape (B, T).
	        mel: callable mapping waveforms to mel spectrograms.

	    Returns:
	        The output of ``synchformer.extract_afeats``.

	    Raises:
	        ValueError: if ``T`` is shorter than one segment, in which case no
	            segment can be formed.
	    """
	    _, t = x.shape

	    # partition the audio into half-overlapping segments
	    segment_size = 10240
	    step_size = segment_size // 2
	    if t < segment_size:
	        raise ValueError(f"Audio has {t} samples but at least {segment_size} are needed for one segment")
	    num_segments = (t - segment_size) // step_size + 1
	    segments = [x[:, i * step_size : i * step_size + segment_size] for i in range(num_segments)]
	    x = torch.stack(segments, dim=1)  # (B, S, segment_size)

	    x = mel(x)
	    x = torch.log(x + 1e-6)  # the small offset keeps log finite on silent bins
	    x = pad_or_truncate(x, 66)

	    # fixed normalisation statistics for the log-mel values
	    mean = -4.2677393
	    std = 4.5689974
	    x = (x - mean) / (2 * std)
	    # per the original note x is B * S * 128 * 66 here; extract_afeats wants a
	    # singleton axis at position 2
	    x = synchformer.extract_afeats(x.unsqueeze(2))
	    return x


	def read_audio(filename, expected_length=int(16000 * 4)):
	    """Load an audio file as a mono 16 kHz waveform of a fixed length.

	    Args:
	        filename: path handed to ``torchaudio.load``.
	        expected_length: number of samples to keep (default: 4 s at 16 kHz).

	    Returns:
	        The first ``expected_length`` samples of the channel-averaged,
	        resampled waveform.

	    Raises:
	        ValueError: if the waveform has fewer than ``expected_length`` samples
	            after resampling.
	    """
	    waveform, sr = torchaudio.load(filename)
	    waveform = waveform.mean(dim=0)  # average the channels into one

	    if sr != 16000:
	        resampler = torchaudio.transforms.Resample(sr, 16000)
	        # the transform is a callable module; it must be called, not indexed
	        waveform = resampler(waveform)

	    waveform = waveform[:expected_length]
	    if waveform.shape[0] != expected_length:
	        raise ValueError(f"Audio {filename} is too short")

	    waveform = waveform.squeeze()

	    return waveform


	if __name__ == "__main__":
	    # os is needed for the environment lookup below and is not imported at
	    # the top of this module
	    import os

	    synchformer = Synchformer().cuda().eval()

	    # mmaudio provided synchformer ckpt; the path can be overridden through
	    # the SYNCHFORMER_WEIGHTS environment variable
	    synchformer.load_state_dict(
	        torch.load(
	            os.environ.get("SYNCHFORMER_WEIGHTS", "weights/synchformer.pth"),
	            weights_only=True,
	            map_location="cpu",
	        )
	    )

	    # mel front end for 16 kHz audio: 25 ms window, 10 ms hop, 128 mel bins
	    sync_mel_spectrogram = torchaudio.transforms.MelSpectrogram(
	        sample_rate=16000,
	        win_length=400,
	        hop_length=160,
	        n_fft=1024,
	        n_mels=128,
	    )