DenseAV-Lowell

Running

App Files Files Community

DenseAV-Lowell / DenseAV /denseav /featurizers /AudioMAE.py

lorocksUMD

Upload 32 files

e6d4b46 verified 2 months ago

raw

history blame contribute delete

21.2 kB

	import math
	import os
	import warnings
	from functools import partial

	import numpy as np
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torchaudio
	from timm.models.layers import to_2tuple
	from torch.utils.data import Dataset
	from torchaudio.functional import resample
	import pickle


	def _no_grad_trunc_normal_(tensor, mean, std, a, b):
	# Cut & paste from PyTorch official master until it's in a few official releases - RW
	# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
	def norm_cdf(x):
	# Computes standard normal cumulative distribution function
	return (1. + math.erf(x / math.sqrt(2.))) / 2.

	if (mean < a - 2 * std) or (mean > b + 2 * std):
	warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
	"The distribution of values may be incorrect.",
	stacklevel=2)

	with torch.no_grad():
	# Values are generated by using a truncated uniform distribution and
	# then using the inverse CDF for the normal distribution.
	# Get upper and lower cdf values
	l = norm_cdf((a - mean) / std)
	u = norm_cdf((b - mean) / std)

	# Uniformly fill tensor with values from [l, u], then translate to
	# [2l-1, 2u-1].
	tensor.uniform_(2 * l - 1, 2 * u - 1)

	# Use inverse cdf transform for normal distribution to get truncated
	# standard normal
	tensor.erfinv_()

	# Transform to proper mean, std
	tensor.mul_(std * math.sqrt(2.))
	tensor.add_(mean)

	# Clamp to ensure it's in the proper range
	tensor.clamp_(min=a, max=b)
	return tensor


	def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
	    # type: (Tensor, float, float, float, float) -> Tensor
	    r"""Initialise ``tensor`` in place from a truncated normal distribution.

	    Values behave as if drawn from
	    :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with anything outside
	    :math:`[a, b]` redrawn until it falls inside the bounds. The sampling
	    method works best when :math:`a \leq \text{mean} \leq b`.

	    Args:
	        tensor: an n-dimensional `torch.Tensor`, modified in place
	        mean: the mean of the normal distribution
	        std: the standard deviation of the normal distribution
	        a: the minimum cutoff value
	        b: the maximum cutoff value

	    Returns:
	        The tensor that was passed in.

	    Examples:
	        >>> w = torch.empty(3, 5)
	        >>> nn.init.trunc_normal_(w)
	    """
	    filled = _no_grad_trunc_normal_(tensor, mean, std, a, b)
	    return filled


	class Mlp(nn.Module):
	    """Two-layer feed-forward network: linear -> activation -> dropout -> linear -> dropout.

	    ``hidden_features`` and ``out_features`` fall back to ``in_features`` when
	    they are left unset (or falsy). A single dropout module is shared by both
	    dropout positions.
	    """

	    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
	        super().__init__()
	        hidden_width = hidden_features or in_features
	        output_width = out_features or in_features
	        self.fc1 = nn.Linear(in_features, hidden_width)
	        self.act = act_layer()
	        self.fc2 = nn.Linear(hidden_width, output_width)
	        self.drop = nn.Dropout(drop)

	    def forward(self, x):
	        hidden = self.drop(self.act(self.fc1(x)))
	        return self.drop(self.fc2(hidden))


	class Attention(nn.Module):
	    """Multi-head self-attention over a 3-D ``(B, N, C)`` input.

	    Queries, keys and values come from one fused linear layer; the attention
	    logits are multiplied by ``qk_scale`` when given, otherwise by
	    ``head_dim ** -0.5``.
	    """

	    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
	        super().__init__()
	        self.num_heads = num_heads
	        per_head = dim // num_heads
	        # NOTE: an explicit qk_scale lets callers stay compatible with weights
	        # trained under a different scaling.
	        self.scale = qk_scale or per_head ** -0.5

	        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
	        self.attn_drop = nn.Dropout(attn_drop)
	        self.proj = nn.Linear(dim, dim)
	        self.proj_drop = nn.Dropout(proj_drop)

	    def forward(self, x):
	        B, N, C = x.shape
	        heads = self.num_heads

	        # (B, N, 3*C) -> (3, B, heads, N, C // heads), then split along dim 0.
	        fused = self.qkv(x).reshape(B, N, 3, heads, C // heads)
	        q, k, v = fused.permute(2, 0, 3, 1, 4).unbind(0)

	        scores = (q @ k.transpose(-2, -1)) * self.scale
	        weights = self.attn_drop(scores.softmax(dim=-1))

	        # Merge the heads back into the channel dimension.
	        merged = (weights @ v).transpose(1, 2).reshape(B, N, C)
	        return self.proj_drop(self.proj(merged))


	def drop_path(x, drop_prob: float = 0., training: bool = False):
	    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

	    Each sample along dim 0 is either zeroed entirely or kept and rescaled by
	    ``1 / (1 - drop_prob)``.

	    This is the same as the DropConnect impl created for EfficientNet, etc networks, however,
	    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
	    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... hence the
	    'drop path' naming for the layer and argument instead of mixing DropConnect with 'survival rate'.

	    Args:
	        x: input tensor; dim 0 is treated as the sample dimension.
	        drop_prob: probability of dropping a sample's path. ``None`` is
	            treated like ``0.``.
	        training: the input is returned untouched unless this is true.

	    Returns:
	        ``x`` itself when nothing is dropped, otherwise a new tensor with the
	        same shape as ``x``.
	    """
	    if drop_prob is None or drop_prob == 0. or not training:
	        return x
	    keep_prob = 1 - drop_prob
	    if keep_prob <= 0.:
	        # Every path is dropped. Dividing by keep_prob == 0 would turn the
	        # output into NaN (inf * 0), so return zeros directly.
	        return torch.zeros_like(x)
	    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
	    # keep_prob + U[0, 1) floors to 1 with probability keep_prob, else to 0.
	    keep_mask = (keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)).floor_()
	    return x.div(keep_prob) * keep_mask


	class DropPath(nn.Module):
	    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

	    Args:
	        drop_prob: probability of dropping a sample's path. ``None`` (the
	            default) disables dropping.
	    """

	    def __init__(self, drop_prob=None):
	        super(DropPath, self).__init__()
	        self.drop_prob = drop_prob

	    def forward(self, x):
	        # The constructor default is None; passing it straight through would
	        # reach `1 - None` in training mode, so treat it as "no dropping".
	        rate = 0. if self.drop_prob is None else self.drop_prob
	        return drop_path(x, rate, self.training)


	class Block(nn.Module):
	    """Transformer block: pre-norm attention and pre-norm MLP, each added back as a residual.

	    Both residual branches pass through the same ``drop_path`` module, which
	    is an identity unless ``drop_path > 0``.
	    """

	    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
	                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
	        super().__init__()
	        self.norm1 = norm_layer(dim)
	        self.attn = Attention(
	            dim,
	            num_heads=num_heads,
	            qkv_bias=qkv_bias,
	            qk_scale=qk_scale,
	            attn_drop=attn_drop,
	            proj_drop=drop,
	        )
	        # Stochastic depth is only instantiated when its rate is positive.
	        if drop_path > 0.:
	            self.drop_path = DropPath(drop_path)
	        else:
	            self.drop_path = nn.Identity()
	        self.norm2 = norm_layer(dim)
	        self.mlp = Mlp(
	            in_features=dim,
	            hidden_features=int(dim * mlp_ratio),
	            act_layer=act_layer,
	            drop=drop,
	        )

	    def forward(self, x):
	        attn_branch = self.attn(self.norm1(x))
	        x = x + self.drop_path(attn_branch)
	        mlp_branch = self.mlp(self.norm2(x))
	        return x + self.drop_path(mlp_branch)


	class PatchEmbed(nn.Module):
	    """Image to patch embedding using a strided convolution.

	    The convolution uses ``patch_size`` as both kernel and stride, so patches
	    do not overlap. ``patch_hw`` stores the patch counts with the count along
	    ``img_size[1]`` first and the count along ``img_size[0]`` second.
	    """

	    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
	        super().__init__()
	        img_size = to_2tuple(img_size)
	        patch_size = to_2tuple(patch_size)

	        count_dim1 = img_size[1] // patch_size[1]
	        count_dim0 = img_size[0] // patch_size[0]

	        self.patch_hw = (count_dim1, count_dim0)
	        self.img_size = img_size
	        self.patch_size = patch_size
	        self.num_patches = count_dim1 * count_dim0

	        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

	    def forward(self, x):
	        # Unpacking doubles as a check that the input is 4-D. The input size is
	        # deliberately not compared against self.img_size.
	        # FIXME look at relaxing size constraints
	        _batch, _chans, _height, _width = x.shape
	        projected = self.proj(x)
	        # (B, embed_dim, h, w) -> (B, h * w, embed_dim)
	        return projected.flatten(2).transpose(1, 2)


	class HybridEmbed(nn.Module):
	    """CNN feature map embedding.

	    Runs ``backbone``, takes the last element of its output, flattens the
	    spatial dimensions and projects the channels to ``embed_dim``.
	    """

	    def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768):
	        super().__init__()
	        assert isinstance(backbone, nn.Module)
	        img_size = to_2tuple(img_size)
	        self.img_size = img_size
	        self.backbone = backbone

	        if feature_size is not None:
	            feature_size = to_2tuple(feature_size)
	            feature_dim = self.backbone.feature_info.channels()[-1]
	        else:
	            with torch.no_grad():
	                # FIXME this is hacky, but most reliable way of determining the exact dim of the
	                # output feature map for all networks: the feature metadata has reliable channel
	                # and stride info, but using stride to calc feature dim requires info about
	                # padding of each stage that isn't captured.
	                was_training = backbone.training
	                if was_training:
	                    backbone.eval()
	                probe = torch.zeros(1, in_chans, img_size[0], img_size[1])
	                last_map = self.backbone(probe)[-1]
	                feature_size = last_map.shape[-2:]
	                feature_dim = last_map.shape[1]
	                # Put the backbone back into whatever mode it was in before.
	                backbone.train(was_training)

	        self.num_patches = feature_size[0] * feature_size[1]
	        self.proj = nn.Linear(feature_dim, embed_dim)

	    def forward(self, x):
	        last_map = self.backbone(x)[-1]
	        tokens = last_map.flatten(2).transpose(1, 2)
	        return self.proj(tokens)


	class TimmVisionTransformer(nn.Module):
	    """Vision Transformer with support for patch or hybrid CNN input stage.

	    ``forward`` returns the classifier head applied to the normalised class
	    token; ``forward_features`` returns that class token without the head.
	    """

	    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
	                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
	                 drop_path_rate=0., hybrid_backbone=None, norm_layer=nn.LayerNorm):
	        super().__init__()
	        self.num_classes = num_classes
	        self.embed_dim = embed_dim
	        self.num_features = embed_dim  # num_features for consistency with other models

	        if hybrid_backbone is None:
	            self.patch_embed = PatchEmbed(
	                img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
	        else:
	            self.patch_embed = HybridEmbed(
	                hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim)
	        num_patches = self.patch_embed.num_patches

	        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
	        # One extra position for the class token.
	        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
	        self.pos_drop = nn.Dropout(p=drop_rate)

	        # Stochastic depth decay rule: rates spaced linearly from 0 to drop_path_rate.
	        path_rates = [rate.item() for rate in torch.linspace(0, drop_path_rate, depth)]
	        layers = []
	        for layer_idx in range(depth):
	            layers.append(Block(
	                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
	                qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate,
	                drop_path=path_rates[layer_idx], norm_layer=norm_layer))
	        self.blocks = nn.ModuleList(layers)
	        self.norm = norm_layer(embed_dim)

	        # NOTE as per official impl, we could have a pre-logits representation dense layer + tanh here
	        # self.repr = nn.Linear(embed_dim, representation_size)
	        # self.repr_act = nn.Tanh()

	        # Classifier head
	        if num_classes > 0:
	            self.head = nn.Linear(embed_dim, num_classes)
	        else:
	            self.head = nn.Identity()

	        trunc_normal_(self.pos_embed, std=.02)
	        trunc_normal_(self.cls_token, std=.02)
	        self.apply(self._init_weights)

	    def _init_weights(self, m):
	        """Initialise one submodule; meant to be passed to ``self.apply``."""
	        if isinstance(m, nn.Linear):
	            trunc_normal_(m.weight, std=.02)
	            if m.bias is not None:
	                nn.init.constant_(m.bias, 0)
	            return
	        if isinstance(m, nn.LayerNorm):
	            nn.init.constant_(m.bias, 0)
	            nn.init.constant_(m.weight, 1.0)

	    @torch.jit.ignore
	    def no_weight_decay(self):
	        """Return the names of the parameters to exclude from weight decay."""
	        return {'pos_embed', 'cls_token'}

	    def get_classifier(self):
	        """Return the classifier head module."""
	        return self.head

	    def reset_classifier(self, num_classes, global_pool=''):
	        """Replace the head with a fresh one for ``num_classes`` (``global_pool`` is unused)."""
	        self.num_classes = num_classes
	        if num_classes > 0:
	            self.head = nn.Linear(self.embed_dim, num_classes)
	        else:
	            self.head = nn.Identity()

	    def forward_features(self, x):
	        """Embed ``x``, run every block, and return the normalised class token."""
	        batch = x.shape[0]
	        tokens = self.patch_embed(x)

	        # stole cls_tokens impl from Phil Wang, thanks
	        cls_tokens = self.cls_token.expand(batch, -1, -1)
	        tokens = torch.cat((cls_tokens, tokens), dim=1)
	        tokens = self.pos_drop(tokens + self.pos_embed)

	        for blk in self.blocks:
	            tokens = blk(tokens)

	        return self.norm(tokens)[:, 0]

	    def forward(self, x):
	        return self.head(self.forward_features(x))


	class VisionTransformer(TimmVisionTransformer):
	    """Vision Transformer variant that returns a patch-token grid plus the class token.

	    Differences from the parent visible in this class: the parent's final
	    ``norm`` is deleted and an ``fc_norm`` layer is created (``forward`` does
	    not apply it), positional embeddings are interpolated to the incoming
	    number of patches, and ``forward`` returns raw block outputs rather than
	    classifier logits.
	    """

	    # Width of the patch grid that the positional table and the returned
	    # feature map are laid out on. Matches 128 // 16 for the AudioMAE
	    # configuration built elsewhere in this file.
	    _GRID_WIDTH = 8

	    def __init__(self, **kwargs):
	        super(VisionTransformer, self).__init__(**kwargs)
	        # Fall back to the parent's defaults so that omitting these keyword
	        # arguments no longer raises KeyError.
	        norm_layer = kwargs.get('norm_layer', nn.LayerNorm)
	        embed_dim = kwargs.get('embed_dim', self.embed_dim)
	        self.fc_norm = norm_layer(embed_dim)
	        del self.norm  # remove the original norm

	    def interpolate_pos_encoding(self, x, embed):
	        """Resize the patch positional embeddings to match the tokens in ``x``.

	        Args:
	            x: patch tokens, indexed here as ``(batch, num_patches, dim)``.
	            embed: positional embeddings for the patch tokens only (no class
	                slot), indexed here as ``(1, old_num_patches, dim)``.

	        Returns:
	            ``embed`` unchanged when the patch counts agree, otherwise a
	            bicubically interpolated tensor of shape ``(1, num_patches, dim)``.
	        """
	        new_patches = x.shape[1]
	        old_patches = embed.shape[1]
	        if new_patches == old_patches:
	            return embed

	        w = self._GRID_WIDTH
	        dim = x.shape[-1]
	        # The grid height is derived from the table itself rather than
	        # hard-coded (512 stored positions still give 64 rows).
	        old_h = old_patches // w
	        new_h = new_patches // w

	        grid = embed.reshape(1, old_h, w, dim).permute(0, 3, 1, 2)
	        resized = nn.functional.interpolate(grid, size=(new_h, w), mode='bicubic')
	        return resized.permute(0, 2, 3, 1).reshape(1, -1, dim)

	    def forward(self, x):
	        """Return ``(patch_map, cls_token)`` for the input ``x``.

	        ``patch_map`` is the patch tokens reshaped to ``(B, -1, 8, dim)`` and
	        permuted to ``(B, dim, 8, -1)``; ``cls_token`` is ``(B, dim)``.
	        """
	        B = x.shape[0]
	        x = self.patch_embed(x)

	        # Slot 0 of pos_embed belongs to the class token; the rest to patches.
	        x = x + self.interpolate_pos_encoding(x, self.pos_embed[:, 1:, :])

	        cls_token = self.cls_token + self.pos_embed[:, :1, :]
	        cls_tokens = cls_token.expand(B, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
	        x = torch.cat((cls_tokens, x), dim=1)
	        x = self.pos_drop(x)

	        for blk in self.blocks:
	            x = blk(x)

	        # x = x[:, 1:, :].mean(dim=1) # global pool without cls token
	        # outcome = self.fc_norm(x)

	        # Use the actual channel count instead of a hard-coded 768.
	        dim = x.shape[-1]
	        patch_map = x[:, 1:, :].reshape(B, -1, self._GRID_WIDTH, dim).permute(0, 3, 2, 1)
	        return patch_map, x[:, 0]


	class NewPatchEmbed(nn.Module):
	    """Flexible image to patch embedding.

	    Unlike a plain patch embedding, the convolution stride is independent of
	    the kernel size, so patches overlap when ``stride < patch_size``. The
	    resulting grid size is measured by running the convolution once on a
	    dummy input.
	    """

	    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, stride=10):
	        super().__init__()
	        img_size = to_2tuple(img_size)
	        patch_size = to_2tuple(patch_size)
	        stride = to_2tuple(stride)
	        self.img_size = img_size
	        self.patch_size = patch_size
	        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride)  # with overlapped patches
	        _, _, h, w = self.get_output_shape(img_size)  # n, emb_dim, h, w
	        self.patch_hw = (h, w)
	        self.num_patches = h * w

	    def get_output_shape(self, img_size):
	        """Return the shape ``self.proj`` produces for one input of ``img_size``."""
	        # The probe must have as many channels as the convolution expects; a
	        # fixed single channel made every in_chans != 1 (including the
	        # default of 3) fail inside the constructor.
	        weight = self.proj.weight
	        with torch.no_grad():
	            probe = torch.randn(1, self.proj.in_channels, img_size[0], img_size[1],
	                                dtype=weight.dtype, device=weight.device)
	            return self.proj(probe).shape

	    def forward(self, x):
	        x = self.proj(x)
	        # (B, embed_dim, h, w) -> (B, h * w, embed_dim)
	        x = x.flatten(2).transpose(1, 2)
	        return x


	def pca(image_feats_list, dim=3, fit_pca=None):
	    """Project feature maps onto ``dim`` principal components, scaled to [0, 1].

	    Args:
	        image_feats_list: non-empty list of 4-D feature tensors, unpacked here
	            as ``(B, C, H, W)``.
	        dim: number of principal components to keep.
	        fit_pca: an already fitted PCA object to reuse. When ``None``, a new
	            one is fitted on all features in ``image_feats_list``.

	    Returns:
	        ``(reduced_feats, fit_pca)`` where ``reduced_feats`` holds one
	        ``(B, dim, H, W)`` tensor per input, on the device of the first input,
	        with every component min-max normalised per tensor.
	    """
	    from sklearn.decomposition import PCA

	    device = image_feats_list[0].device

	    def flatten(tensor, target_size=None):
	        # Optionally resize, then turn (B, C, H, W) into (B * H * W, C) rows on the CPU.
	        if target_size is not None:
	            # The resized tensor has to be kept: the interpolation result
	            # used to be discarded, which made this branch a no-op.
	            tensor = F.interpolate(tensor, (target_size, target_size), mode="bilinear")
	        B, C, H, W = tensor.shape
	        # Operate on the argument, not on a loop variable of the enclosing scope.
	        return tensor.permute(1, 0, 2, 3).reshape(C, B * H * W).permute(1, 0).detach().cpu()

	    if fit_pca is None:
	        # With several inputs, bring them to a common square size (taken from
	        # the first one) for fitting only.
	        target_size = image_feats_list[0].shape[2] if len(image_feats_list) > 1 else None
	        x = torch.cat([flatten(feats, target_size) for feats in image_feats_list], dim=0)
	        fit_pca = PCA(n_components=dim, svd_solver="arpack").fit(np.nan_to_num(x.detach().numpy()))

	    reduced_feats = []
	    for feats in image_feats_list:
	        # Transform at native resolution so the output keeps each input's H and W.
	        x_red = torch.from_numpy(fit_pca.transform(flatten(feats)))
	        x_red -= x_red.min(dim=0, keepdim=True).values
	        x_red /= x_red.max(dim=0, keepdim=True).values
	        B, C, H, W = feats.shape
	        reduced_feats.append(x_red.reshape(B, H, W, dim).permute(0, 3, 1, 2).to(device))

	    return reduced_feats, fit_pca


	class AudiosetDataset(Dataset):
	    """Single-item demo dataset that turns one fixed wav file into a normalised filterbank.

	    The dataset always reports a length of 1, and ``__getitem__`` ignores its
	    index and loads ``../../samples/example.wav``.
	    """

	    def __init__(self, audio_conf):
	        self.audio_conf = audio_conf
	        conf = self.audio_conf
	        self.melbins = conf.get('num_mel_bins')
	        self.dataset = conf.get('dataset')
	        self.norm_mean = conf.get('mean')
	        self.norm_std = conf.get('std')

	        summary = 'Dataset: {}, mean {:.3f} and std {:.3f}'.format(self.dataset, self.norm_mean, self.norm_std)
	        print(summary)
	        print(f'size of dataset {self.__len__()}')

	    def _wav2fbank(self, filename):
	        """Load ``filename`` and return its normalised Kaldi filterbank features."""
	        sample_rate = 16000
	        target_length = 10
	        wanted_samples = target_length * sample_rate

	        waveform, obs_sr = torchaudio.load(filename)
	        waveform = waveform[0]  # keep only the first channel
	        if obs_sr != sample_rate:
	            waveform = resample(waveform, obs_sr, sample_rate)

	        # Zero-pad at the end or truncate so the clip has exactly wanted_samples samples.
	        padding = wanted_samples - waveform.shape[0]
	        if padding > 0:
	            waveform = torch.nn.ZeroPad2d((0, padding))(waveform)
	        else:
	            waveform = waveform[:wanted_samples]

	        waveform = waveform - waveform.mean()

	        # 498 128, 998, 128
	        fbank = torchaudio.compliance.kaldi.fbank(
	            waveform.unsqueeze(0),
	            htk_compat=True,
	            sample_frequency=sample_rate,
	            use_energy=False,
	            window_type='hanning',
	            num_mel_bins=128,
	            dither=0.0,
	            frame_shift=10)

	        return (fbank - self.norm_mean) / (self.norm_std * 2)

	    def __getitem__(self, index):
	        datum = {"wav": "../../samples/example.wav"}
	        fbank = self._wav2fbank(datum['wav'])
	        # (time, freq) -> (1, freq, time) -> squeeze and transpose back to (time, freq)
	        freq_major = fbank.transpose(0, 1).unsqueeze(0)
	        time_major = torch.transpose(freq_major.squeeze(), 0, 1)
	        # the output fbank shape is [time_frame_num, frequency_bins], e.g., [1024, 128];
	        # a leading dimension of size 1 is added on return
	        return time_major.unsqueeze(0)

	    def __len__(self):
	        return 1


	class AudioMAE(nn.Module):
	    """Wrapper that builds a ViT for spectrogram input and loads AudioMAE weights.

	    Args:
	        output_path: directory whose ``models`` sub-directory holds the checkpoint.
	        finetuned: load ``audiomae_finetuned.pth`` when true, otherwise ``audiomae.pth``.
	    """

	    def __init__(self, output_path, finetuned):
	        super().__init__()
	        emb_dim = 768
	        img_size = (1024, 128)  # 1024, 128

	        # build model
	        model = VisionTransformer(
	            patch_size=16,
	            embed_dim=768,
	            depth=12,
	            num_heads=12,
	            mlp_ratio=4,
	            qkv_bias=True,
	            norm_layer=partial(nn.LayerNorm, eps=1e-6),
	            num_classes=527,
	            drop_path_rate=0.1)

	        # Swap in a single-channel patch embedding and a frozen positional
	        # table sized for its patch grid (plus one slot for the class token).
	        model.patch_embed = NewPatchEmbed(
	            img_size=img_size, patch_size=(16, 16), in_chans=1, embed_dim=emb_dim, stride=16)
	        num_patches = model.patch_embed.num_patches
	        model.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, emb_dim), requires_grad=False)

	        fn = "audiomae_finetuned.pth" if finetuned else "audiomae.pth"
	        checkpoint_file = os.path.join(output_path, 'models', fn)

	        # NOTE: torch.load unpickles the file, so only point output_path at
	        # checkpoints from a trusted source.
	        checkpoint = torch.load(checkpoint_file, map_location='cpu')
	        checkpoint_model = checkpoint['model']

	        # Drop classifier weights whose shape does not match this model's head.
	        state_dict = model.state_dict()
	        for k in ['head.weight', 'head.bias']:
	            if k not in checkpoint_model:
	                continue
	            if checkpoint_model[k].shape != state_dict[k].shape:
	                print(f"Removing key {k} from pretrained checkpoint")
	                del checkpoint_model[k]

	        msg = model.load_state_dict(checkpoint_model, strict=False)
	        print(msg)

	        self.model = model.eval()
	        self.config = dict(output_path=output_path, finetuned=finetuned)

	    def forward(self, audio, include_cls):
	        """Return patch tokens for ``audio``, plus the class token when ``include_cls`` is true."""
	        patch_tokens, cls_token = self.model(audio)
	        return (patch_tokens, cls_token) if include_cls else patch_tokens


	if __name__ == '__main__':
	    # Demo: featurize the example clip and show a 3-component PCA of the patch tokens.

	    # Keep using GPU index 2 when it exists, but fall back to the default
	    # GPU or the CPU instead of crashing on hosts with fewer devices.
	    if torch.cuda.is_available() and torch.cuda.device_count() > 2:
	        device = torch.device("cuda:2")
	    elif torch.cuda.is_available():
	        device = torch.device("cuda")
	    else:
	        device = torch.device("cpu")

	    torch.manual_seed(0)
	    np.random.seed(0)

	    model = AudioMAE("../../", True).to(device)

	    audio_conf_val = {
	        'num_mel_bins': 128,
	        'target_length': 1024,
	        'dataset': "audioset",
	        'mode': 'val',
	        'mean': -4.2677393,
	        'std': 4.5689974,
	    }

	    dataset = AudiosetDataset(audio_conf=audio_conf_val)

	    # dataset[0] already has a leading dimension of size 1; add the batch dimension on top.
	    batch = dataset[0].unsqueeze(0).to(device)

	    # Inference only, so skip building an autograd graph.
	    with torch.no_grad():
	        embeddings = model(batch, include_cls=False)

	    import matplotlib.pyplot as plt

	    with torch.no_grad():
	        [pca_feats], _ = pca([embeddings])
	        plt.imshow(pca_feats.cpu().squeeze(0).permute(1, 2, 0))
	        plt.show()
	        print("here")

	    print("here")