tencent
/

SongGeneration

Model card Files Files and versions

SongGeneration / third_party /stable_audio_tools /stable_audio_tools /data /utils.py

waytan22's picture

Upload folder using huggingface_hub

e730386 verified 2 months ago

3.24 kB

	import math
	import random
	import torch

	from torch import nn
	from typing import Tuple

	class PadCrop(nn.Module):
	def __init__(self, n_samples, randomize=True):
	super().__init__()
	self.n_samples = n_samples
	self.randomize = randomize

	def __call__(self, signal):
	n, s = signal.shape
	start = 0 if (not self.randomize) else torch.randint(0, max(0, s - self.n_samples) + 1, []).item()
	end = start + self.n_samples
	output = signal.new_zeros([n, self.n_samples])
	output[:, :min(s, self.n_samples)] = signal[:, start:end]
	return output

	class PadCrop_Normalized_T(nn.Module):

	def __init__(self, n_samples: int, sample_rate: int, randomize: bool = True):

	super().__init__()

	self.n_samples = n_samples
	self.sample_rate = sample_rate
	self.randomize = randomize

	def __call__(self, source: torch.Tensor) -> Tuple[torch.Tensor, float, float, int, int]:

	n_channels, n_samples = source.shape

	# If the audio is shorter than the desired length, pad it
	upper_bound = max(0, n_samples - self.n_samples)

	# If randomize is False, always start at the beginning of the audio
	offset = 0
	if(self.randomize and n_samples > self.n_samples):
	offset = random.randint(0, upper_bound)

	# Calculate the start and end times of the chunk
	t_start = offset / (upper_bound + self.n_samples)
	t_end = (offset + self.n_samples) / (upper_bound + self.n_samples)

	# Create the chunk
	chunk = source.new_zeros([n_channels, self.n_samples])

	# Copy the audio into the chunk
	chunk[:, :min(n_samples, self.n_samples)] = source[:, offset:offset + self.n_samples]

	# Calculate the start and end times of the chunk in seconds
	seconds_start = math.floor(offset / self.sample_rate)
	seconds_total = math.ceil(n_samples / self.sample_rate)

	# Create a mask the same length as the chunk with 1s where the audio is and 0s where it isn't
	padding_mask = torch.zeros([self.n_samples])
	padding_mask[:min(n_samples, self.n_samples)] = 1


	return (
	chunk,
	t_start,
	t_end,
	seconds_start,
	seconds_total,
	padding_mask
	)

	class PhaseFlipper(nn.Module):
	"Randomly invert the phase of a signal"
	def __init__(self, p=0.5):
	super().__init__()
	self.p = p
	def __call__(self, signal):
	return -signal if (random.random() < self.p) else signal

	class Mono(nn.Module):
	def __call__(self, signal):
	return torch.mean(signal, dim=0, keepdims=True) if len(signal.shape) > 1 else signal

	class Stereo(nn.Module):
	def __call__(self, signal):
	signal_shape = signal.shape
	# Check if it's mono
	if len(signal_shape) == 1: # s -> 2, s
	signal = signal.unsqueeze(0).repeat(2, 1)
	elif len(signal_shape) == 2:
	if signal_shape[0] == 1: #1, s -> 2, s
	signal = signal.repeat(2, 1)
	elif signal_shape[0] > 2: #?, s -> 2,s
	signal = signal[:2, :]

	return signal