from typing import Union

import torch
import torch.nn as nn
from einops import rearrange
from torch import Tensor

# Ref: https://github.com/facebookresearch/DiT
# Ref: https://github.com/black-forest-labs/flux/blob/main/src/flux/math.py
# Ref: https://github.com/lucidrains/rotary-embedding-torch

def compute_rope_rotations(length: int,
                           dim: int,
                           theta: int,
                           *,
                           freq_scaling: float = 1.0,
                           device: Union[torch.device, str] = 'cpu') -> Tensor:
    """Precompute RoPE rotations: one 2x2 rotation matrix per position per
    frequency pair, returned as a (1, length, dim // 2, 2, 2) tensor."""
    assert dim % 2 == 0

    # Compute in fp32 even under autocast, for numerical stability.
    with torch.amp.autocast(device_type='cuda', enabled=False):
        pos = torch.arange(length, dtype=torch.float32, device=device)
        freqs = 1.0 / (theta**(torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim))
        freqs *= freq_scaling

        # Outer product: one rotation angle per (position, frequency) pair.
        rot = torch.einsum('..., f -> ... f', pos, freqs)
        # Pack [cos, -sin, sin, cos] into explicit 2x2 rotation matrices.
        rot = torch.stack([torch.cos(rot), -torch.sin(rot), torch.sin(rot), torch.cos(rot)], dim=-1)
        rot = rearrange(rot, 'n d (i j) -> 1 n d i j', i=2, j=2)
        return rot
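
# Illustrative usage (a sketch: the length/dim/theta values below are
# assumptions, not taken from ThinkSound's configs). The rotations are
# typically precomputed once and reused across layers.
def _example_precompute_rope() -> Tensor:
    rot = compute_rope_rotations(length=1024, dim=64, theta=10000)
    # One 2x2 rotation matrix per position per frequency pair.
    assert rot.shape == (1, 1024, 32, 2, 2)
    return rot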

def apply_rope(x: Tensor, rot: Tensor) -> Tensor:
    with torch.amp.autocast(device_type='cuda', enabled=False):
        _x = x.float()
        # View the last dimension as (dim // 2) two-component pairs.
        _x = _x.view(*_x.shape[:-1], -1, 1, 2)
        # Rotate each pair by its 2x2 matrix: out_i = M[i, 0]*x0 + M[i, 1]*x1.
        x_out = rot[..., 0] * _x[..., 0] + rot[..., 1] * _x[..., 1]
        return x_out.reshape(*x.shape).to(dtype=x.dtype)
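
# Illustrative pairing of the two functions, as in a typical attention block
# (a sketch: the (batch, heads, length, head_dim) layout is an assumption
# chosen so that `rot` broadcasts over batch and heads, not a claim about
# ThinkSound's attention code).
def _example_apply_rope_to_qk() -> tuple[Tensor, Tensor]:
    b, h, n, d = 2, 8, 16, 64
    q = torch.randn(b, h, n, d)
    k = torch.randn(b, h, n, d)
    rot = compute_rope_rotations(n, d, theta=10000)  # (1, n, d // 2, 2, 2)
    return apply_rope(q, rot), apply_rope(k, rot)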

class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.
    """

    def __init__(self, dim, frequency_embedding_size, max_period):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, dim),
            nn.SiLU(),
            nn.Linear(dim, dim),
        )
        self.dim = dim
        self.max_period = max_period
        assert dim % 2 == 0, 'dim must be even.'

        with torch.autocast('cuda', enabled=False):
            self.register_buffer(
                "freqs",
                1.0 / (10000**(torch.arange(0, frequency_embedding_size, 2, dtype=torch.float32) /
                               frequency_embedding_size)),
                persistent=False)
            # Rescale so the lowest frequency corresponds to `max_period`.
            freq_scale = 10000 / max_period
            self.freqs = freq_scale * self.freqs

    def timestep_embedding(self, t):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :return: an (N, D) Tensor of positional embeddings, where D is
                 `frequency_embedding_size`; `max_period` (set in `__init__`)
                 controls the minimum frequency of the embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        args = t[:, None].float() * self.freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        return embedding

    def forward(self, t):
        t_freq = self.timestep_embedding(t).to(t.dtype)
        t_emb = self.mlp(t_freq)
        return t_emb
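
# Minimal smoke test (a sketch: dim, frequency_embedding_size, and max_period
# below are illustrative assumptions, not ThinkSound's hyperparameters).
if __name__ == '__main__':
    embedder = TimestepEmbedder(dim=256, frequency_embedding_size=256, max_period=10000)
    t = torch.rand(4)  # four fractional timesteps in [0, 1)
    print(embedder(t).shape)  # torch.Size([4, 256])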