Spaces:

csuhan
/

Tar

Runtime error

App Files Files Community

Tar / model /LLM /onellm.py

csuhan

Upload folder using huggingface_hub

1a84a43 almost 2 years ago

raw

history blame

19.2 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# This software may be used and distributed according to the terms of the GNU General Public License version 3.

	from typing import Optional, Tuple
	from dataclasses import dataclass
	import math
	import functools
	import copy

	import torch
	from torch import nn
	import torch.nn.functional as F

	import fairscale.nn.model_parallel.initialize as fs_init
	from fairscale.nn.model_parallel.layers import (
	ParallelEmbedding,
	RowParallelLinear,
	ColumnParallelLinear,
	)
	from ..components import RMSNorm
	from flash_attn import flash_attn_func

	import open_clip


	# Weight initializer passed as `init_method` to every fairscale
	# Column/RowParallelLinear layer constructed in this file.
	default_linear_init = nn.init.xavier_uniform_


	@dataclass
	class ModelArgs:
	    """Hyper-parameters consumed by the transformer modules in this file."""

	    # Width of the residual stream.
	    dim: int = 512
	    # Number of stacked transformer blocks.
	    n_layers: int = 8
	    # Number of attention heads (each head is dim // n_heads wide).
	    n_heads: int = 8
	    # Placeholder; the real value is filled in later from the tokenizer.
	    vocab_size: int = -1
	    # The SwiGLU hidden width is rounded up to a multiple of this
	    # (a large power of two).
	    multiple_of: int = 256
	    # Epsilon handed to the RMSNorm layers.
	    norm_eps: float = 1e-5

	    # Inference-time limits: the rotary table is sized from max_seq_len and
	    # the KV cache is allocated with max_seq_len positions.
	    max_batch_size: int = 32
	    max_seq_len: int = 2048


	def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
	    """Build the table of complex rotary-embedding phases.

	    Args:
	        dim: Per-head feature width; one frequency is produced per pair of
	            features, i.e. ``dim // 2`` frequencies.
	        end: Number of positions (rows) to tabulate.
	        theta: Base of the geometric frequency progression.

	    Returns:
	        A complex64 tensor of shape ``(end, dim // 2)`` whose entry
	        ``[p, k]`` is the unit-magnitude complex number with angle
	        ``p * theta ** (-2k / dim)``.
	    """
	    n_freqs = dim // 2
	    exponents = torch.arange(0, dim, 2)[:n_freqs].float() / dim
	    inv_freq = 1.0 / (theta ** exponents)
	    positions = torch.arange(end, device=inv_freq.device)  # type: ignore
	    angles = torch.outer(positions, inv_freq).float()  # type: ignore
	    # polar(magnitude=1, angle) -> cos(angle) + i * sin(angle)
	    return torch.polar(torch.ones_like(angles), angles)


	def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
	ndim = x.ndim
	assert 0 <= 1 < ndim
	assert freqs_cis.shape == (x.shape[1], x.shape[-1])
	shape = [d if i == 1 or i == ndim -
	1 else 1 for i, d in enumerate(x.shape)]
	return freqs_cis.view(*shape)


	def apply_rotary_emb(
	    xq: torch.Tensor,
	    xk: torch.Tensor,
	    freqs_cis: torch.Tensor,
	) -> Tuple[torch.Tensor, torch.Tensor]:
	    """Rotate query and key features by the given complex phases.

	    The last axis of each input is read as consecutive (real, imag) pairs,
	    multiplied by ``freqs_cis`` (broadcast via ``reshape_for_broadcast``),
	    and flattened back from axis 3 onward.

	    Returns:
	        The rotated ``(xq, xk)``, each cast back to the dtype of its input.
	    """

	    def _to_complex(t: torch.Tensor) -> torch.Tensor:
	        # Computation is done in float32 regardless of the input dtype.
	        paired = t.float().reshape(*t.shape[:-1], -1, 2)
	        return torch.view_as_complex(paired)

	    q_complex = _to_complex(xq)
	    k_complex = _to_complex(xk)
	    phases = reshape_for_broadcast(freqs_cis, q_complex)
	    q_rotated = torch.view_as_real(q_complex * phases).flatten(3)
	    k_rotated = torch.view_as_real(k_complex * phases).flatten(3)
	    return q_rotated.type_as(xq), k_rotated.type_as(xk)


	class Attention(nn.Module):
	    """Multi-head self-attention with model-parallel projections.

	    The query/key/value projections are column-parallel and the output
	    projection is row-parallel, so each rank holds
	    ``n_heads // model_parallel_world_size`` heads. Attention itself is
	    computed by ``flash_attn_func``. A key/value cache is used whenever one
	    has been allocated through ``allocate_kv_cache``.
	    """

	    def __init__(self, args: ModelArgs):
	        super().__init__()

	        mp_world_size = fs_init.get_model_parallel_world_size()
	        self.n_local_heads = args.n_heads // mp_world_size
	        self.head_dim = args.dim // args.n_heads
	        proj_width = args.n_heads * self.head_dim

	        def _input_projection():
	            # q, k and v share one configuration; outputs stay sharded.
	            return ColumnParallelLinear(
	                args.dim,
	                proj_width,
	                bias=False,
	                gather_output=False,
	                init_method=default_linear_init,
	            )

	        self.wq = _input_projection()
	        self.wk = _input_projection()
	        self.wv = _input_projection()
	        self.wo = RowParallelLinear(
	            proj_width,
	            args.dim,
	            bias=False,
	            input_is_parallel=True,
	            init_method=default_linear_init,
	        )

	        self.flash = True
	        self.k_cache, self.v_cache = None, None

	    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor], prompt=None):
	        """Attend over ``x`` (3-D: batch, sequence, features).

	        Args:
	            x: Input activations.
	            start_pos: Cache offset at which this call's keys/values are
	                written; only used when a KV cache is allocated.
	            freqs_cis: Rotary phases for these positions, or ``None`` to
	                skip rotary embedding.
	            mask: Only its presence matters. Any non-``None`` value turns on
	                causal masking in flash-attention; the values are not read.
	            prompt: Accepted but not used by this implementation.

	        Returns:
	            The output projection applied to the attention result.
	        """
	        bsz, seqlen, _ = x.shape
	        per_head = (bsz, seqlen, self.n_local_heads, self.head_dim)
	        xq = self.wq(x).view(*per_head)
	        xk = self.wk(x).view(*per_head)
	        xv = self.wv(x).view(*per_head)

	        if freqs_cis is not None:
	            xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)

	        has_cache = self.k_cache is not None and self.v_cache is not None
	        if has_cache:
	            # Bring the caches to the device/dtype of the new entries, store
	            # the new slice, then attend over everything cached so far.
	            self.k_cache = self.k_cache.to(xk)
	            self.v_cache = self.v_cache.to(xv)
	            end_pos = start_pos + seqlen
	            self.k_cache[:bsz, start_pos:end_pos, :, :] = xk
	            self.v_cache[:bsz, start_pos:end_pos, :, :] = xv
	            keys = self.k_cache[:bsz, :end_pos]
	            values = self.v_cache[:bsz, :end_pos]
	        else:
	            keys, values = xk, xv

	        use_causal = mask is not None
	        attended = flash_attn_func(
	            xq, keys, values, dropout_p=0.0, causal=use_causal)
	        merged_heads = attended.contiguous().view(bsz, seqlen, -1)

	        return self.wo(merged_heads)

	    def allocate_kv_cache(self, max_batch_size: int, max_seq_len: int) -> None:
	        """Create the key and value caches unless they already have this size."""
	        wanted = (max_batch_size, max_seq_len,
	                  self.n_local_heads, self.head_dim)
	        for attr in ("k_cache", "v_cache"):
	            current = getattr(self, attr)
	            if current is None or current.size() != wanted:
	                # Uninitialised storage; forward() fills it and moves it to
	                # the right device/dtype on first use.
	                setattr(self, attr, torch.empty(wanted))

	    def destroy_kv_cache(self) -> None:
	        """Drop both caches so forward() attends only over its own input."""
	        self.k_cache = None
	        self.v_cache = None


	class FeedForward(nn.Module):
	    """Gated feed-forward layer: ``w2(silu(w1(x)) * w3(x))``.

	    ``w1`` and ``w3`` are column-parallel, ``w2`` is row-parallel.
	    """

	    def __init__(
	        self,
	        dim: int,
	        hidden_dim: int,
	        multiple_of: int,
	    ):
	        """
	        Args:
	            dim: Input and output width.
	            hidden_dim: Nominal hidden width. The width actually used is
	                two thirds of it, rounded up to a multiple of
	                ``multiple_of``.
	            multiple_of: Granularity of the hidden width.
	        """
	        super().__init__()
	        two_thirds = int(2 * hidden_dim / 3)
	        n_chunks = (two_thirds + multiple_of - 1) // multiple_of  # ceil-div
	        hidden_dim = multiple_of * n_chunks

	        self.w1 = ColumnParallelLinear(
	            dim,
	            hidden_dim,
	            bias=False,
	            gather_output=False,
	            init_method=default_linear_init,
	        )
	        self.w2 = RowParallelLinear(
	            hidden_dim,
	            dim,
	            bias=False,
	            input_is_parallel=True,
	            init_method=default_linear_init,
	        )
	        self.w3 = ColumnParallelLinear(
	            dim,
	            hidden_dim,
	            bias=False,
	            gather_output=False,
	            init_method=default_linear_init,
	        )

	    def _silu_gating(self, x, y):
	        """Return ``silu(x)`` multiplied element-wise by ``y``."""
	        gate = F.silu(x)
	        return gate * y

	    def forward(self, x):
	        """Apply the gated feed-forward transform to ``x``."""
	        gate_branch = self.w1(x)
	        value_branch = self.w3(x)
	        return self.w2(self._silu_gating(gate_branch, value_branch))


	class TransformerBlock(nn.Module):
	    """One pre-norm transformer layer: attention then feed-forward.

	    Each sub-layer sees an RMS-normalised copy of its input and its output
	    is added back onto the un-normalised input.
	    """

	    def __init__(self, layer_id: int, args: ModelArgs):
	        super().__init__()
	        self.n_heads = args.n_heads
	        self.dim = args.dim
	        self.head_dim = args.dim // args.n_heads
	        self.attention = Attention(args)
	        self.feed_forward = FeedForward(
	            dim=args.dim,
	            hidden_dim=4 * args.dim,
	            multiple_of=args.multiple_of,
	        )
	        self.layer_id = layer_id
	        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
	        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

	    def _forward_ffn(self, h):
	        """Residual feed-forward sub-layer."""
	        normed = self.ffn_norm(h)
	        return h + self.feed_forward(normed)

	    def _forward_attention(self, x, start_pos, freqs_cis, mask, prompt):
	        """Residual attention sub-layer."""
	        normed = self.attention_norm(x)
	        attended = self.attention.forward(
	            normed, start_pos, freqs_cis, mask, prompt)
	        return x + attended

	    def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor], prompt=None):
	        """Run the attention sub-layer followed by the feed-forward one.

	        All arguments other than ``x`` are forwarded unchanged to
	        ``Attention.forward``.
	        """
	        after_attention = self._forward_attention(
	            x, start_pos, freqs_cis, mask, prompt)
	        return self._forward_ffn(after_attention)


	class Mlp(nn.Module):
	    """Two-layer perceptron (Linear -> activation -> Linear), in the style
	    used by Vision Transformer, MLP-Mixer and related networks.
	    """

	    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU):
	        """
	        Args:
	            in_features: Width of the input.
	            hidden_features: Width of the hidden layer; falls back to
	                ``in_features`` when falsy.
	            out_features: Width of the output; falls back to
	                ``in_features`` when falsy.
	            act_layer: Zero-argument callable producing the activation
	                module.
	        """
	        super().__init__()
	        out_width = out_features if out_features else in_features
	        hidden_width = hidden_features if hidden_features else in_features

	        self.fc1 = nn.Linear(in_features, hidden_width)
	        self.act = act_layer()
	        self.fc2 = nn.Linear(hidden_width, out_width)

	    def forward(self, x):
	        """Apply fc1, the activation, then fc2."""
	        return self.fc2(self.act(self.fc1(x)))


	class Transformer(nn.Module):
	    """Language-model transformer with a CLIP-based multimodal front end.

	    Text tokens go through ``tok_embeddings`` and the stack in ``layers``.
	    A non-text input is patch-embedded by a modality-specific tokenizer,
	    passed through the frozen CLIP visual transformer, compressed to a
	    fixed number of resample tokens by a soft mixture of resampler experts,
	    projected to the language-model width, and spliced into the token
	    sequence right after the first token, wrapped in a learned start/end
	    tag pair.
	    """

	    def __init__(self, params: ModelArgs):
	        super().__init__()
	        self.params = params
	        self.vocab_size = params.vocab_size
	        self.n_layers = params.n_layers
	        self.tok_embeddings = ParallelEmbedding(
	            params.vocab_size, params.dim, init_method=nn.init.normal_,
	        )

	        self.layers = torch.nn.ModuleList()
	        for layer_id in range(params.n_layers):
	            self.layers.append(TransformerBlock(layer_id, params))

	        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
	        self.output = ColumnParallelLinear(
	            params.dim, params.vocab_size, bias=False, init_method=default_linear_init,
	        )

	        # Rotary table covers twice max_seq_len positions. It is a plain
	        # attribute (not a registered buffer), so forward() and
	        # forward_inference() move it to the activation device themselves.
	        self.freqs_cis = precompute_freqs_cis(
	            self.params.dim // self.params.n_heads, self.params.max_seq_len * 2
	        )

	        # load clip: every CLIP parameter is frozen and stored in fp16
	        self.clip, _, _ = open_clip.create_model_and_transforms(
	            'ViT-L-14', pretrained='openai')
	        for param in self.clip.parameters():
	            param.requires_grad = False
	            param.data = param.data.half()
	        # Only self.clip.visual is used below.
	        # NOTE(review): presumably this drops CLIP's text transformer - confirm.
	        self.clip.transformer = None

	        self.image_words = 30
	        self.cache_image_words = 0  # for inference

	        clip_width = self.clip.visual.conv1.out_channels
	        # create modal shared modules
	        self.resample_layers = nn.ModuleDict()
	        self.num_experts = 3
	        self.num_resample_layers = 8
	        # One resampler configuration shared by every expert: same settings
	        # as the language model except for head count and width.
	        resampler_params = copy.deepcopy(params)
	        resampler_params.n_heads = 16
	        resampler_params.dim = clip_width
	        for expert in range(self.num_experts):
	            expert = str(expert)
	            self.resample_layers[expert] = nn.ModuleList()
	            for layer_id in range(self.num_resample_layers):
	                self.resample_layers[expert].append(
	                    TransformerBlock(layer_id, resampler_params))

	        self.conv1 = nn.ModuleDict()
	        self.positional_embedding = nn.ParameterDict()
	        self.resample_tokens = nn.ParameterDict()
	        self.clip_proj1 = nn.ModuleDict()
	        self.clip_proj2 = nn.ModuleDict()
	        self.routers = nn.ModuleDict()
	        self.start_tag = nn.ParameterDict()
	        self.end_tag = nn.ParameterDict()
	        self.modals = ['image', 'audio', 'point', 'video', 'rgbd', 'rgbn', 'fmri', 'imu']

	        def _add_positional_embedding(name, n_tokens):
	            # Learned positional table for a modality that does not reuse
	            # CLIP's own; n_tokens includes the prepended class token.
	            self.positional_embedding[name] = nn.Parameter(
	                torch.empty([n_tokens, clip_width]))
	            nn.init.normal_(self.positional_embedding[name], std=0.02)

	        for modal in self.modals:
	            if modal in ['image', 'video', 'rgbd', 'rgbn']:
	                # These modalities are tokenized by CLIP's own conv1 and use
	                # CLIP's positional embedding (see encode_image /
	                # clip_encode_image), so nothing is created for them here.
	                pass
	            elif modal == 'audio':
	                self.conv1[modal] = nn.Conv2d(
	                    1, clip_width, kernel_size=(16, 16), stride=(10, 10))
	                _add_positional_embedding(modal, 1212 + 1)
	            elif modal == 'point':
	                from model.lib.point_utils import PointPatchEmbed
	                self.conv1[modal] = PointPatchEmbed(
	                    in_channels=6, channels=clip_width)
	                _add_positional_embedding(modal, 1024 + 1)
	            elif modal == 'fmri':
	                self.conv1[modal] = nn.Linear(15724, 8192)
	                _add_positional_embedding(modal, 8 + 1)
	            elif modal == 'imu':
	                self.conv1[modal] = nn.Conv1d(
	                    in_channels=6, out_channels=clip_width, kernel_size=10, bias=False)
	                _add_positional_embedding(modal, 391 + 1)

	            # Per-modality router producing one score per expert per token.
	            self.routers[modal] = Mlp(
	                clip_width, clip_width * 4, self.num_experts)

	            # NOTE(review): 30 presumably matches self.image_words - confirm.
	            self.resample_tokens[modal] = nn.Parameter(
	                torch.empty([1, 30, resampler_params.dim]))
	            nn.init.normal_(self.resample_tokens[modal], std=0.02)

	            self.clip_proj1[modal] = nn.Sequential(
	                nn.Linear(clip_width, resampler_params.dim),
	                nn.LayerNorm(resampler_params.dim))

	            self.clip_proj2[modal] = nn.Sequential(
	                nn.Linear(resampler_params.dim, params.dim),
	                nn.LayerNorm(params.dim))

	            self.start_tag[modal] = nn.Parameter(torch.rand(1, 1, params.dim))
	            self.end_tag[modal] = nn.Parameter(torch.rand(1, 1, params.dim))

	    def clip_encode_image(self, x, modal='image'):
	        """Encode patch embeddings with the frozen CLIP visual transformer.

	        Deliberately not wrapped in ``torch.no_grad()``: the CLIP weights
	        are frozen, but activations still carry gradients back to whatever
	        produced ``x``.

	        Args:
	            x: Patch embeddings laid out as ``[N, width, ...]``; trailing
	                axes are flattened into one token axis.
	            modal: Selects the positional embedding. ``audio``, ``point``,
	                ``fmri`` and ``imu`` use this model's learned table; every
	                other modality uses CLIP's pretrained one.

	        Returns:
	            ``[N, tokens + 1, width]`` features after CLIP's ``ln_post``.
	            All tokens are kept and CLIP's output projection is not applied.
	        """
	        # [N, width, ...] -> [N, width, tokens] -> [N, tokens, width]
	        x = x.reshape(x.shape[0], x.shape[1], -1)
	        x = x.permute(0, 2, 1)

	        # Prepend CLIP's class embedding to every sample; adding it to a
	        # zero tensor broadcasts it to [N, 1, width].
	        cls_tokens = self.clip.visual.class_embedding.to(x.dtype) + torch.zeros(
	            x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device)
	        x = torch.cat([cls_tokens, x], dim=1)  # [N, tokens + 1, width]

	        pos_embedding = self.clip.visual.positional_embedding
	        if modal in ['audio', 'point', 'fmri', 'imu']:
	            pos_embedding = self.positional_embedding[modal]

	        x = x + pos_embedding.to(x.dtype)
	        x = self.clip.visual.ln_pre(x)

	        x = x.permute(1, 0, 2)  # NLD -> LND
	        x = self.clip.visual.transformer(x)
	        x = x.permute(1, 0, 2)  # LND -> NLD

	        # preserve all spatial tokens
	        return self.clip.visual.ln_post(x)

	    def encode_image(self, x, modal='image'):
	        """Turn a raw modality input into language-model-width tokens.

	        Args:
	            x: Batch of inputs for ``modal``. For ``video``/``rgbd``/
	                ``rgbn`` the first two axes are (batch, frames); frames are
	                encoded independently and averaged.
	            modal: One of ``self.modals``.

	        Returns:
	            ``[B, R, params.dim]`` where ``R`` is the number of resample
	            tokens of this modality.
	        """
	        bsz = x.size(0)
	        T = 1
	        if modal in ['image']:
	            # modified from CLIP
	            x = self.clip.visual.conv1(x)  # shape = [*, width, grid, grid]
	        elif modal in ['audio', 'imu']:
	            x = self.conv1[modal](x)
	        elif modal == 'point':
	            # The point tokenizer runs in float32; cast back afterwards.
	            x = self.conv1[modal](x.float()).to(x.dtype)
	        elif modal in ['video', 'rgbd', 'rgbn']:
	            # Fold the frame axis into the batch axis for CLIP's conv1.
	            B, T = x.shape[:2]
	            bsz = B * T
	            x = x.reshape(bsz, *x.shape[2:])
	            x = self.clip.visual.conv1(x)
	        elif modal == 'fmri':
	            x = self.conv1[modal](x)
	            # Split the linear output into clip_width channels x tokens.
	            x = x.reshape(x.size(0), self.clip.visual.conv1.out_channels, -1)

	        image_feats = self.clip_encode_image(x, modal=modal)
	        # take mean on time dimension
	        # all inputs are reduced to [B, L, D]
	        bsz = bsz // T
	        image_feats = image_feats.reshape(
	            bsz, T, *image_feats.shape[1:]).mean(dim=1)

	        image_feats = self.clip_proj1[modal](image_feats)
	        n_resample = self.resample_tokens[modal].size(1)
	        # Learned resample tokens go first so they can be sliced out later.
	        image_feats = torch.cat(
	            [self.resample_tokens[modal].repeat(bsz, 1, 1), image_feats], dim=1)

	        # routing modalities
	        # [B, L, D] -> [B, L, N]; sigmoid scores normalised so the expert
	        # weights of each token sum to 1.
	        routing_weights = self.routers[modal](image_feats).sigmoid()
	        routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)

	        image_feats_experts = []
	        for expert_id in range(self.num_experts):
	            image_feats_expert = image_feats
	            for layer in self.resample_layers[str(expert_id)]:
	                # No rotary embedding and no causal mask inside resamplers.
	                image_feats_expert = layer(image_feats_expert, 0, None, None)

	            # Keep only the resample-token positions and weight them.
	            image_feats_expert = image_feats_expert[:, :n_resample]
	            routing_weight = routing_weights[:, :n_resample, expert_id]
	            # [B, L, D] * [B, L, 1]
	            image_feats_expert = image_feats_expert * routing_weight[:, :, None]

	            image_feats_experts.append(image_feats_expert)

	        image_feats = sum(image_feats_experts)
	        image_feats = self.clip_proj2[modal](image_feats)

	        return image_feats

	    def forward(self, examples, image=None, modal='image'):
	        """Training forward pass.

	        Args:
	            examples: Token ids, 2-D (batch, sequence).
	            image: Optional modality input; when given, its encoded tokens
	                are inserted after the first token between the modality's
	                start and end tags.
	            modal: Modality name, or a list/tuple whose first element is
	                the modality name for the whole batch.

	        Returns:
	            Logits for positions ``prefix_len`` onward, where ``prefix_len``
	            is 0 without ``image`` and the number of modality tokens plus 2
	            with it.
	        """
	        self._destroy_kv_cache()  # training always disables kv cache
	        # Accept both a plain string and a per-sample list, exactly like
	        # forward_inference; indexing a bare string would yield one letter.
	        modal = modal[0] if isinstance(modal, (list, tuple)) else modal
	        _bsz, seqlen = examples.shape
	        h = self.tok_embeddings(examples)
	        self.freqs_cis = self.freqs_cis.to(h.device)

	        start_pos = 0
	        prefix_len = 0
	        if image is not None:
	            h_bos, h_caption = h[:, :1], h[:, 1:]
	            image_tokens = self.encode_image(image, modal)
	            h = torch.cat((h_bos, self.start_tag[modal].expand(
	                _bsz, -1, -1), image_tokens, self.end_tag[modal].expand(_bsz, -1, -1), h_caption), dim=1)
	            # bos + image token + start_tag[modal], end_tag[modal] is used for caption generation
	            prefix_len = image_tokens.shape[1] + 1 + 1
	            seqlen = h.shape[1]

	        freqs_cis = self.freqs_cis[start_pos:start_pos + seqlen]
	        # The attention layers only check that a mask is present (it enables
	        # causal attention); the values themselves are not consumed there.
	        mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=h.device)
	        mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h)
	        for layer in self.layers:
	            h = layer(h, start_pos, freqs_cis, mask)
	        h = self.norm(h)
	        output = self.output(h[:, prefix_len:, :])
	        return output

	    @torch.inference_mode()
	    def forward_inference(self, tokens: torch.Tensor, start_pos: int, image=None, modal='image'):
	        """Incremental decoding step using the KV cache.

	        Args:
	            tokens: Token ids for this step, 2-D (batch, sequence).
	            start_pos: Position of ``tokens[:, 0]`` counted in text tokens
	                only; the modality-token offset is added internally.
	            image: Optional modality input, expected together with
	                ``start_pos == 0``.
	            modal: Modality name, or a list whose first element is used.

	        Returns:
	            float32 logits for the last position only.
	        """
	        modal = modal[0] if isinstance(modal, list) else modal
	        _bsz, seqlen = tokens.shape
	        if start_pos == 0:
	            # kv cache will not re-allocate if size is unchanged
	            self._allocate_kv_cache(_bsz)
	        h = self.tok_embeddings(tokens)
	        self.freqs_cis = self.freqs_cis.to(h.device)

	        if image is not None:
	            h_bos, h_caption = h[:, :1], h[:, 1:]
	            image_tokens = self.encode_image(image, modal)
	            # Everything spliced in besides the text tokens: the modality
	            # tokens plus the start and end tags. Later steps shift their
	            # start_pos by this amount, so leaving out the two tags would
	            # overwrite the last two cached prompt positions.
	            self.cache_image_words = image_tokens.shape[1] + 1 + 1
	            h = torch.cat((h_bos, self.start_tag[modal].repeat(_bsz, 1, 1), image_tokens, self.end_tag[modal].repeat(_bsz, 1, 1), h_caption), dim=1)
	            seqlen = h.shape[1]
	            freqs_cis = self.freqs_cis[0: seqlen]
	        else:
	            if start_pos == 0:
	                self.cache_image_words = 0
	                freqs_cis = self.freqs_cis[0: seqlen]
	            else:
	                # if image was not None when start_pos=0,
	                # the offset should be added to start_pos within later forward_inference calls
	                start_pos = start_pos + self.cache_image_words
	                freqs_cis = self.freqs_cis[start_pos: start_pos + seqlen]

	        # A mask is only needed (as a "use causal attention" flag) when
	        # more than one position is processed at once.
	        mask = None
	        if seqlen > 1:
	            mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=tokens.device)
	            mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h)

	        for layer in self.layers:
	            h = layer(h, start_pos, freqs_cis, mask)
	        h = self.norm(h)
	        output = self.output(h[:, -1, :])  # only compute last logits
	        return output.float()

	    def _allocate_kv_cache(self, max_batch_size: int) -> None:
	        """Allocate a max_seq_len-long KV cache in every layer."""
	        for layer in self.layers:
	            layer.attention.allocate_kv_cache(
	                max_batch_size, self.params.max_seq_len)

	    def _destroy_kv_cache(self) -> None:
	        """Release the KV cache of every layer."""
	        for layer in self.layers:
	            layer.attention.destroy_kv_cache()