|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Configuration class for Cosmos-Embed1.""" |
|
|
|
from typing import Any, Literal, Tuple, Union |
|
|
|
from transformers import AutoConfig, PretrainedConfig |
|
|
|
|
|
class CosmosEmbed1Config(PretrainedConfig):
    """Hyperparameter container for the Cosmos-Embed1 model.

    Every constructor argument is stored verbatim as an attribute of the same
    name; no validation or normalisation is performed here. Any additional
    keyword arguments are forwarded to `PretrainedConfig`.
    """

    model_type = "cosmos-embed1"

    def __init__(
        self,
        embed_dim: int = 256,
        num_query_tokens: int = 32,
        max_txt_len: int = 128,
        num_video_frames: int = 8,
        temporal_encoding_type: Literal[
            "neighboring_token_propagation", "temporal_parameter"
        ] = "neighboring_token_propagation",
        resolution: Union[int, Tuple[int, int]] = 224,
        vocab_size: int = 30523,
        transformer_engine: bool = False,
        use_fp8: bool = False,
        **kwargs: Any,
    ) -> None:
        """Build a Cosmos-Embed1 configuration.

        Args:
            embed_dim: Dimension of the extracted text-visual embeddings.
            num_query_tokens: Number of learnable query tokens.
            max_txt_len: Maximum length of a text token sequence before
                truncation.
            num_video_frames: Number of input video frames.
            temporal_encoding_type: Which temporal encoding module to use,
                either "neighboring_token_propagation" or
                "temporal_parameter".
            resolution: Input video frame resolution. An integer means square
                frames (height == width); a tuple is (height, width) for
                non-square frames.
            vocab_size: Vocabulary size of the text tokenizer. The default is
                that of `bert-base-uncased` plus one extra [DEC] token.
            transformer_engine: Whether to use TransformerEngine for
                acceleration.
            use_fp8: Whether to use FP8 precision. Requires
                `transformer_engine=True`; that requirement is not checked
                by this constructor.
            **kwargs: Forwarded unchanged to `PretrainedConfig.__init__`.
        """
        # Base-class setup runs first, before the model-specific fields
        # below are assigned.
        super().__init__(**kwargs)

        # Joint embedding space and learnable queries.
        self.embed_dim = embed_dim
        self.num_query_tokens = num_query_tokens

        # Text input.
        self.max_txt_len = max_txt_len

        # Video input.
        self.num_video_frames = num_video_frames
        self.temporal_encoding_type = temporal_encoding_type
        self.resolution = resolution

        # Text tokenizer.
        self.vocab_size = vocab_size

        # Acceleration / precision switches.
        self.transformer_engine = transformer_engine
        self.use_fp8 = use_fp8
|
|
|
|
|
# Import-time side effect: register the config class with `AutoConfig` under
# the "cosmos-embed1" key, the same string as `CosmosEmbed1Config.model_type`.
AutoConfig.register("cosmos-embed1", CosmosEmbed1Config)
|
|